In [1]:
#Import libraires
import spacy
import gensim
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore")
import pyLDAvis
import pyLDAvis.gensim
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split as tts, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [2]:
#Load and view dataset
df = pd.read_csv('clean_train.csv')
df.head()

Unnamed: 0,sentiment,tokenized_text
0,1,defining language of touch with different dial...
1,1,learning ab doodle all doodle should be light ...
2,2,one of the most in-your-face ex of stealing th...
3,0,this would b pretty awesome if it did n't cras...
4,1,outside the waiting for the


In [3]:
#Drop null values
df = df.dropna()

In [4]:
#Split into X and Y
X = df['tokenized_text']
y = df['sentiment']

In [5]:
#Initialize a tfidf vectorizer
tfidf = TfidfVectorizer()
vector = tfidf.fit_transform(X)
X = vector.toarray()

In [6]:
#ratio to split into training and test set
ratio = int(len(df)*0.7)

In [7]:
#Split into training and test set
X_train = X[:ratio,:]
X_test = X[ratio:,:]
y_train = df['sentiment'].iloc[:ratio]
y_test = df['sentiment'].iloc[ratio:]

In [8]:
#Initialize MultinomialNB model
nb = MultinomialNB()

In [9]:
#Fit model on training data
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
#Make prediction using X_test
y_pred_nb = nb.predict(X_test)

In [11]:
#View accuracy metrics
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print('The accuracy score using MultinomialNB model is {}.'.format(round(accuracy_nb,3)))
print(classification_report(y_test, y_pred_nb))

The accuracy score using MultinomialNB model is 0.645.
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       128
           1       0.64      0.96      0.77      1317
           2       0.69      0.19      0.30       699
           3       0.00      0.00      0.00        34

    accuracy                           0.65      2178
   macro avg       0.33      0.29      0.27      2178
weighted avg       0.61      0.65      0.56      2178



In [12]:
#Initialize LinearSVC
svc = LinearSVC(class_weight = 'balanced', random_state = 42)

In [13]:
#Fit model on training data
svc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
          verbose=0)

In [14]:
#Make prediction using X_test
y_pred_svc = svc.predict(X_test)

In [15]:
#View accuracy metrics
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print('The accuracy score using LinearSVC model is {}.'.format(round(accuracy_svc,3)))
print(classification_report(y_test, y_pred_svc))

The accuracy score using LinearSVC model is 0.648.
              precision    recall  f1-score   support

           0       0.36      0.39      0.37       128
           1       0.73      0.76      0.74      1317
           2       0.56      0.51      0.54       699
           3       0.07      0.06      0.06        34

    accuracy                           0.65      2178
   macro avg       0.43      0.43      0.43      2178
weighted avg       0.64      0.65      0.65      2178



In [26]:
#Load test set
test = pd.read_csv('clean_test.csv')
test.head()

Unnamed: 0,tweet_id,tokenized_text
0,7506,audience q what prototyping tool do you use sk...
1,7992,at send your best amp to ...
2,247,and here 's a pic of you winning your cc cont
3,7688,marissa mayer phone a a cursor of physical loc...
4,3294,is even cooler than i thought


In [27]:
#Drop null values from test
test = test.dropna()

In [28]:
#Vectorize test
vector_test = tfidf.transform(test['tokenized_text'])
X_test = vector.toarray()

In [29]:
#Make prediction using X_test
y_test_svc = svc.predict(X_test)

In [30]:
#Combine ID and validation Pred
ID = pd.DataFrame(test['tweet_id'], columns =['tweet_id'])
sentiment = pd.DataFrame(y_test_svc, columns = ['sentiment'])
submission = ID.join(sentiment)
print(submission)

      tweet_id  sentiment
0         7506          1
1         7992          1
2          247          2
3         7688          0
4         3294          1
5         6125          1
6         6131          1
7         4134          1
8         8206          1
9         8552          2
10        1634          3
11        4256          2
12        4921          2
13        1694          2
14        4193          1
15        7682          1
16        8151          1
17        1274          2
18        5868          1
19        3327          1
20          12          1
21        5839          1
22        5562          2
23        2959          2
24        8940          0
25        1995          1
26         607          1
27        7364          1
28        8167          3
29         681          2
...        ...        ...
1789      3798          2
1790      2587          2
1791      6709          2
1792      1731          2
1793      8690          2
1794       788          2
1795      38

In [31]:
#Save submission
submission.to_csv('submission baseline.csv', index = False)