In [1]:
import numpy as np
import pandas as pd
import scipy
import re
import matplotlib.pyplot as plt
import string
import nltk
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from nltk.corpus import stopwords

#Step 1 Reading the training dataset
train = pd.read_csv("Twitter-sentiment-self-drive-DFE.csv")


train.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,our_id,sentiment_gold,sentiment_gold_reason,text
0,724227031,True,golden,236,,5,0.7579,10001,5\n4,Author is excited about the development of the...,Two places I'd invest all my money if I could:...
1,724227032,True,golden,231,,5,0.8775,10002,5\n4,Author is excited that driverless cars will be...,Awesome! Google driverless cars will help the ...
2,724227033,True,golden,233,,2,0.6805,10003,2\n1,The author is skeptical of the safety and reli...,If Google maps can't keep up with road constru...
3,724227034,True,golden,240,,2,0.882,10004,2\n1,The author is skeptical of the project's value.,Autonomous cars seem way overhyped given the t...
4,724227035,True,golden,240,,3,1.0,10005,3,Author is making an observation without expres...,Just saw Google self-driving car on I-34. It w...


In [2]:
import re
#Step 2= Removing Twitter Handles

def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt 

# creating new column tidy_tweet & remove twitter handles (@user)
train['clean_text'] = np.vectorize(remove_pattern)(train['text'], "@[\w]*")

train['clean_text'] = train['clean_text'].str.replace('http\S+|www.\S+', '', case=False)

#Step 3= Removing Punctuations, Numbers, and Special Character 
train['clean_text'] = train['clean_text'].str.replace('[^A-Za-z0-9]+', ' ')
                          
# Step 4 remove stopwords
train['clean_text'] = train['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

#Step 5 to lowercase

train['clean_text']= train['clean_text'].apply(lambda x: ''.join([w for w in x.lower()]))

train['clean_text'].head(20)

0     places invest money could printing self drivin...
1     awesome google driverless cars will help blind...
2     google maps keep with road construction suppos...
3     autonomous cars seem overhyped given technolog...
4           just google self driving painted green blue
5     will driverless cars eventually replace taxi d...
6          chicago metro expected fully autonomous 2020
7     love infotainment system this thing almost dri...
8     autonomous vehicles could reduce traffic fatal...
9     driverless cars worth risk want highway when s...
10    driverless cars legal florida california michigan
11    audi first carmaker license from nevada test a...
12    audi says first manufacturer world license fro...
13    future buying these audi ready test autonomous...
14    audi test driving their driverless tampa today...
15    audi first automaker california test self driv...
16    audi gets first permit test self driving cars ...
17    audi gets permit test self driving cars ca

In [3]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(train['clean_text'], train['sentiment'],test_size = 0.2)

## for transforming the 80% of the train data ##

from sklearn.feature_extraction.text import CountVectorizer as CountVectorizer
count_vect = CountVectorizer(stop_words='english')
x_train_counts = count_vect.fit_transform(x_train)

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)
x_train_tfidf = transformer.fit_transform(x_train_counts)
x_train_tfidf.shape

## for transforming the 20% of the train data which is being used for validation ##
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)


from sklearn.ensemble.forest import RandomForestClassifier
model = RandomForestClassifier(n_estimators=980) 
model= model.fit(x_train_tfidf,y_train)

prediction = model.predict(x_test_tfidf)
#print(prediction)

In [4]:
# testing model's accuracy with accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,prediction))

0.6243016759776536


In [5]:
#model.score(x_train_tfidf,y_train)

In [6]:
from sklearn.metrics import cohen_kappa_score
cohen_score = cohen_kappa_score(y_test, prediction)
print(cohen_score)

0.2052355977733853


In [7]:
from sklearn.metrics import confusion_matrix
print('confusion matrix: \n',confusion_matrix(y_test, prediction))

confusion matrix: 
 [[  0   3  17   3   0   0]
 [  0  11 148   8   0   0]
 [  0   3 802  29   5   0]
 [  0   0 193  72  12   0]
 [  0   0  45  46   7   0]
 [  0   0  25   1   0   2]]


In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        23
           2       0.65      0.07      0.12       167
           3       0.65      0.96      0.78       839
           4       0.45      0.26      0.33       277
           5       0.29      0.07      0.11        98
not_relevant       1.00      0.07      0.13        28

   micro avg       0.62      0.62      0.62      1432
   macro avg       0.51      0.24      0.25      1432
weighted avg       0.58      0.62      0.54      1432



  'precision', 'predicted', average, warn_for)


In [9]:
from sklearn.naive_bayes import MultinomialNB

# creates a ML model based on parameters
model2 = MultinomialNB()
model2 = model2.fit(x_train_tfidf,y_train)
prediction2 = model2.predict(x_test_tfidf)
#print(prediction2)

In [10]:
# testing model's accuracy with accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,prediction2))

0.5879888268156425


In [11]:
from sklearn.metrics import cohen_kappa_score
cohen_score = cohen_kappa_score(y_test, prediction2)
print(cohen_score)

0.018056382029157936


In [12]:
#model2.score(x_train_tfidf,y_train)

In [13]:
from sklearn.metrics import confusion_matrix
print('confusion matrix: \n',confusion_matrix(y_test, prediction2))

confusion matrix: 
 [[  0   0  23   0   0   0]
 [  0   0 167   0   0   0]
 [  0   0 834   5   0   0]
 [  0   0 269   8   0   0]
 [  0   0  91   7   0   0]
 [  0   0  28   0   0   0]]


In [14]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction2))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        23
           2       0.00      0.00      0.00       167
           3       0.59      0.99      0.74       839
           4       0.40      0.03      0.05       277
           5       0.00      0.00      0.00        98
not_relevant       0.00      0.00      0.00        28

   micro avg       0.59      0.59      0.59      1432
   macro avg       0.17      0.17      0.13      1432
weighted avg       0.42      0.59      0.44      1432



  'precision', 'predicted', average, warn_for)


In [15]:
#logistic regression

from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
log_model = log_model.fit(x_train_tfidf,y_train)
prediction3 = log_model.predict(x_test_tfidf)

#print(prediction3)



In [16]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, prediction3))

0.6138268156424581


In [17]:
from sklearn.metrics import cohen_kappa_score
cohen_score = cohen_kappa_score(y_test, prediction3)
print(cohen_score)

0.14792446089998312


In [18]:
#log_model.score(x_train_tfidf,y_train)

In [19]:
from sklearn.metrics import confusion_matrix
print('confusion matrix: \n',confusion_matrix(y_test, prediction3))

confusion matrix: 
 [[  0   0  20   3   0   0]
 [  0   1 154  12   0   0]
 [  0   0 819  19   1   0]
 [  0   0 216  55   6   0]
 [  0   0  57  37   4   0]
 [  0   0  27   1   0   0]]


In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction3))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        23
           2       1.00      0.01      0.01       167
           3       0.63      0.98      0.77       839
           4       0.43      0.20      0.27       277
           5       0.36      0.04      0.07        98
not_relevant       0.00      0.00      0.00        28

   micro avg       0.61      0.61      0.61      1432
   macro avg       0.41      0.20      0.19      1432
weighted avg       0.60      0.61      0.51      1432



In [21]:
from sklearn import tree
# creates a ML model based on parameters
modelDTg = tree.DecisionTreeClassifier(criterion='gini') 
modelDTg = modelDTg.fit(x_train_tfidf,y_train)
predictionDTg = modelDTg.predict(x_test_tfidf)
#print(predictionDTg)

In [22]:
print('accuracy score: \n',accuracy_score(y_test, predictionDTg))
print('confusion matrix: \n',confusion_matrix(y_test, predictionDTg))
print('classification report: \n',classification_report(y_test, predictionDTg))

accuracy score: 
 0.5495810055865922
confusion matrix: 
 [[  0   3  15   4   1   0]
 [  2  19 111  31   2   2]
 [  2  42 667 103  15  10]
 [  1  21 148  81  24   2]
 [  0   9  32  41  15   1]
 [  0   1  20   2   0   5]]
classification report: 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00        23
           2       0.20      0.11      0.15       167
           3       0.67      0.79      0.73       839
           4       0.31      0.29      0.30       277
           5       0.26      0.15      0.19        98
not_relevant       0.25      0.18      0.21        28

   micro avg       0.55      0.55      0.55      1432
   macro avg       0.28      0.26      0.26      1432
weighted avg       0.50      0.55      0.52      1432



In [23]:
from sklearn.metrics import cohen_kappa_score
cohen_score = cohen_kappa_score(y_test, predictionDTg)
print(cohen_score)

0.17737578564958745


In [24]:
#modelDTg.score(x_train_tfidf,y_train)

In [25]:
from sklearn import tree
# creates a ML model based on parameters
modelDTe = tree.DecisionTreeClassifier(criterion='entropy') 
modelDTe = modelDTe.fit(x_train_tfidf,y_train)
predictionDTe = modelDTe.predict(x_test_tfidf)
#print(predictionDTe)

In [26]:
print('accuracy score: \n',accuracy_score(y_test, predictionDTe))
print('confusion matrix: \n',confusion_matrix(y_test, predictionDTe))
print('classification report: \n',classification_report(y_test, predictionDTe))

accuracy score: 
 0.5314245810055865
confusion matrix: 
 [[  1   7  12   3   0   0]
 [  2  14 115  32   4   0]
 [  4  47 659  93  22  14]
 [  2  17 156  66  30   6]
 [  0   7  39  32  18   2]
 [  0   0  20   5   0   3]]
classification report: 
               precision    recall  f1-score   support

           1       0.11      0.04      0.06        23
           2       0.15      0.08      0.11       167
           3       0.66      0.79      0.72       839
           4       0.29      0.24      0.26       277
           5       0.24      0.18      0.21        98
not_relevant       0.12      0.11      0.11        28

   micro avg       0.53      0.53      0.53      1432
   macro avg       0.26      0.24      0.24      1432
weighted avg       0.48      0.53      0.50      1432



In [27]:
# K-NN Classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=6)
knn= knn.fit(x_train_tfidf,y_train)
predictionKNN = knn.predict(x_test_tfidf)
print(predictionDTe)

print('accuracy score: \n',accuracy_score(y_test, predictionKNN))
print('confusion matrix: \n',confusion_matrix(y_test, predictionKNN))
print('classification report: \n',classification_report(y_test, predictionKNN))

['5' '5' '4' ... '3' '3' 'not_relevant']
accuracy score: 
 0.598463687150838
confusion matrix: 
 [[  1   3  16   3   0   0]
 [  1   9 147   9   1   0]
 [  1   7 798  29   3   1]
 [  0   2 227  38  10   0]
 [  0   0  60  27  11   0]
 [  0   2  26   0   0   0]]
classification report: 
               precision    recall  f1-score   support

           1       0.33      0.04      0.08        23
           2       0.39      0.05      0.09       167
           3       0.63      0.95      0.76       839
           4       0.36      0.14      0.20       277
           5       0.44      0.11      0.18        98
not_relevant       0.00      0.00      0.00        28

   micro avg       0.60      0.60      0.60      1432
   macro avg       0.36      0.22      0.22      1432
weighted avg       0.52      0.60      0.51      1432



In [28]:
from sklearn.metrics import cohen_kappa_score
cohen_score = cohen_kappa_score(y_test, predictionKNN)
print(cohen_score)

0.1295872674917653


In [29]:
#knn.score(x_train_tfidf,y_train)

In [30]:
#SVM
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=100., probability=True, class_weight='balanced', kernel='linear'))
clf_output = clf.fit(x_train_tfidf,y_train)
predictionSVM = clf_output.predict(x_test_tfidf)
clf_output.score(x_train_tfidf,y_train)


0.959643605870021

In [31]:
#print(predictionSVM)

In [32]:
print('accuracy score: \n',accuracy_score(y_test, predictionSVM))
print('confusion matrix: \n',confusion_matrix(y_test, predictionSVM))
print('classification report: \n',classification_report(y_test, predictionSVM))

accuracy score: 
 0.5160614525139665
confusion matrix: 
 [[  1   7   5   9   0   1]
 [  7  29  75  36  12   8]
 [ 10  40 584 139  33  33]
 [  4  18 116  96  36   7]
 [  0   2  31  38  25   2]
 [  2   3  15   4   0   4]]
classification report: 
               precision    recall  f1-score   support

           1       0.04      0.04      0.04        23
           2       0.29      0.17      0.22       167
           3       0.71      0.70      0.70       839
           4       0.30      0.35      0.32       277
           5       0.24      0.26      0.25        98
not_relevant       0.07      0.14      0.10        28

   micro avg       0.52      0.52      0.52      1432
   macro avg       0.27      0.28      0.27      1432
weighted avg       0.52      0.52      0.52      1432



In [33]:
from sklearn.metrics import cohen_kappa_score
cohen_score = cohen_kappa_score(y_test, predictionSVM)
print(cohen_score)

0.19931128131850573


In [34]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
iris = datasets.load_iris()
iris.data.shape, iris.target.shape
((150, 4), (150,))

((150, 4), (150,))

In [35]:
 X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)

X_train.shape, y_train.shape
((90, 4), (90,))
X_test.shape, y_test.shape
((60, 4), (60,))

clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test) 

0.9666666666666667

In [36]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=20)
scores   

array([1.        , 1.        , 0.88888889, 1.        , 1.        ,
       1.        , 0.88888889, 0.88888889, 1.        , 1.        ,
       1.        , 0.83333333, 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])