In [21]:
import sklearn
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn import metrics
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
import numpy as np
import itertools
from sklearn.preprocessing import Normalizer
#import xgboost
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [22]:
# Load the labelled tweets from disk; the trailing expression echoes the
# column names so the schema is visible in the cell output.
data = pd.read_csv(filepath_or_buffer="../Data/train_data.csv")
data.columns

Index(['Unnamed: 0', 'target', 'tweets'], dtype='object')

In [23]:
# Count how many rows have a missing tweet text (True) versus a present one (False).
data["tweets"].isna().value_counts()

False    1579422
True       20577
Name: tweets, dtype: int64

In [24]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis="index")

In [25]:
# Show (rows, columns) of the frame after the rows with missing values were dropped.
data.shape

(1579422, 3)

In [26]:
# Separate the label column from the features, and discard the
# "Unnamed: 0" column so only the tweet text remains in `data`.
target = data["target"]
data = data.drop(columns=["target", "Unnamed: 0"])
print("{}\n{}".format(data.shape, target.shape))

(1579422, 1)
(1579422,)


In [27]:
# Convert the raw tweet text into a TF-IDF feature matrix and persist the
# fitted vectorizer so the same vocabulary can be reused later.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary and IDF weights also see the held-out
# tweets. Consider fitting on the training portion only.
vectorizer = TfidfVectorizer()
tweets = vectorizer.fit_transform(data["tweets"])
#print(vectorizer.get_feature_names())
print(tweets.shape)
# The context manager guarantees the file is closed even if pickling fails.
with open("../Pickle/Vectorizer.pickle", "wb") as vect:
    pickle.dump(vectorizer, vect)

(1579422, 218659)


In [28]:
# Hold out 33.3% of the rows for evaluation; the fixed seed makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    tweets,
    target,
    test_size=0.333,
    random_state=42,
)

In [30]:
# k-nearest-neighbours baseline with k=5.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt; scoring
# every training row with KNN is expensive, so expect a long runtime.
Test_accuracy = []
Train_accuracy = []
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(x_train, y_train)
y_hat = neigh.predict(x_test)
# Store each score in the list named after the split it was measured on.
Train_accuracy.append(metrics.accuracy_score(y_train, neigh.predict(x_train)))
Test_accuracy.append(metrics.accuracy_score(y_test, y_hat))
# A marker is required for a one-element series to be visible; labels feed the legend.
plt.plot(Test_accuracy, color="r", marker="o", label="Test accuracy")
plt.plot(Train_accuracy, color="c", marker="o", label="Train accuracy")
plt.legend()


KeyboardInterrupt: 

In [20]:
# Logistic-regression baseline: fit on the training split, report accuracy
# on both splits, then persist the fitted model to disk.
LR = LogisticRegression()
LR.fit(x_train,y_train)
yhat = LR.predict(x_test)
print("LogisticRegression")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, LR.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
# The context manager guarantees the file is closed even if pickling fails.
with open("../Pickle/LogisticRegression.pickle", "wb") as classifier_f:
    pickle.dump(LR, classifier_f)



LogisticRegression
Train set Accuracy:  0.7891253130119965
Test set Accuracy:  0.7667982386091401


In [16]:
# Bernoulli naive Bayes: train, predict the held-out split, and report
# accuracy on both the training and the held-out data.
BNB = BernoulliNB().fit(x_train, y_train)
yhat = BNB.predict(x_test)
print("BernoulliNB")
print(
    "Train set Accuracy: ",
    metrics.accuracy_score(y_true=y_train, y_pred=BNB.predict(x_train)),
)
print(
    "Test set Accuracy: ",
    metrics.accuracy_score(y_true=y_test, y_pred=yhat),
)


BernoulliNB
Train set Accuracy:  0.7860488251252523
Test set Accuracy:  0.7584457018564573


In [17]:
# Multinomial naive Bayes: fit and predict in one chain, then report
# accuracy for the training and held-out splits.
MNB = MultinomialNB()
yhat = MNB.fit(x_train, y_train).predict(x_test)
print("MultinomialNB")
print(
    "Train set Accuracy: ",
    metrics.accuracy_score(y_true=y_train, y_pred=MNB.predict(x_train)),
)
print(
    "Test set Accuracy: ",
    metrics.accuracy_score(y_true=y_test, y_pred=yhat),
)


MultinomialNB
Train set Accuracy:  0.786088693218817
Test set Accuracy:  0.7520705468981724


In [18]:
# Disabled experiment: the GaussianNB run is wrapped in a string literal, so
# executing this cell trains nothing and only echoes the text back.
# NOTE(review): presumably disabled because GaussianNB needs dense input and
# the TF-IDF matrix is sparse and very wide — confirm before re-enabling.
'''
GNB = GaussianNB()
GNB.fit(x_train,y_train)
yhat = GNB.predict(x_test)
print(" GaussianNB")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, GNB.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
'''

'\nGNB = GaussianNB()\nGNB.fit(x_train,y_train)\nyhat = GNB.predict(x_test)\nprint(" GaussianNB")\nprint("Train set Accuracy: ", metrics.accuracy_score(y_train, GNB.predict(x_train)))\nprint("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))\n'

In [19]:
"""LSVC = LinearSVC()
LSVC.fit(x_train,y_train)
yhat = LSVC.predict(x_test)
print("LinearSVC")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, LSVC.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
"""

'LSVC = LinearSVC()\nLSVC.fit(x_train,y_train)\nyhat = LSVC.predict(x_test)\nprint("LinearSVC")\nprint("Train set Accuracy: ", metrics.accuracy_score(y_train, LSVC.predict(x_train)))\nprint("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))\n'

In [20]:
"""PSVC = SVC()
PSVC.fit(x_train,y_train)
yhat = PSVC.predict(x_test)
print("SVC")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, PSVC.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
"""

'PSVC = SVC()\nPSVC.fit(x_train,y_train)\nyhat = PSVC.predict(x_test)\nprint("SVC")\nprint("Train set Accuracy: ", metrics.accuracy_score(y_train, PSVC.predict(x_train)))\nprint("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))\n'

In [None]:
# Random-forest baseline: fit on the training split and report accuracy on
# both splits.
# The forest is randomized (bootstrap samples and feature subsets), so the
# seed is pinned to the same value used for the train/test split to make
# the reported accuracies reproducible across runs.
RForest = RandomForestClassifier(random_state=42)
RForest.fit(x_train, y_train)
yhat = RForest.predict(x_test)
print("RandomForestClassifier")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, RForest.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))




In [None]:
# Shallow decision tree (depth capped at 3): train, predict the held-out
# split, and report accuracy on both splits.
DTree = DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
yhat = DTree.predict(x_test)
print("DecisionTreeClassifier")
print(
    "Train set Accuracy: ",
    metrics.accuracy_score(y_true=y_train, y_pred=DTree.predict(x_train)),
)
print(
    "Test set Accuracy: ",
    metrics.accuracy_score(y_true=y_test, y_pred=yhat),
)


In [None]:
# Extremely-randomized single tree (depth capped at 3): fit on the training
# split and report accuracy on both splits.
# Split thresholds are drawn at random, so the seed is pinned to the same
# value used for the train/test split to keep the reported accuracies
# reproducible across runs.
ETree = ExtraTreeClassifier(max_depth=3, random_state=42)
ETree.fit(x_train, y_train)
yhat = ETree.predict(x_test)
print("ExtraTreeClassifier")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, ETree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))


In [None]:
# AdaBoost with default settings: fit and predict in one chain, then report
# accuracy for the training and held-out splits.
Ada = AdaBoostClassifier()
yhat = Ada.fit(x_train, y_train).predict(x_test)
print("AdaBoostClassifier")
print(
    "Train set Accuracy: ",
    metrics.accuracy_score(y_true=y_train, y_pred=Ada.predict(x_train)),
)
print(
    "Test set Accuracy: ",
    metrics.accuracy_score(y_true=y_test, y_pred=yhat),
)


In [None]:
# XGBoost baseline: fit on the training split and report accuracy on both
# splits.
# The notebook-level `import xgboost` is commented out, so the module is
# imported here; without it this cell fails with a NameError. Keeping the
# import local means the other cells still run where xgboost is not installed.
import xgboost

xgb = xgboost.XGBClassifier()
xgb.fit(x_train, y_train)
yhat = xgb.predict(x_test)
print("xgboost")
print("Train set Accuracy: ", metrics.accuracy_score(y_train, xgb.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))


In [None]:
# Gradient-boosted trees with default settings: train, predict the held-out
# split, and report accuracy on both splits.
Gboost = GradientBoostingClassifier().fit(x_train, y_train)
yhat = Gboost.predict(x_test)
print("GradientBoostingClassifier")
print(
    "Train set Accuracy: ",
    metrics.accuracy_score(y_true=y_train, y_pred=Gboost.predict(x_train)),
)
print(
    "Test set Accuracy: ",
    metrics.accuracy_score(y_true=y_test, y_pred=yhat),
)