In [32]:
import pandas as pd
import numpy as np
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Getting the data

In [2]:
# Training corpus: a tab-separated file without a header row, so the two
# column names are supplied at read time.
tweets = pd.read_csv(
    'dataset/general-tweets.txt',
    sep="\t",
    header=None,
    names=['label', 'tweets'],
)
tweets.head()

Unnamed: 0,label,tweets
0,NOT,Bumping dj sefs mixtape nowww this is my music...
1,NOT,#ieroween THE STORY OF IEROWEEN! THE VIDEO ->>...
2,NOT,trick or treating at the mall today; ZOO! last...
3,NOT,@Ussk81 PMSL!!! I try not to stare but I can't...
4,NOT,@Sc0rpi0n676 btw - is there a remote chance i ...


In [3]:
# Features are the raw tweet texts; targets are the (still string-valued) labels.
x_train, y_train = tweets['tweets'], tweets['label']

In [4]:
# Evaluation corpus: same tab-separated, header-less layout as the training
# file, so the column names are again supplied at read time.
tweets_test = pd.read_csv(
    'dataset/keyword-tweets.txt',
    sep="\t",
    header=None,
    names=['label', 'tweets'],
)
tweets_test.head()

Unnamed: 0,label,tweets
0,POLIT,Global Voices Online Â» Alex Castro: A liberal...
1,POLIT,Do the Conservatives Have a Death Wish? http:/...
2,NOT,@MMFlint I've seen all of your movies and Capi...
3,POLIT,RT @AllianceAlert: * House Dems ask for civili...
4,POLIT,RT @AdamSmithInst Quote of the week: My politi...


In [5]:
# Same feature/target split as for the training data, applied to the test frame.
x_test, y_test = tweets_test['tweets'], tweets_test['label']

# Report how many examples each split holds.
size_report = "Train size : {}, Test Size : {}"
print(size_report.format(x_train.shape, x_test.shape))

Train size : (2000,), Test Size : (2004,)


In [6]:
# Converting Str labels to int: "NOT" -> 0, any other label -> 1.
# Encode straight from the source DataFrames rather than from y_train / y_test
# themselves: overwriting a variable with a function of itself makes the cell
# non-idempotent (a second run would compare the 0/1 ints against "NOT" and
# silently collapse every label to 1).
y_train = np.where(tweets['label'] == "NOT", 0, 1)
y_test = np.where(tweets_test['label'] == "NOT", 0, 1)

In [7]:
# Peek at the encoded label arrays (0 = "NOT", 1 = any other label).
y_train, y_test

(array([0, 0, 0, ..., 0, 0, 1]), array([1, 1, 0, ..., 1, 1, 0]))

### Tokenize and vectorize

In [15]:
# Bag-of-words features: learn the vocabulary from the training tweets only
# (with the built-in English stop-word list removed) and encode each tweet as
# a row of token counts. The result is converted to a dense array in a later cell.
vect = CountVectorizer(stop_words='english')
x_train_ = vect.fit_transform(x_train)
# (n_tweets, vocabulary_size)
x_train_.shape

(2000, 8807)

In [16]:
# Materialise the count matrix as a dense ndarray for the model cells below.
# NOTE(review): LogisticRegression.fit is documented to accept sparse matrices,
# so this dense copy may be avoidable — confirm nothing downstream needs an ndarray.
x_train_vect = x_train_.toarray()

In [17]:
# Encode the test tweets with the vocabulary already learned from the training
# set (transform only — the vectorizer is not refitted), then densify, mirroring
# the x_train_ -> x_train_vect steps used for the training data.
x_test_ = vect.transform(x_test)
x_test_vect = x_test_.toarray()
x_test_vect.shape

(2004, 8807)

## Build the Model

In [21]:
# Baseline: logistic regression with library-default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression().fit(x_train_vect, y_train)
clf

LogisticRegression()

In [23]:
# Test-set accuracy of the baseline; for a classifier, score() reports the
# mean accuracy of its predictions on the given data.
clf.score(x_test_vect, y_test)

0.15718562874251496

### Understanding why the accuracy is so bad

In [28]:
# Break the test predictions down into the four binary outcomes.
# labels=[0, 1] pins the matrix to a 2x2 layout (rows = true class, columns =
# predicted class) so ravel() always yields tn, fp, fn, tp; without it the
# matrix shrinks when only one class is present and the 4-way unpacking fails.
test_pred = clf.predict(x_test_vect)
tn, fp, fn, tp = confusion_matrix(y_test, test_pred, labels=[0, 1]).ravel()
print("True negatives : {}\nFalse positives : {}\nFalse negatives : {}\nTrue positives : {}".format(tn,fp,fn,tp))

True negatives : 313
False positives : 0
False negatives : 1689
True positives : 2


In [31]:
# y_test holds 0/1 after encoding, so the sum is the number of positive
# (label 1) test examples.
y_test.sum()

1691

In [46]:
# Total number of test examples, for comparison with the positive count.
y_test.shape

(2004,)

### Performing Regularisation

In [33]:
# Tune the penalty type and regularisation strength with 10-fold
# cross-validation. C is the inverse regularisation strength, so the grid
# sweeps from very strong (1e-4) to very weak (1e3) regularisation.
parameters = {'penalty':['l1','l2'],
             'C':[0.0001,0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# liblinear handles both penalties in the grid. It shuffles the data
# internally, so random_state is pinned to make the search reproducible.
lr = LogisticRegression(solver='liblinear', random_state=0)
# NOTE: this rebinds `clf`, replacing the baseline model fitted earlier.
# NOTE(review): GridSearchCV selects on its default score here — confirm that
# is the right criterion if the training labels are imbalanced.
clf = GridSearchCV(lr, parameters, cv=10)
clf.fit(x_train_vect,y_train)



GridSearchCV(cv=10, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2']})

In [34]:
# Hyper-parameter combination selected by the grid search.
clf.best_params_

{'C': 10, 'penalty': 'l1'}

In [37]:
# The estimator configured with the selected hyper-parameters.
clf.best_estimator_

LogisticRegression(C=10, penalty='l1', solver='liblinear')

In [40]:
# Compare accuracy on the data the search was fitted on with accuracy on the
# test tweets; a large gap between the two indicates poor generalisation.
train_accuracy = accuracy_score(y_train, clf.predict(x_train_vect))
test_accuracy = accuracy_score(y_test, clf.predict(x_test_vect))
print("Training Accuracy : {}, Testing Accuracy : {}".format(train_accuracy, test_accuracy))

Training Accuracy : 1.0, Testing Accuracy : 0.2884231536926148


In [50]:
# Inspect the learned weights of the selected model (the search picked
# penalty='l1', which typically drives many coefficients to exactly zero).
clf.best_estimator_.coef_

array([[0., 0., 0., ..., 0., 0., 0.]])