In [1]:
import pandas as pd
import numpy as np  
import re
import nltk 
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv("training.csv",
                names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')
df.head()

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [2]:
df.polarity.value_counts()

4    800000
0    800000
Name: polarity, dtype: int64

In [3]:
df1 = df.sample(n=10000)
df1.polarity.value_counts()

0    5006
4    4994
Name: polarity, dtype: int64

In [4]:
features = df1['text'].values
labels = df1['polarity'].values

In [5]:
features.shape

(10000,)

In [6]:
labels

array([0, 4, 0, ..., 0, 0, 4], dtype=int64)

In [7]:
processed_features = []

for sentence in range(0, len(features)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [8]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=4)

In [10]:
from sklearn.svm import SVC

In [11]:
s=SVC(kernel='linear',gamma='scale')

In [12]:
s.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
predictions=s.predict(X_test)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[700 274]
 [313 713]]
              precision    recall  f1-score   support

           0       0.69      0.72      0.70       974
           4       0.72      0.69      0.71      1026

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000

0.7065


In [16]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [17]:
predictions_r = text_classifier.predict(X_test)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions_r))
print(classification_report(y_test,predictions_r))
print(accuracy_score(y_test, predictions_r))

[[679 295]
 [287 739]]
              precision    recall  f1-score   support

           0       0.70      0.70      0.70       974
           4       0.71      0.72      0.72      1026

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000

0.709


In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
lr=LogisticRegression(C=0.01,solver='liblinear')
lr

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
lr.fit(X_train,y_train)
predictions_l=lr.predict(X_test)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions_l))
print(classification_report(y_test,predictions_l))
print(accuracy_score(y_test, predictions_l))

[[819 155]
 [498 528]]
              precision    recall  f1-score   support

           0       0.62      0.84      0.71       974
           4       0.77      0.51      0.62      1026

    accuracy                           0.67      2000
   macro avg       0.70      0.68      0.67      2000
weighted avg       0.70      0.67      0.67      2000

0.6735


In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
tree=DecisionTreeClassifier()

In [28]:
tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [29]:
predictions_d=tree.predict(X_test)

In [30]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions_d))
print(classification_report(y_test,predictions_d))
print(accuracy_score(y_test, predictions_d))

[[596 378]
 [343 683]]
              precision    recall  f1-score   support

           0       0.63      0.61      0.62       974
           4       0.64      0.67      0.65      1026

    accuracy                           0.64      2000
   macro avg       0.64      0.64      0.64      2000
weighted avg       0.64      0.64      0.64      2000

0.6395


In [33]:
df = pd.read_csv("testing.csv",
                names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')
df.head()

Unnamed: 0,polarity,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [34]:
features_test = df1['text'].values
labels_test= df1['polarity'].values

In [35]:
processed_features_test = []

for sentence in range(0, len(features_test)):
    # Remove all the special characters
    processed_feature_test = re.sub(r'\W', ' ', str(features_test[sentence]))

    # remove all single characters
    processed_feature_test= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature_test)

    # Remove single characters from the start
    processed_feature_test = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature_test) 

    # Substituting multiple spaces with single space
    processed_feature_test = re.sub(r'\s+', ' ', processed_feature_test, flags=re.I)

    # Removing prefixed 'b'
    processed_feature_test = re.sub(r'^b\s+', '', processed_feature_test)

    # Converting to Lowercase
    processed_feature_test = processed_feature_test.lower()

    processed_features_test.append(processed_feature_test)

In [36]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_features_test = vectorizer.fit_transform(processed_features_test).toarray()

In [37]:
predictions_svm_test=s.predict(processed_features_test)

In [38]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(labels_test,predictions_svm_test))
print(classification_report(labels_test,predictions_svm_test))
print(accuracy_score(labels_test, predictions_svm_test))

[[3889 1117]
 [1052 3942]]
              precision    recall  f1-score   support

           0       0.79      0.78      0.78      5006
           4       0.78      0.79      0.78      4994

    accuracy                           0.78     10000
   macro avg       0.78      0.78      0.78     10000
weighted avg       0.78      0.78      0.78     10000

0.7831


In [39]:
predictions_rf_test=text_classifier.predict(processed_features_test)

In [40]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(labels_test,predictions_rf_test))
print(classification_report(labels_test,predictions_rf_test))
print(accuracy_score(labels_test, predictions_rf_test))

[[4596  410]
 [ 317 4677]]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      5006
           4       0.92      0.94      0.93      4994

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000

0.9273


In [41]:
predictions_lr_test=lr.predict(processed_features_test)

In [42]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(labels_test,predictions_lr_test))
print(classification_report(labels_test,predictions_lr_test))
print(accuracy_score(labels_test, predictions_lr_test))

[[4254  752]
 [2159 2835]]
              precision    recall  f1-score   support

           0       0.66      0.85      0.75      5006
           4       0.79      0.57      0.66      4994

    accuracy                           0.71     10000
   macro avg       0.73      0.71      0.70     10000
weighted avg       0.73      0.71      0.70     10000

0.7089


In [43]:
predictions_tree_test=tree.predict(processed_features_test)

In [44]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(labels_test,predictions_tree_test))
print(classification_report(labels_test,predictions_tree_test))
print(accuracy_score(labels_test, predictions_tree_test))

[[4525  481]
 [ 385 4609]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      5006
           4       0.91      0.92      0.91      4994

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

0.9134
