In [13]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords

import re
from bs4 import BeautifulSoup 

from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
# Load the labelled tweet dataset. Later cells read its "tweet" column (raw text)
# and its "class" column (label, cast to int before the train/test split).
# NOTE(review): latin-1 decoding never raises, so a UTF-8 file would be silently
# mis-decoded rather than rejected -- confirm the file's real encoding.
data = pd.read_csv("labeled_data.csv",encoding="latin-1")

In [16]:
_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() re-reads the corpus on every call; cache the result
        # so cleaning a large number of tweets does not repeat that work.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def tweet_to_words(raw_tweet):
    """Normalise one raw tweet into a space-separated string of lower-case tokens.

    Steps:
      1. Strip HTML markup / decode entities with BeautifulSoup.
      2. Lower-case and split on whitespace (punctuation and digits are kept;
         no non-letter filtering is applied).
      3. Drop NLTK English stop words.

    Parameters
    ----------
    raw_tweet : str
        The tweet text as read from the dataset.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so output could differ
    # between machines.
    tweet_text = BeautifulSoup(raw_tweet, "html.parser").get_text()

    tokens = tweet_text.lower().split()

    # Set membership is O(1), versus scanning a list for every token.
    stops = _english_stopwords()
    meaningful_words = [w for w in tokens if w not in stops]

    return " ".join(meaningful_words)

In [17]:
# Clean every tweet in the dataset, in row order, into `clean_train_tweets`.
# Iterate over the "tweet" column directly: `data.iloc[i]["tweet"]` materialises
# a whole row Series on every iteration, which is needlessly slow here.
num_tweets = data["tweet"].size
clean_train_tweets = []
for i, raw_tweet in enumerate(data["tweet"]):
    # Progress message every 10,000 tweets (1-based count).
    if (i + 1) % 10000 == 0:
        print("Tweet %d of %d\n" % (i + 1, num_tweets))
    clean_train_tweets.append(tweet_to_words(raw_tweet))

Tweet 10000 of 121054

Tweet 20000 of 121054

Tweet 30000 of 121054

Tweet 40000 of 121054

Tweet 50000 of 121054

Tweet 60000 of 121054



  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Tweet 70000 of 121054

Tweet 80000 of 121054

Tweet 90000 of 121054

Tweet 100000 of 121054

Tweet 110000 of 121054

Tweet 120000 of 121054



In [18]:
# Build the TF-IDF vectorizer (word-level uni-, bi- and tri-grams) and fit it on
# the cleaned tweets. Line continuations are unnecessary inside the parentheses.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    tokenizer=None,
    preprocessor=None,
    stop_words=None,      # stop words were already removed in tweet_to_words
    max_features=300,     # cap on vocabulary size
    max_df=0.85,          # upper document-frequency cut-off for terms
)
train_data_features = vectorizer.fit_transform(clean_train_tweets)

In [7]:
# Convert the TF-IDF matrix to a dense array and build the train/test split.
# The guard keeps this cell idempotent: after the first run
# `train_data_features` is already an ndarray (which has no .toarray()), so an
# unguarded re-run of the cell would raise AttributeError.
if hasattr(train_data_features, "toarray"):
    train_data_features = train_data_features.toarray()
X = pd.DataFrame(train_data_features)
y = data['class'].astype(int)
# Hold out 33% of the rows for final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=69, test_size=0.33)

Logistic Regression

In [15]:
# Pipeline: L1-penalised logistic regression selects features, then an
# L2-penalised logistic regression classifies on the selected columns.
# solver='liblinear' is stated explicitly because the L1 penalty is not
# supported by the 'lbfgs' solver that newer scikit-learn releases default to.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty="l1", C=0.01,
                                                  solver='liblinear'))),
    ('model', LogisticRegression(class_weight='balanced', penalty='l2')),
])

# Empty grid: GridSearchCV only cross-validates this single configuration and
# then refits it on all of X_train.
param_grid = [{}]
# Pass the splitter itself rather than a one-shot generator from .split().
# random_state is omitted because it has no effect without shuffle=True (and
# newer scikit-learn rejects that combination); the folds are unchanged.
grid_search = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(n_splits=5), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   1.3s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ................................................. , total=   1.3s
[CV]  ................................................................
[CV] ................................................. , total=   1.1s
[CV]  ................................................................
[CV] ................................................. , total=   1.1s
[CV]  ................................................................
[CV] ................................................. , total=   1.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.7s finished


In [16]:
# Score the fitted search object from the cell above on the held-out split and
# print the per-class precision / recall / F1 / support table.
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.85      0.61      0.71      8186
           1       0.95      0.77      0.85     14805
           2       0.74      0.97      0.84     16957

   micro avg       0.82      0.82      0.82     39948
   macro avg       0.85      0.78      0.80     39948
weighted avg       0.84      0.82      0.82     39948



Random Forest

In [19]:
# Pipeline: L1-penalised logistic regression selects features, then a
# 300-tree random forest classifies on the selected columns.
# solver='liblinear' is stated explicitly because the L1 penalty is not
# supported by the 'lbfgs' solver that newer scikit-learn releases default to.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty="l1", C=0.01,
                                                  solver='liblinear'))),
    ('model', RandomForestClassifier(n_estimators=300, random_state=0)),
])

# Empty grid: GridSearchCV only cross-validates this single configuration and
# then refits it on all of X_train.
param_grid = [{}]
# Pass the splitter itself rather than a one-shot generator from .split().
# random_state is omitted because it has no effect without shuffle=True (and
# newer scikit-learn rejects that combination); the folds are unchanged.
grid_search = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(n_splits=5), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  41.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.8s remaining:    0.0s


[CV] ................................................. , total=  44.4s
[CV]  ................................................................
[CV] ................................................. , total=  40.2s
[CV]  ................................................................
[CV] ................................................. , total=  40.7s
[CV]  ................................................................
[CV] ................................................. , total=  39.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.3min finished


In [20]:
# Score the fitted search object from the cell above on the held-out split and
# print the per-class precision / recall / F1 / support table.
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.84      0.63      0.72      8186
           1       0.93      0.80      0.86     14805
           2       0.76      0.94      0.84     16957

   micro avg       0.83      0.83      0.83     39948
   macro avg       0.84      0.79      0.81     39948
weighted avg       0.84      0.83      0.82     39948



Linear SVC

In [22]:
# Pipeline: L1-penalised logistic regression selects features, then a linear
# SVM (C=0.05) classifies on the selected columns.
# solver='liblinear' is stated explicitly because the L1 penalty is not
# supported by the 'lbfgs' solver that newer scikit-learn releases default to.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty="l1", C=0.01,
                                                  solver='liblinear'))),
    ('model', LinearSVC(C=0.05, random_state=0)),
])

# Empty grid: GridSearchCV only cross-validates this single configuration and
# then refits it on all of X_train.
param_grid = [{}]
# Pass the splitter itself rather than a one-shot generator from .split().
# random_state is omitted because it has no effect without shuffle=True (and
# newer scikit-learn rejects that combination); the folds are unchanged.
grid_search = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(n_splits=5), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   1.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV] ................................................. , total=   0.9s
[CV]  ................................................................
[CV] ................................................. , total=   0.9s
[CV]  ................................................................
[CV] ................................................. , total=   1.1s
[CV]  ................................................................
[CV] ................................................. , total=   1.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.1s finished


In [23]:
# Score the fitted search object from the cell above on the held-out split and
# print the per-class precision / recall / F1 / support table.
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.91      0.58      0.71      8186
           1       0.95      0.77      0.85     14805
           2       0.74      0.98      0.84     16957

   micro avg       0.82      0.82      0.82     39948
   macro avg       0.86      0.78      0.80     39948
weighted avg       0.85      0.82      0.82     39948



Extra Trees

In [31]:
# Pipeline: L1-penalised logistic regression selects features, then a single
# extremely-randomised tree classifies on the selected columns.
# solver='liblinear' is stated explicitly because the L1 penalty is not
# supported by the 'lbfgs' solver that newer scikit-learn releases default to.
# random_state=0 makes the randomised tree reproducible, matching the seeded
# models in the other cells.
# NOTE(review): ExtraTreeClassifier is ONE tree; the section title suggests the
# ensemble sklearn.ensemble.ExtraTreesClassifier may have been intended -- confirm.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty="l1", C=0.01,
                                                  solver='liblinear'))),
    ('model', ExtraTreeClassifier(random_state=0)),
])

# Empty grid: GridSearchCV only cross-validates this single configuration and
# then refits it on all of X_train.
param_grid = [{}]
# Pass the splitter itself rather than a one-shot generator from .split().
# random_state is omitted because it has no effect without shuffle=True (and
# newer scikit-learn rejects that combination); the folds are unchanged.
grid_search = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(n_splits=5), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   0.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV] ................................................. , total=   0.8s
[CV]  ................................................................
[CV] ................................................. , total=   0.9s
[CV]  ................................................................
[CV] ................................................. , total=   0.7s
[CV]  ................................................................
[CV] ................................................. , total=   0.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.4s finished


In [32]:
# Score the fitted search object from the cell above on the held-out split and
# print the per-class precision / recall / F1 / support table.
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.82      0.64      0.71      8186
           1       0.93      0.80      0.86     14805
           2       0.76      0.93      0.84     16957

   micro avg       0.82      0.82      0.82     39948
   macro avg       0.84      0.79      0.81     39948
weighted avg       0.84      0.82      0.82     39948



Naive Bayes

In [35]:
# Pipeline: L1-penalised logistic regression selects features, then a Bernoulli
# naive Bayes model classifies on the selected columns.
# solver='liblinear' is stated explicitly because the L1 penalty is not
# supported by the 'lbfgs' solver that newer scikit-learn releases default to.
pipe = Pipeline([
    ('select', SelectFromModel(LogisticRegression(class_weight='balanced', penalty="l1", C=0.01,
                                                  solver='liblinear'))),
    ('model', BernoulliNB()),
])

# Empty grid: GridSearchCV only cross-validates this single configuration and
# then refits it on all of X_train.
param_grid = [{}]
# Pass the splitter itself rather than a one-shot generator from .split().
# random_state is omitted because it has no effect without shuffle=True (and
# newer scikit-learn rejects that combination); the folds are unchanged.
grid_search = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(n_splits=5), verbose=2)
model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   0.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ................................................. , total=   0.8s
[CV]  ................................................................
[CV] ................................................. , total=   0.8s
[CV]  ................................................................
[CV] ................................................. , total=   0.8s
[CV]  ................................................................
[CV] ................................................. , total=   0.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.1s finished


In [36]:
# Score the fitted search object from the cell above on the held-out split and
# print the per-class precision / recall / F1 / support table.
y_preds = model.predict(X_test)
report = classification_report( y_test, y_preds )
print(report)

              precision    recall  f1-score   support

           0       0.88      0.60      0.71      8186
           1       0.87      0.79      0.83     14805
           2       0.75      0.93      0.83     16957

   micro avg       0.81      0.81      0.81     39948
   macro avg       0.83      0.77      0.79     39948
weighted avg       0.82      0.81      0.80     39948

