In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import feature_selection
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 
import re
from tqdm import tqdm

In [2]:
full = pd.read_csv('../complete_data_models.csv', dtype=object, index_col= 0)

In [3]:
#eliminating 20,000 null profiles
full.drop(list(full[(full.user_profile_text.isnull())].index), axis=0, inplace=True)
full.reset_index(drop=True, inplace=True)

In [4]:
no_neutrals = full.copy()
no_neutrals = no_neutrals.loc[(no_neutrals.sentiment_dummies.isin(['1', '-1']))].reset_index(drop=True)
no_neutrals.shape

(67658, 21)

In [5]:
no_neutrals.drop_duplicates(subset='user_name', inplace=True)
no_neutrals.reset_index(drop=True, inplace=True)
no_neutrals.shape

(51500, 21)

In [6]:
no_neutrals.sentiment_dummies.value_counts()
print('Baseline: ',37397 / (37397 + 14103))

Baseline:  0.7261553398058253


In [7]:
#going to clean X 
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
stop.add('would') 

def cleaner(title):
    """ This function accepts a string, tokenizes it, removes stopwords,
    lemmatizes them, and then return the words as a joined string"""
    tokenizer = RegexpTokenizer(r'\w+')
    text = re.sub(r'(https)[^\s]+', ' ', title)
    text = re.sub(r'@[a-zA-Z0-9]+', ' ', text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = re.sub(r'\b(rt|RT)', ' ', text)
    text = re.sub(r"[^a-zA-Z]", ' ', text)
    profile = tokenizer.tokenize(text.lower())
    lst = [word for word in profile if word not in stop]
    lemons = [lemmatizer.lemmatize(word) for word in lst]
    short_lemons = [word for word in lemons if len(word) > 2]
    final = ' '.join(short_lemons)
    return final

no_neutrals['user_profile_text'] = no_neutrals.user_profile_text.apply(cleaner)

In [8]:
X = no_neutrals['user_profile_text']
y = no_neutrals['sentiment_dummies']

In [9]:
cvec = CountVectorizer(min_df=2, ngram_range=(1, 2))
cvec.fit(X)
X = cvec.transform(X)

In [10]:
X = pd.DataFrame(X.todense(), columns= cvec.get_feature_names())

In [11]:
tfidf_transformer = TfidfTransformer().fit(X)

In [12]:
tfidf = tfidf_transformer.transform(X)

In [13]:
# X_train, X_test, y_train, y_test = train_test_split(tfidf, y, stratify=y)

In [14]:
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=1)
fs.fit(tfidf, y)

SelectPercentile(percentile=1, score_func=<function chi2 at 0x10dc1a6a8>)

In [15]:
word_import = pd.DataFrame(index=X.columns[fs.get_support()], data=fs.scores_[fs.get_support()]).reset_index()
word_import.columns = ['word' ,'score']
word_import.sort_values(by='score', ascending=False).head(15)

Unnamed: 0,word,score
65,bts,17.333113
408,twt,12.55236
114,directioner,8.898642
148,fan account,8.459397
90,colby,8.395904
450,youngest,7.43603
387,stupid,6.670543
74,candle,6.621854
34,badass,6.57972
19,army,6.258173


In [16]:
#selecting feature importance based on whole dataset first, then will do on just training
#testing with 35% most important features
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=35)
fs.fit(tfidf, y)

SelectPercentile(percentile=35, score_func=<function chi2 at 0x10dc1a6a8>)

In [13]:
df_tf = pd.DataFrame(tfidf.toarray())
df_tf.columns = cvec.get_feature_names()

In [18]:
X_35 = df_tf[list(df_tf.columns[fs.get_support()])]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_35, y, stratify=y)

In [20]:
nb = BernoulliNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print('Bernoulli- 35% most important features- with TFIDF')
print(pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True))
print('accuracy: ', accuracy_score(y_test, predictions))

Bernoulli- 35% most important features- with TFIDF
Predicted    -1      1    All
Actual                       
-1          777   2749   3526
1           593   8756   9349
All        1370  11505  12875
accuracy:  0.7404271844660194


In [15]:
#beat the BASELINE! 

In [18]:
#testing what percentage of features to keep
results = []
for i in tqdm(range(15, 40, 5)):
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    fs.fit(tfidf, y)
    X = df_tf[list(df_tf.columns[fs.get_support()])]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    nb = BernoulliNB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    results.append(accuracy_score(y_test, predictions))

100%|██████████| 5/5 [04:35<00:00, 55.11s/it]


In [19]:
results

[0.7557281553398059,
 0.7529320388349514,
 0.7514563106796116,
 0.747495145631068,
 0.7403495145631068]

In [20]:
#getting smaller values
results_2 = []
for i in tqdm([2, 4, 6, 8, 10]):
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    chis = fs.fit_transform(tfidf, y)
    X = df_tf[list(df_tf.columns[fs.get_support()])]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    nb = BernoulliNB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    results_2.append(accuracy_score(y_test, predictions))

100%|██████████| 5/5 [02:19<00:00, 27.81s/it]


In [22]:
results_2

[0.7499805825242718,
 0.7631844660194175,
 0.7685436893203883,
 0.7666796116504855,
 0.7626407766990291]

In [23]:
#percentage of features to use (with Bernoulli), for graphics
feat = pd.DataFrame()
feat['percentage'] = [2, 4, 6, 8, 10] + list(range(15, 40, 5))
feat['accuracy_score'] = results_2 + results
feat['number_of_features'] = [45362 * (x/100) for x in [2, 4, 6, 8, 10] + list(range(15, 40, 5))]

In [14]:
def model_tester(X_train, X_test, y_train, y_test, model, **kwargs):
    """Prints basic classification matrix and accuracy for a given model"""
    modeled = model(**kwargs)
    modeled.fit(X_train, y_train)
    predictions = modeled.predict(X_test)
    print(pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True), '\n')
    print('Overall Accuracy', accuracy_score(y_test, predictions))
    try:
        print('Recall: ', pd.crosstab(y_test, predictions).iloc[1, 1] / 
              (pd.crosstab(y_test, predictions).iloc[1, 1] + pd.crosstab(y_test, predictions).iloc[1, 0]))
        print('Precision: ', pd.crosstab(y_test, predictions).iloc[1, 1] / 
                     (pd.crosstab(y_test, predictions).iloc[1, 1] + pd.crosstab(y_test, predictions).iloc[0, 1]))
    except IndexError:
        print('Precision: ', pd.crosstab(y_test, predictions).iloc[1, 0] / 
             (pd.crosstab(y_test, predictions).iloc[1, 0] + pd.crosstab(y_test, predictions).iloc[0, 0]))
        print('Recall: ', 100)

In [15]:
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=6)
fs.fit(tfidf, y)

SelectPercentile(percentile=6, score_func=<function chi2 at 0x10bf9a6a8>)

In [16]:
X_6 = df_tf[list(df_tf.columns[fs.get_support()])]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y, stratify=y)

In [33]:
model_tester(X_train, X_test, y_train, y_test, BernoulliNB)

Predicted   -1      1    All
Actual                      
-1         740   2786   3526
1          203   9146   9349
All        943  11932  12875 

Overall Accuracy 0.7678446601941747
Recall:  0.9782864477484223
Precision:  0.7665102246061012


In [20]:
model_tester(X_train, X_test, y_train, y_test, LogisticRegression)

Predicted  -1      1    All
Actual                     
-1         11   3515   3526
1          13   9336   9349
All        24  12851  12875 

Overall Accuracy 0.7259805825242719
Recall:  0.9986094769494064
Precision:  0.7264804295385573


In [35]:
model_tester(X_train, X_test, y_train, y_test, RandomForestClassifier)

Predicted    -1      1    All
Actual                       
-1          720   2806   3526
1           776   8573   9349
All        1496  11379  12875 

Overall Accuracy 0.7217864077669903
Recall:  0.9169964702107177
Precision:  0.7534053959047368


In [36]:
model_tester(X_train, X_test, y_train, y_test, AdaBoostClassifier)

Predicted  -1      1    All
Actual                     
-1         28   3498   3526
1          45   9304   9349
All        73  12802  12875 

Overall Accuracy 0.7248155339805825
Recall:  0.9951866509787143
Precision:  0.7267614435244493


In [21]:
model_tester(X_train, X_test, y_train, y_test, KNeighborsClassifier)

Predicted   -1      1    All
Actual                      
-1         329   3197   3526
1          314   9035   9349
All        643  12232  12875 

Overall Accuracy 0.7273009708737864
Recall:  0.9664135201625842
Precision:  0.7386363636363636


In [24]:
model_tester(X_train, X_test, y_train, y_test, SVC, kernel='linear')

Predicted  -1      1    All
Actual                     
-1         12   3514   3526
1          12   9337   9349
All        24  12851  12875 

Overall Accuracy 0.726135922330097
Recall:  0.9987164402609905
Precision:  0.7265582444945918
