In [185]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import nltk

In [2]:
profiles = pd.read_csv('./ready_for_cvec.csv', dtype=object, index_col= 0)

In [3]:
profiles.shape

(33653, 21)

In [4]:
profiles = profiles.dropna()

In [5]:
profiles.shape

(23340, 21)

In [6]:
profiles.sentiment_dummies.value_counts()

1     17538
-1     5802
Name: sentiment_dummies, dtype: int64

## Building first model

In [170]:
#tested all of the profile 'clean' levels, i.e. with different components removed/included
X = profiles.without_hashes
y = profiles.sentiment_dummies

In [171]:
tokenizer = nltk.casual.TweetTokenizer(preserve_case=False, reduce_len=True,strip_handles=True)

In [172]:
classifier = LogisticRegression()

In [173]:
cvec = CountVectorizer(tokenizer= tokenizer.tokenize)

In [174]:
X = cvec.fit_transform(X)

In [175]:
cross_val_score(classifier, X, y)

array([0.71966581, 0.71863753, 0.71940874])

In [176]:
#underpredicting negative sentiment from profiles
X_train, X_test, y_train, y_test = train_test_split(X, y)
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)
print(pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True), '\n')

Predicted   -1     1   All
Actual                    
-1         113  1342  1455
1          260  4120  4380
All        373  5462  5835 



## Testing different models against baseline
## and logistic regression before hyperparameter tuning

In [178]:
#basic model testing function
def model_tester(X, y, model, return_vals=False):
    """Prints basic classification matrix and accuracy for a given model"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                        stratify=profiles['sentiment_dummies'], test_size=0.3)
    modeled = model()
    modeled.fit(X_train, y_train)
    predictions = modeled.predict(X_test)
    print(pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True), '\n')
    print(accuracy_score(y_test, predictions))
    if return_vals:
        return X_train, X_test, y_train, y_test

In [180]:
#Baseline- if I predict 1 everytime, I have 75% accuracy
print(profiles['sentiment_dummies'].value_counts(), '\n')
print('BASELINE')
print(17548/profiles.shape[0])

1     17538
-1     5802
Name: sentiment_dummies, dtype: int64 

BASELINE
0.7518423307626393


In [186]:
model_tester(X, y, LogisticRegression)

Predicted   -1     1   All
Actual                    
-1         145  1596  1741
1          331  4930  5261
All        476  6526  7002 

0.7247929163096258


In [187]:
model_tester(X, y, BernoulliNB)

Predicted   -1     1   All
Actual                    
-1         102  1639  1741
1          262  4999  5261
All        364  6638  7002 

0.7285061411025421


In [184]:
model_tester(X, y, RandomForestClassifier)

Predicted    -1     1   All
Actual                     
-1          317  1424  1741
1           850  4411  5261
All        1167  5835  7002 

0.675235646958012


In [188]:
bern = BernoulliNB()

In [224]:
param_grid = {'alpha': [ 1.1, 1.2, 1.3, 1.9, 2.2, 2.4], 'fit_prior':[True, False], 
             }
clf = GridSearchCV(bern, param_grid=param_grid)
clf.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1.1, 1.2, 1.3, 1.9, 2.2, 2.4], 'fit_prior': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [225]:
clf.best_score_

0.7491431019708654

In [226]:
clf.best_estimator_

BernoulliNB(alpha=2.4, binarize=0.0, class_prior=None, fit_prior=True)

In [227]:
print(len(clf.predict(X_test)))
print(np.sum([int(x) for x in list(clf.predict(X_test))]))

5835
5705


## now performing basic test with tfidf

In [180]:
tfidf = TfidfTransformer().fit(df)

In [182]:
df_tfidf = tfidf.transform(df)
ss = StandardScaler()
df_tfidf = ss.fit_transform(df_tfidf.toarray())

In [183]:
#marginally better than just the countvectorizer 
model_tester(df_tfidf, y, MultinomialNB)

Predicted   -1     1   All
Actual                    
-1         207  1537  1744
1          566  4692  5258
All        773  6229  7002 

0.6996572407883462


## incorporating additional profile features

In [190]:
#if building this model just on profiles, without pulling a sample of the user's tweets, drop
#weekday, hour
df = profiles[['number_of_people_they_follow',
          'number_of_user_tweets', 'user_followers_count', 'user_is_verified',
          'weekday', 'hour', 'profile_length', 'percent_in_dictionary', 'number_of_hashes']]

#converting datatypes

for col in list(df.columns):
    try: 
        df[col] = df[col].astype(int)
    except:
        df[col] = df[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [191]:
#standardizing 
df_ss = pd.DataFrame(ss.fit_transform(df), columns=list(df.columns))

In [192]:
Z_merged = pd.concat([Z, df_ss], axis=1)

In [193]:
model_tester(Z_merged, y, MultinomialNB)

Predicted    -1    1   All
Actual                    
-1         1440  203  1643
1          4620  739  5359
All        6060  942  7002 

0.3111968009140246


In [200]:
pca = PCA()
pca.fit(df)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [204]:
np.cumsum(pca.explained_variance_ratio_)

array([0.58399678, 0.99342875, 0.99999961, 0.99999999, 1.        ,
       1.        , 1.        , 1.        , 1.        ])

In [205]:
df_transformed = pca.transform(df)

In [208]:
col_headers = ['PC' + str(i) for i in range(1, 10)]
df_transformed = pd.DataFrame(df_transformed, columns=col_headers)
df_transformed = df_transformed.drop(['PC' + str(i) for i in range(3, 10)], axis=1)

In [210]:
Z_merged = pd.concat([Z, df_transformed], axis=1)

In [213]:
model_tester(df, y, MultinomialNB)

Predicted    -1     1   All
Actual                     
-1         1498   251  1749
1          4503   750  5253
All        6001  1001  7002 

0.32105112824907167


## unused LSA dimensionality reduction below

In [None]:
#LOOKING AT LSA (truncatedSVD)

# #Looked at reducing dimensionality, classifying positive more than 98% of time
# lsa = TruncatedSVD(n_components=100)
# lsa.fit(X)
# X = lsa.transform(X)

# #turning PCA into dataframe
# cols = ['PCA' + str(i) for i in range(1, 101)]
# df = pd.DataFrame(X, columns=cols)

# #very high scores, but... see below
# cross_val_score(classifier, df, y)

# #Seeing what was happening- predicted almost entirely positive sentiments
# lr = LogisticRegression()
# X_train, X_test, y_train, y_test = train_test_split(df, y)
# lr.fit(X_train, y_train)
# predictions = lr.predict(X_test)
# print(pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True), '\n')