In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, precision_score, recall_score, precision_recall_fscore_support, jaccard_similarity_score
import matplotlib.pyplot as plt

In [2]:
all_tweets = pd.read_csv('postProcessedText.csv')

In [3]:
all_tweets.head()

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,Worry is a down payment on a problem you may n...,0,1,0,0,0,0,1,0,0,0,1
1,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,it also helps that the majority of NFL coach...,1,0,1,0,1,0,1,0,0,0,0
3,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,My roommate it's okay that we can't spell beca...,1,0,1,0,0,0,0,0,0,0,0


In [4]:
X_train = all_tweets.Tweet
y_train = all_tweets.drop(labels = 'Tweet', axis =1)

In [26]:
test_df = pd.read_csv('data/devPostProcessedText.csv')
X_test = test_df.Tweet
y_test = test_df.drop(labels = 'Tweet', axis =1)

In [28]:
y_pred = search.predict(X_test)
test_score = search.score(X_test, y_test)
print('Test Score: ', test_score)

Test Score:  0.3781602708803612


In [88]:
#from sklearn.multiclass import OneVsRestClassifier
pipe2 = Pipeline(
steps=[
('vectorizer', CountVectorizer(lowercase=False, stop_words='english')), 
('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))
])

In [89]:
pipe2.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
 ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [90]:
y_pred = pipe2.predict(X_test)
score = jaccard_similarity_score(y_test, y_pred)

In [92]:
y_pred_train = pipe2.predict(X_train)
train_score = jaccard_similarity_score(y_train, y_pred_train)

In [91]:
score

0.37654251316779536

In [93]:
train_score

0.9946743687237983

In [71]:
def get_forest(pipeline, X, y):
    pipeline.fit(X, y)
    forest = pipeline.named_steps['clf'].estimators_
    return forest

In [72]:
def get_all_features(forest, vocab):
    all_important_features = set()
    for tree in forest:
        f_importance = np.array(tree.feature_importances_)
        # sorted indices
        sort_f_indices = sorted(range(len(f_importance)), key = lambda k: f_importance[k], reverse = True)
        sort_f_importance = f_importance[sort_f_indices]
        # non_zero importance
        non_zero_sort_f_importance = [x for x in sort_f_importance if x]
        non_zero_sort_f_indices = sort_f_indices[:len(non_zero_sort_f_importance)]
        for i in non_zero_sort_f_indices:
            feature = vocab[i]
            all_important_features.add(feature)
    return all_important_features

In [73]:
vectorizer = CountVectorizer(lowercase=False, stop_words='english')
vectorizer.fit(X_train)
vect_tweets = vectorizer.transform(X_train)


In [74]:
def get_vocab(vectorizer):
    return vectorizer.vocabulary_

In [75]:
vocab = get_vocab(vectorizer)

In [76]:
vocab = {i:v for v,i in vocab.items()}

In [77]:
f = get_all_features(forest, vocab)

In [78]:
anger_forest = get_forest(pipe2, X_train, y_train['anger'])

In [79]:
anger_features = get_all_features(anger_forest, vocab)

In [80]:
anger_features

{'Scotland',
 'Thoroughly',
 'interrupt',
 'injury',
 'met',
 'realisation',
 'Hands',
 'steak',
 'Indignation',
 'Deep',
 'relieves',
 'Accidental',
 '2hr',
 'crimewatch',
 'Emotions',
 '_emoj_boom_',
 'induction',
 'ballons',
 'Smile',
 'knee',
 'cantconacon',
 'asset',
 'Rojo',
 'welfarereform',
 'AfAm',
 'elect',
 'nhf16',
 'revisiontaking',
 'expansion',
 'blow',
 'fat',
 'seafood',
 'flt',
 'class',
 'Fallon',
 'Jones',
 '_emoj_medium_white_circle_',
 'binary',
 'ab',
 'impressive',
 'introducing',
 'compelling',
 'DIE',
 'ethic',
 'ill',
 'badbusiness',
 'vandalizing',
 'sike',
 'Vols',
 'catching',
 'hut',
 'burns',
 'Love',
 'Notorious',
 'classmate',
 'Halifax',
 'ressssst',
 'Making',
 'moretears',
 'rioting',
 'cal',
 'checking',
 'permanent',
 'pupper',
 'inept',
 'THINGS',
 'darksocial',
 'DETROIT',
 'Idina',
 'tweets',
 'September',
 'solo',
 'IPCA',
 'Vespers',
 'lameduck',
 'jam',
 'crookedstuck',
 'duck',
 'urghh',
 'Troles',
 'bibleverse',
 'absurd',
 'moment',
 'ref

In [81]:
len(anger_features)

14546