In [111]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, precision_score, recall_score, precision_recall_fscore_support
import matplotlib.pyplot as plt

In [2]:
# Load the post-processed tweet corpus; it is used further down to fit the
# TF-IDF vocabulary.
all_tweets = pd.read_csv('postProcessedText.csv')

In [3]:
# Keep only the tweet text column. The name is rebound from a DataFrame to a
# Series, so the guard makes re-running this cell a no-op instead of an
# AttributeError ('Series' object has no attribute 'Tweet').
if isinstance(all_tweets, pd.DataFrame):
    all_tweets = all_tweets['Tweet']

In [4]:
# Load the labelled dataset. Per the outputs recorded below it has 3557 rows
# with a 'Tweet' column plus eleven emotion columns.
# NOTE(review): the emotion columns are presumably 0/1 indicators - confirm.
summarization_50 = pd.read_csv('data/summarized_50_percent.csv')

In [51]:
# Spot-check the text of a single tweet (index label 2).
summarization_50.Tweet[2]

'      I have never been more anxious of a election in my life  _emoj_double_exclamation_mark_ ufe0f '

In [5]:
# (rows, columns) of the labelled dataset.
summarization_50.shape

(3557, 12)

In [6]:
# Column names: the tweet text followed by the emotion label columns.
summarization_50.columns

Index(['Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
       'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [7]:
# Text column of the labelled dataset; this is what gets vectorized for training.
tweets = summarization_50['Tweet']

In [8]:
# Everything except the text column, i.e. the emotion label columns.
emotions = summarization_50.drop(columns='Tweet')

In [9]:
# Confirm that only the emotion label columns remain after dropping 'Tweet'.
emotions.columns

Index(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism',
       'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [10]:
# Word-level unigram TF-IDF with accent stripping and L2-normalised rows.
vectorizer = TfidfVectorizer(
    strip_accents="unicode",
    analyzer="word",
    ngram_range=(1, 1),
    norm="l2",
)

In [11]:
# Learn the vocabulary and IDF weights from the full corpus (all_tweets), not
# only from the labelled tweets that are transformed afterwards.
vectorizer.fit(all_tweets)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
# Project the labelled tweets onto the fitted vocabulary; this is the feature
# matrix passed to every model below.
vect_tweets = vectorizer.transform(tweets)

In [13]:
def get_vocab(vectorizer):
    """Return the fitted vectorizer's ``vocabulary_`` attribute unchanged.

    The same object is returned (no copy), so mutating the result mutates the
    vectorizer's own vocabulary.
    """
    term_to_index = vectorizer.vocabulary_
    return term_to_index

In [14]:
# term -> column-index mapping taken from the fitted vectorizer.
vocab = get_vocab(vectorizer)

In [15]:
# Invert to column-index -> term so feature indices can be mapped back to words.
# Rebuilt from the vectorizer rather than from `vocab` itself, so re-running
# this cell does not silently flip the mapping back to term -> index.
vocab = {index: term for term, index in get_vocab(vectorizer).items()}

In [16]:
# Single-step pipeline: a one-vs-rest wrapper around a shallow, seeded random
# forest (300 trees, depth capped at 5).
RF_pipeline = Pipeline(
    steps=[
        (
            'rf',
            OneVsRestClassifier(
                RandomForestClassifier(n_estimators=300, max_depth=5, random_state=0)
            ),
        ),
    ]
)


In [112]:
def stats_report(y_true, y_pred):
    """Print precision, recall and three F1 variants for a set of predictions.

    Precision, recall and the final "F1" line use ``average='weighted'``; the
    other two F1 lines use macro and micro averaging. Nothing is returned.
    """
    weighted_precision = precision_score(y_true, y_pred, average='weighted')
    weighted_recall = recall_score(y_true, y_pred, average='weighted')
    # Same metric under three averaging schemes, computed in print order.
    macro_f1, micro_f1, weighted_f1 = (
        f1_score(y_true, y_pred, average=scheme)
        for scheme in ('macro', 'micro', 'weighted')
    )
    template = "Precision: {} \n Recall: {} \n F1 macro: {} \n F1 micro: {} \n F1: {} \n"
    print(template.format(weighted_precision, weighted_recall, macro_f1, micro_f1, weighted_f1))

In [113]:
def pred_report(pipeline, X, y):
    """Fit ``pipeline`` on ``(X, y)``, print its metrics and return the predictions.

    The pipeline is fitted in place and then asked to predict on the same ``X``
    it was trained on, so the printed figures are training-set scores.

    Returns the predictions, matching the later redefinition of this function
    in the notebook; previously this version returned ``None``.
    """
    pipeline.fit(X, y)
    preds = pipeline.predict(X)
    stats_report(y, preds)
    return preds

In [114]:
def get_forest(pipeline, X, y):
    """Fit ``pipeline`` on ``(X, y)`` and return the trees of its first forest.

    Parameters
    ----------
    pipeline
        Pipeline with a step named ``'rf'`` whose fitted object exposes
        ``estimators_``; the first entry must itself expose ``estimators_``.
    X, y
        Training features and target, passed straight to ``pipeline.fit``.

    Returns
    -------
    The ``estimators_`` of ``pipeline.named_steps['rf'].estimators_[0]``.

    Notes
    -----
    The pipeline is refitted in place. Only the first per-label estimator is
    inspected; for a single target column that is presumably the only one -
    verify before passing a multi-column ``y``.
    """
    pipeline.fit(X, y)
    # Read from the pipeline that was passed in (and just fitted), not from the
    # module-level RF_pipeline, so the function works for any pipeline.
    forest = pipeline.named_steps['rf'].estimators_[0].estimators_
    return forest


In [115]:
def get_all_features(forest, vocab):
    """Collect every vocabulary term that at least one tree in ``forest`` used.

    Parameters
    ----------
    forest
        Iterable of fitted trees, each exposing ``feature_importances_``.
    vocab
        Mapping from feature (column) index to term.

    Returns
    -------
    set
        Terms whose importance is non-zero in at least one tree. Empty if
        ``forest`` is empty or no tree has a non-zero importance.
    """
    all_important_features = set()
    for tree in forest:
        importances = np.asarray(tree.feature_importances_)
        # The result is an unordered set, so ranking features by importance is
        # unnecessary; just pick the non-zero positions. tolist() yields plain
        # Python ints for the vocab lookup.
        used_indices = np.flatnonzero(importances).tolist()
        all_important_features.update(vocab[i] for i in used_indices)
    return all_important_features

## Anger

In [125]:
# Refit the shared RF_pipeline on the 'anger' label column and keep the
# individual trees of the resulting forest.
anger_forest = get_forest(RF_pipeline, vect_tweets, emotions['anger'])

In [126]:
# Terms with non-zero importance in at least one tree of the anger forest.
anger_features = get_all_features(anger_forest, vocab)

In [127]:
# Number of distinct terms the anger forest relied on.
len(anger_features)

1273

## Joy

In [128]:
# Same procedure for 'joy': refit the shared pipeline on that label column,
# then collect the terms its trees used.
joy_forest = get_forest(RF_pipeline, vect_tweets, emotions['joy'])
joy_features = get_all_features(joy_forest, vocab)

In [129]:
# Number of distinct terms the joy forest relied on.
len(joy_features)

1072

## Trust

In [130]:
# Refit the shared pipeline on the 'trust' label column, then collect the
# terms with non-zero importance in its trees.
trust_forest = get_forest(RF_pipeline, vect_tweets, emotions['trust'])
trust_features = get_all_features(trust_forest, vocab)

In [131]:
# Number of distinct terms the trust forest relied on.
len(trust_features)

899

## Pessimism

In [132]:
# Refit the shared pipeline on the 'pessimism' label column, then collect the
# terms with non-zero importance in its trees.
pessimism_forest = get_forest(RF_pipeline, vect_tweets, emotions['pessimism'])
pessimism_features = get_all_features(pessimism_forest, vocab)

In [133]:
# Number of distinct terms the pessimism forest relied on.
len(pessimism_features)

1181

## Optimism

In [134]:
# Refit the shared pipeline on the 'optimism' label column, then collect the
# terms with non-zero importance in its trees.
optimism_forest = get_forest(RF_pipeline, vect_tweets, emotions['optimism'])
optimism_features = get_all_features(optimism_forest, vocab)

In [135]:
# Number of distinct terms the optimism forest relied on.
len(optimism_features)

1217

## Fear

In [136]:
# Refit the shared pipeline on the 'fear' label column, then collect the
# terms with non-zero importance in its trees.
fear_forest = get_forest(RF_pipeline, vect_tweets, emotions['fear'])
fear_features = get_all_features(fear_forest, vocab)

In [137]:
# Number of distinct terms the fear forest relied on.
len(fear_features)

1214

## Disgust

In [138]:
# Refit the shared pipeline on the 'disgust' label column, then collect the
# terms with non-zero importance in its trees.
disgust_forest = get_forest(RF_pipeline, vect_tweets, emotions['disgust'])
disgust_features = get_all_features(disgust_forest, vocab)

In [139]:
# Number of distinct terms the disgust forest relied on.
len(disgust_features)

1338

## Love

In [140]:
# Refit the shared pipeline on the 'love' label column, then collect the
# terms with non-zero importance in its trees.
love_forest = get_forest(RF_pipeline, vect_tweets, emotions['love'])
love_features = get_all_features(love_forest, vocab)

In [141]:
# Number of distinct terms the love forest relied on.
len(love_features)

1165

## Sadness

In [142]:
# Refit the shared pipeline on the 'sadness' label column, then collect the
# terms with non-zero importance in its trees.
sadness_forest = get_forest(RF_pipeline, vect_tweets, emotions['sadness'])
sadness_features = get_all_features(sadness_forest, vocab)

In [143]:
# Number of distinct terms the sadness forest relied on.
len(sadness_features)

1272

## Surprise

In [145]:
# Refit the shared pipeline on the 'surprise' label column, then collect the
# terms with non-zero importance in its trees.
surprise_forest = get_forest(RF_pipeline, vect_tweets, emotions['surprise'])
surprise_features = get_all_features(surprise_forest, vocab)

In [146]:
# Number of distinct terms the surprise forest relied on.
len(surprise_features)

915

## Anticipation

In [147]:
# Refit the shared pipeline on the 'anticipation' label column, then collect
# the terms with non-zero importance in its trees.
anticipation_forest = get_forest(RF_pipeline, vect_tweets, emotions['anticipation'])
anticipation_features = get_all_features(anticipation_forest, vocab)

In [148]:
# Number of distinct terms the anticipation forest relied on.
len(anticipation_features)

1161

# Evaluation Metrics

In [151]:
def pred_report(pipeline, X, y):
    """Fit ``pipeline`` on ``(X, y)``, print its metrics and return the predictions.

    Predictions are made on the same ``X`` the pipeline was just fitted on, so
    the figures printed by ``stats_report`` are training-set scores.
    """
    pipeline.fit(X, y)
    predictions = pipeline.predict(X)
    stats_report(y, predictions)
    return predictions

In [153]:
# Fit on all emotion columns at once and score on the same training data.
# This refits RF_pipeline again.
# NOTE(review): every score in the recorded output is 0.0 - presumably the
# shallow forests never predict a positive label; verify before relying on
# these metrics.
preds = pred_report(RF_pipeline, vect_tweets, emotions)

Precision: 0.0 
 Recall: 0.0 
 F1 macro: 0.0 
 F1 micro: 0.0 
 F1: 0.0 



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
# Print every prediction row, one per line.
# NOTE(review): preds has one row per labelled tweet, so this produces a very
# long output; consider inspecting a slice such as preds[:10] instead.
for p in preds:
    print(p)