In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

In [28]:
# Load the table stored in postProcessedText.csv; its 'Tweet' column is what
# the TF-IDF vectorizer is later fitted on.
all_tweets = pd.read_csv('postProcessedText.csv')

In [30]:
# Keep only the tweet text. The guard makes this cell idempotent: once
# `all_tweets` has been reduced to a Series, re-running the cell is a no-op
# instead of an AttributeError.
if isinstance(all_tweets, pd.DataFrame):
    all_tweets = all_tweets['Tweet']

In [7]:
# Load the labelled table: a 'Tweet' text column plus one column per emotion.
summarization_50 = pd.read_csv('data/summarized_50_percent.csv')

In [8]:
# Sanity check: (rows, columns) of the loaded table.
summarization_50.shape

(3557, 12)

In [9]:
# Column names: the tweet text followed by the emotion label columns.
summarization_50.columns

Index(['Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
       'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [10]:
# Text column of the labelled table; this is what gets vectorized into features.
tweets = summarization_50['Tweet']

In [16]:
# Everything except the text column, i.e. one label column per emotion.
emotions = summarization_50.drop(labels='Tweet', axis='columns')

In [17]:
# Confirm that only the emotion label columns remain after dropping 'Tweet'.
emotions.columns

Index(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism',
       'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [22]:
# TF-IDF over single-word tokens (unigrams only), with unicode accent
# stripping and L2 normalisation.
vectorizer = TfidfVectorizer(
    strip_accents="unicode",
    analyzer="word",
    ngram_range=(1, 1),
    norm="l2",
)

In [31]:
# The vectorizer is fitted on `all_tweets` (from postProcessedText.csv), not
# on the `tweets` column of summarization_50 that is transformed afterwards.
vectorizer.fit(all_tweets)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [32]:
# Vectorize the labelled tweets with the already-fitted vectorizer; this is
# the feature matrix handed to get_forest as X.
vect_tweets = vectorizer.transform(tweets)

In [35]:
def get_vocab(vectorizer):
    """Return the ``vocabulary_`` attribute of ``vectorizer``.

    The object is returned as-is (no copy), so mutating the result mutates
    the vectorizer's own vocabulary.
    """
    fitted_vocabulary = vectorizer.vocabulary_
    return fitted_vocabulary

In [36]:
# Vocabulary taken from the fitted vectorizer's `vocabulary_` attribute.
vocab = get_vocab(vectorizer)

In [54]:
# Invert the vectorizer's term -> feature-index mapping into
# feature-index -> term, which is the form get_all_features indexes into.
# Built from the fitted vectorizer rather than from `vocab` itself, so
# re-running this cell cannot flip the mapping back.
vocab = {index: term for term, index in get_vocab(vectorizer).items()}

In [40]:
# Single-step pipeline: a one-vs-rest wrapper around a random forest.
# The step must be called 'rf' because get_forest looks it up under that key.
RF_pipeline = Pipeline(
    steps=[
        (
            'rf',
            OneVsRestClassifier(
                RandomForestClassifier(n_estimators=300, max_depth=5, random_state=0)
            ),
        ),
    ]
)


In [41]:
def get_forest(pipeline, X, y):
    """Fit ``pipeline`` on ``(X, y)`` and return the trees of its first forest.

    Parameters
    ----------
    pipeline : pipeline-like object
        Must expose ``fit(X, y)`` and a ``named_steps`` mapping containing a
        step called ``'rf'``. After fitting, that step must expose
        ``estimators_`` whose first element itself has ``estimators_``.
        It is fitted in place.
    X : object
        Features, forwarded unchanged to ``pipeline.fit``.
    y : object
        Target, forwarded unchanged to ``pipeline.fit``.

    Returns
    -------
    The ``estimators_`` attribute of the first estimator held by the ``'rf'``
    step. Only index 0 is inspected; any further fitted estimators are ignored.
    """
    pipeline.fit(X, y)
    # Read the trees from the pipeline that was just fitted (the argument),
    # not from a notebook-level global, so the function works for any pipeline.
    one_vs_rest = pipeline.named_steps['rf']
    first_forest = one_vs_rest.estimators_[0]
    return first_forest.estimators_


In [55]:
def get_all_features(forest, vocab):
    """Collect every feature with non-zero importance in at least one tree.

    Parameters
    ----------
    forest : iterable
        Objects exposing a ``feature_importances_`` sequence (one value per
        feature index), e.g. the trees returned by ``get_forest``.
    vocab : mapping or sequence
        Looked up as ``vocab[index]`` to turn a feature index into the value
        stored in the result (here: index -> vocabulary term).

    Returns
    -------
    set
        The ``vocab`` entries of all feature indices whose importance is
        non-zero in any tree. Empty if ``forest`` is empty or every importance
        is zero.
    """
    all_important_features = set()
    for tree in forest:
        importances = np.asarray(tree.feature_importances_)
        # The result is an unordered set, so ranking the importances is
        # unnecessary: select the non-zero positions directly.
        # ``tolist()`` yields plain Python ints for the ``vocab`` lookup.
        for index in np.flatnonzero(importances).tolist():
            all_important_features.add(vocab[index])
    return all_important_features

## Anger

In [56]:
# Fit the shared pipeline on the 'anger' labels and keep the resulting trees.
anger_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['anger'])

In [57]:
# Terms with non-zero importance in at least one tree of the anger forest.
anger_features = get_all_features(forest=anger_forest, vocab=vocab)

In [59]:
# Number of distinct features collected for anger.
len(anger_features)

1273

## Joy

In [63]:
# Fit the shared pipeline on the 'joy' labels, then collect every term with
# non-zero importance in at least one tree of the fitted forest.
joy_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['joy'])
joy_features = get_all_features(forest=joy_forest, vocab=vocab)

In [64]:
# Number of distinct features collected for joy.
len(joy_features)

1072

## Trust

In [65]:
# Fit the shared pipeline on the 'trust' labels, then collect every term with
# non-zero importance in at least one tree of the fitted forest.
trust_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['trust'])
trust_features = get_all_features(forest=trust_forest, vocab=vocab)

In [66]:
# Number of distinct features collected for trust.
len(trust_features)

899

## Pessimism

In [67]:
# Fit the shared pipeline on the 'pessimism' labels, then collect every term
# with non-zero importance in at least one tree of the fitted forest.
pessimism_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['pessimism'])
pessimism_features = get_all_features(forest=pessimism_forest, vocab=vocab)

In [68]:
# Number of distinct features collected for pessimism.
len(pessimism_features)

1181

## Optimism

In [69]:
# Fit the shared pipeline on the 'optimism' labels, then collect every term
# with non-zero importance in at least one tree of the fitted forest.
optimism_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['optimism'])
optimism_features = get_all_features(forest=optimism_forest, vocab=vocab)

In [70]:
# Number of distinct features collected for optimism.
len(optimism_features)

1217

## Fear

In [71]:
# Fit the shared pipeline on the 'fear' labels, then collect every term with
# non-zero importance in at least one tree of the fitted forest.
fear_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['fear'])
fear_features = get_all_features(forest=fear_forest, vocab=vocab)

In [72]:
# Number of distinct features collected for fear.
len(fear_features)

1214

## Disgust

In [73]:
# Fit the shared pipeline on the 'disgust' labels, then collect every term
# with non-zero importance in at least one tree of the fitted forest.
disgust_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['disgust'])
disgust_features = get_all_features(forest=disgust_forest, vocab=vocab)

In [74]:
# Number of distinct features collected for disgust.
len(disgust_features)

1338

## Love

In [75]:
# Fit the shared pipeline on the 'love' labels, then collect every term with
# non-zero importance in at least one tree of the fitted forest.
love_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['love'])
love_features = get_all_features(forest=love_forest, vocab=vocab)

In [76]:
# Number of distinct features collected for love.
len(love_features)

1165

## Sadness

In [80]:
# Fit the shared pipeline on the 'sadness' labels, then collect every term
# with non-zero importance in at least one tree of the fitted forest.
sadness_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['sadness'])
sadness_features = get_all_features(forest=sadness_forest, vocab=vocab)

In [81]:
# Number of distinct features collected for sadness.
len(sadness_features)

1272

## Surprise

In [82]:
# Fit the shared pipeline on the 'surprise' labels, then collect every term
# with non-zero importance in at least one tree of the fitted forest.
surprise_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['surprise'])
surprise_features = get_all_features(forest=surprise_forest, vocab=vocab)

In [83]:
# Number of distinct features collected for surprise.
len(surprise_features)

915

## Anticipation

In [84]:
# Fit the shared pipeline on the 'anticipation' labels, then collect every
# term with non-zero importance in at least one tree of the fitted forest.
anticipation_forest = get_forest(pipeline=RF_pipeline, X=vect_tweets, y=emotions['anticipation'])
anticipation_features = get_all_features(forest=anticipation_forest, vocab=vocab)

In [85]:
# Number of distinct features collected for anticipation.
len(anticipation_features)

1161