# Using Random Forest as the classifier!

In [27]:
import pandas as pd
import nltk
import numpy as np
from wordcloud import WordCloud, STOPWORDS
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from nltk import word_tokenize
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
import pickle

# Preprocessing!

In [None]:
def plt_count_freq(counts,title=None,_ylim=100000):
    """Plot the distinct frequency values found in ``counts``, largest first.

    counts : mapping of term -> frequency (only ``counts.values()`` is read).
    title  : optional plot title; omitted when ``None``.
    _ylim  : upper bound of the y axis (the lower bound is fixed at 0).

    Draws with the module-level ``plt`` (matplotlib.pyplot) and returns None.
    """
    # NOTE(review): set() collapses terms that share the same count, so the
    # x axis is the rank among *distinct* frequencies rather than one point
    # per term, although it is labelled 'Terms' -- confirm this is intended.
    freq = sorted(set(counts.values()), reverse=True)
    plt.plot(freq)
    plt.xlabel('Terms')
    plt.ylabel('freq count')
    plt.ylim(0, _ylim)
    if title is not None:
        plt.title(title)
    plt.show()


In [None]:
# Per-class token frequency tables built from the 'Description' text.

# Negative subset: concatenate every review, tokenize, then count tokens.
neg_words = ' '.join(dummy_neg['Description'])
neg_words_list = word_tokenize(neg_words)
neg_counts = Counter(neg_words_list)
neg_most_common = neg_counts.most_common()

# Positive subset: same three steps.
pos_words = ' '.join(dummy_pos['Description'])
pos_words_list = word_tokenize(pos_words)
pos_counts = Counter(pos_words_list)
pos_most_common = pos_counts.most_common()

In [None]:
# Frequency curves for the whole corpus and for each class; the per-class
# plots use a tighter y-axis limit than the function default.
plt_count_freq(counts, title='Corpus')
plt_count_freq(pos_counts, title='Positive words', _ylim=40000)
plt_count_freq(neg_counts, title='Negative Words', _ylim=20000)

In [None]:
def create_tf_high_stoplist(pos_counts,neg_counts,thresh=20000):
    """Return the words that are very frequent in BOTH classes.

    pos_counts, neg_counts : ``collections.Counter`` objects mapping
        word -> frequency for the positive and negative reviews.
    thresh : minimum frequency (inclusive) a word must reach in each
        counter to be included.

    Returns a list of the qualifying words, ordered as they appear in
    ``neg_counts.most_common()`` (most frequent negative word first).
    """
    # A set gives O(1) membership tests in the loop below.
    frequent_pos = set(
        word for word, freq in pos_counts.items() if freq >= thresh
    )
    stoplist = []
    # most_common() is sorted by descending frequency, so we can stop at the
    # first word that falls below the threshold.
    for word, freq in neg_counts.most_common():
        if freq < thresh:
            break
        if word in frequent_pos:
            stoplist.append(word)
    return stoplist
    

In [None]:
# Words occurring at least 7500 times in both the positive and the negative
# reviews are treated as uninformative.
tf_high_stoplist = create_tf_high_stoplist(pos_counts, neg_counts, thresh=7500)

In [4]:
def clean_data(inputDF, tf1 = False,stops_bool = False,tf_high=False, updates=None,*tf_high_args):
    """Load the TF1-cleaned corpus and build the bag-of-words vectorizers.

    inputDF       : not read by the current body.
    tf1           : when true, load ``TF1_cleaned_input.p`` and build the
                    vectorizers; when false the function does nothing.
    stops_bool    : start the stop-word set from wordcloud's ``STOPWORDS``
                    instead of an empty set.
    tf_high       : also add ``create_tf_high_stoplist(*tf_high_args)``.
    updates       : optional list of extra stop words.
    *tf_high_args : forwarded unchanged to ``create_tf_high_stoplist``.

    Returns None.

    NOTE(review): the loaded data and both vectorizers are local and are
    never returned or stored, so the function looks unfinished -- decide
    what it should hand back to the caller.
    """
    if not tf1:
        # No other cleaning mode is implemented.
        return

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files produced by this project.
    with open("TF1_cleaned_input.p", 'rb') as fh:
        tf1_cleaned_data = pickle.load(fh)

    # Both original branches added the same extras; only the starting set
    # depended on stops_bool.
    stops = set(STOPWORDS) if stops_bool else set()
    if updates:
        assert isinstance(updates, list)
        stops.update(updates)
    if tf_high:
        stops.update(create_tf_high_stoplist(*tf_high_args))

    # scikit-learn documents stop_words as 'english', a list or None, so
    # hand it a list rather than the set.
    stop_words = list(stops)
    count_vec = CountVectorizer(analyzer="word", stop_words=stop_words,
                                max_features=500)
    count_vec_ngrams = CountVectorizer(analyzer="word", stop_words=stop_words,
                                       ngram_range=(1, 2), max_features=500)
        


In [5]:
# Load the training data; later cells read its 'Description' and
# 'Is_Response' columns.
df = pd.read_csv('train.csv')

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [19]:
# Split the training rows by label. 'not happy' is the negative class;
# everything else is positive, matching how the 'sentiment' column is
# derived from 'Is_Response'.
# (Previously both frames selected the 'not happy' rows, so the positive
# and negative word counts were identical.)
dummy_neg = df[df['Is_Response'] == 'not happy']
dummy_pos = df[df['Is_Response'] != 'not happy']


In [8]:
# Binary target: 0 for the 'not happy' label, 1 for any other label.
df['sentiment'] = df['Is_Response'].apply(
    lambda label: 1 if label != 'not happy' else 0)

In [9]:
# Show the class balance of the binary target.
df['sentiment'].value_counts()

1    26521
0    12411
Name: sentiment, dtype: int64

In [10]:
# Model input (review text) and target (binary sentiment).
inputDF, outputDF = df['Description'], df['sentiment']

In [11]:
# Load the pre-cleaned review texts. The context manager closes the file
# even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open("TF1_cleaned_input.p", 'rb') as file:
    tf1_cleaned_data = pickle.load(file)


In [12]:
# Number of cleaned documents loaded from the pickle.
len(tf1_cleaned_data)

38932

In [13]:
# From here on the model input is the pre-cleaned text, replacing the raw
# 'Description' column assigned earlier.
inputDF = tf1_cleaned_data

In [14]:
# Concatenate every document into one space-separated string (tokenized
# later for corpus-wide counts).
words = ' '.join(inputDF)

In [15]:
# Hold-out split with a fixed seed; split sizes are train_test_split's
# defaults since none are passed.
x_train,x_test,y_train,y_test = train_test_split(inputDF,outputDF,random_state=42)

In [23]:
# Stop words = wordcloud's built-in list plus the high-frequency terms shared
# by both classes.
stops = set(STOPWORDS).union(tf_high_stoplist)

In [37]:
# Bag-of-words vectorizers: unigrams only, and unigrams + bigrams.
# scikit-learn documents stop_words as 'english', a list or None, and newer
# releases validate the type, so pass a list instead of the set.
count_vec = CountVectorizer(analyzer="word", stop_words=list(stops))
count_vec_ngrams = CountVectorizer(analyzer="word", ngram_range=(1, 2),
                                   stop_words=list(stops))

In [38]:
# Learn each vocabulary on the training split only and vectorize it.
train_features = count_vec.fit_transform(x_train)
train_features_grams = count_vec_ngrams.fit_transform(x_train)

In [39]:
# Vectorize the hold-out split with the vocabularies fitted on the training
# split (transform only, no refit).
test_features = count_vec.transform(x_test)
test_features_grams = count_vec_ngrams.transform(x_test)

# Random Forest

In [40]:
# Two random forests with the same fixed seed: `clf` is fitted on the unigram
# features below, `clf_grams` on the unigram+bigram features.
clf = RandomForestClassifier(random_state=42)
clf_grams = RandomForestClassifier(random_state=42)

In [41]:
# Fit on the unigram features.
clf.fit(train_features,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [42]:
# Score of the unigram model on the hold-out split.
clf.score(test_features,y_test)

0.81064420014384053

In [43]:
# Fit on the unigram+bigram features.
clf_grams.fit(train_features_grams,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [44]:
# Score of the unigram+bigram model on the hold-out split.
clf_grams.score(test_features_grams,y_test)

0.79605465940614406

## Decision Tree

In [75]:
# Same comparison with decision trees; `clf` and `clf_grams` are rebound.
# No random_state is passed here, unlike the random forests above.
clf = DecisionTreeClassifier()
clf_grams = DecisionTreeClassifier()
clf.fit(train_features,y_train)
clf_grams.fit(train_features_grams,y_train)
# The n-gram score is printed first, then the unigram score.
print(clf_grams.score(test_features_grams,y_test))
print(clf.score(test_features,y_test))

0.756087537244
0.747970820919


## MultinomialNB

In [76]:
# Same comparison with multinomial naive Bayes on the count features.
clf = MultinomialNB()
clf_grams = MultinomialNB()
clf_grams.fit(train_features_grams,y_train)
clf.fit(train_features,y_train)
# The n-gram score is printed; the unigram score is the cell's displayed value.
print(clf_grams.score(test_features_grams,y_test))
clf.score(test_features,y_test)

0.870440768519


0.86098839001335659

In [77]:
# Switch to TF-IDF features. This rebinds train_features / test_features,
# replacing the count-based matrices. No stop_words are passed here.
TF = TfidfVectorizer()
train_features = TF.fit_transform(x_train)
test_features = TF.transform(x_test)


In [78]:
# Decision tree on whatever train_features/test_features currently hold
# (the TF-IDF matrices if the preceding cell was run).
clf = DecisionTreeClassifier()
clf.fit(train_features,y_train)
clf.score(test_features,y_test)

0.74190896948525631

In [79]:
# Multinomial naive Bayes on the same train_features/test_features.
clf = MultinomialNB()
clf.fit(train_features,y_train)
clf.score(test_features,y_test)

0.82862426795438204

In [130]:
# Tokenize the concatenated corpus string.
list_words = word_tokenize(words)

In [139]:
# Corpus-wide token frequencies (passed to plt_count_freq for the 'Corpus' plot).
counts = Counter(list_words)

In [177]:
# Resets the high-frequency stoplist to empty.
# NOTE(review): if this runs after the create_tf_high_stoplist cell it
# discards that result -- cell execution order matters; confirm intent.
tf_high_stoplist = []

### Both classes share a set of words whose frequencies are higher than 20,000, so we build a stoplist from those common terms.

## Random Forest

In [45]:
# Refit the unigram vectorizer on the full cleaned corpus (no hold-out) for
# the final model.
train_features = count_vec.fit_transform(inputDF)

In [47]:
# (documents, vocabulary size) of the full training matrix.
train_features.shape

(38932, 24081)

In [49]:
# Load the test data and take its review text column.
testDF = pd.read_csv('test.csv')
testing_df = testDF['Description']

In [50]:
# Vectorize the test text with the vocabulary fitted on the training corpus.
# NOTE(review): the training input is the pre-cleaned text from
# TF1_cleaned_input.p, while testing_df is the raw 'Description' column --
# confirm the test text needs no equivalent cleaning.
test_features = count_vec.transform(testing_df)

In [52]:
# Use the labels of every training row, replacing the y_train produced by
# the earlier train/test split.
y_train = df['sentiment']

In [53]:
# Final model trained on all rows. No random_state is passed here, unlike
# the earlier random forest cell.
clf = RandomForestClassifier()
clf.fit(train_features,y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [54]:
# Predicted 0/1 sentiment for every test row.
prediction = clf.predict(test_features)


In [55]:
# Number of predictions produced.
prediction.shape

(29404,)

In [56]:
# Attach the numeric predictions, then map them to the submission labels
# (0 -> 'not_happy', anything else -> 'happy').
testDF['Is_Response'] = prediction
testDF['Is_Response'] = testDF['Is_Response'].apply(
    lambda label: 'happy' if label != 0 else 'not_happy')

# Write only the id and the predicted label, without the index column.
submissionDF = testDF[['User_ID', 'Is_Response']]
submissionDF.to_csv(
    'submissions/submission_4.csv',
    columns=['User_ID', 'Is_Response'],
    index=False)

# 80.878 % accuracy with RandomForestClassifier.