In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split,cross_val_predict,cross_val_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.pipeline import Pipeline,FeatureUnion

from sklearn.base import BaseEstimator,TransformerMixin

In [3]:
# Load the labelled sentences from the 'data' sheet of the workbook.
# `sheet_name` is the current keyword; the older `sheetname` spelling was
# deprecated and is rejected by newer pandas releases.
df = pd.read_excel('Whisper data.xlsx', sheet_name='data')

# Shuffle the rows with a fixed seed so the row order is reproducible,
# then renumber the index 0..n-1 (no in-place mutation).
np.random.seed(seed=1)
df = df.iloc[np.random.permutation(len(df))].reset_index(drop=True)

# Model input: the raw sentence text.
features = df['Sentence']
# Model output: the last five columns (one 0/1 indicator per label).
# `.iloc` selects by position, which is what the removed `.ix` indexer did here.
targets = df.iloc[:, -5:]

df.head()

Unnamed: 0,Sentence,Pets,Secrets,Happy,Sad,Seasonal
0,"Has anyone seen ""life is beautiful""?",0,0,1,0,0
1,Does anyone want to chat,0,0,0,0,0
2,My heart is banging like a drum. My head is wh...,0,1,0,1,0
3,"Feelings are overrated you guys, we're all goi...",0,0,1,0,0
4,Anyone in Auckland got some green to share? F19,0,0,0,0,0


In [204]:
# Baseline model: bag-of-words counts (English stop words removed) fed into a
# random forest with a fixed random_state for repeatable results.
pipe = Pipeline([('CountVec', CountVectorizer(stop_words='english')),
                 ('model', RandomForestClassifier(random_state=1))])

# Names of the five label columns, in column order.
label_names = list(df.columns[-5:])

# Cross-validated predictions: each row is predicted by a model fitted on
# folds that do not contain it.
pred = cross_val_predict(pipe, features, targets)

# print(...) with a single argument behaves identically on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(classification_report(targets, pred, target_names=label_names))

# Refit on the full data set, then spot-check two hand-written sentences.
pipe.fit(features, targets)
sample = ['I love my life', 'I hate my life']
sample_predictions = pipe.predict(pd.Series(sample))

# Show the predicted label indicators next to the sentence they belong to.
tempdf = pd.DataFrame(sample_predictions, columns=label_names)
tempdf['Sample Sentence'] = sample
tempdf

             precision    recall  f1-score   support

       Pets       1.00      0.10      0.18        10
    Secrets       0.90      0.15      0.26        59
      Happy       0.00      0.00      0.00        32
        Sad       0.33      0.03      0.05        36
   Seasonal       1.00      0.20      0.33        10

avg / total       0.58      0.09      0.15       147



Unnamed: 0,Pets,Secrets,Happy,Sad,Seasonal,Sample Sentence
0,0.0,1.0,1.0,0.0,0.0,I love my life
1,0.0,1.0,0.0,1.0,0.0,I hate my life


In [111]:
class AddWordCount(BaseEstimator,TransformerMixin):
    """Stateless transformer that turns each sentence into its word count.

    Intended for use inside a Pipeline/FeatureUnion: `fit` learns nothing and
    `transform` returns one numeric column holding the number of words in
    each input sentence.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def CountWords(self,sentence):
        """Return the number of whitespace-separated words in `sentence`.

        `split()` with no argument splits on any run of whitespace and
        ignores leading/trailing whitespace, so repeated spaces, tabs and
        newlines are not counted as extra words and an empty string counts
        as zero words.
        """
        return len(sentence.split())

    def fit(self,df,y=None):
        """Nothing to learn; return self so the transformer can be chained."""
        return self

    def transform(self,series,y=None):
        """Return an (n_samples, 1) integer array of per-sentence word counts.

        `series` may be any iterable of strings (pandas Series, list, tuple,
        ...); `y` is accepted for API compatibility and ignored.
        """
        counts = [self.CountWords(sentence) for sentence in series]
        # Explicit dtype keeps the result integral even for empty input.
        return np.array(counts, dtype=np.int64).reshape(-1, 1)

In [203]:
# Second model: concatenate the word-count feature with TF-IDF weights
# (English stop words removed) and feed the combined matrix to a random forest.
pipe = Pipeline([('getFeatures', FeatureUnion([('addWordCount', AddWordCount()),
                                               ('CountVec', TfidfVectorizer(stop_words='english'))])),
                 ('model', RandomForestClassifier(random_state=1))])

# Names of the five label columns, in column order.
label_names = list(df.columns[-5:])

# Cross-validated predictions: each row is predicted by a model fitted on
# folds that do not contain it.
pred = cross_val_predict(pipe, features, targets)

# print(...) with a single argument behaves identically on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(classification_report(targets, pred, target_names=label_names))

# Refit on the full data set, then spot-check two hand-written sentences.
pipe.fit(features, targets)
sample = ['I love my life', 'I hate my life']
sample_predictions = pipe.predict(pd.Series(sample))

# Show the predicted label indicators next to the sentence they belong to.
tempdf = pd.DataFrame(sample_predictions, columns=label_names)
tempdf['Sample Sentence'] = sample
tempdf

             precision    recall  f1-score   support

       Pets       1.00      0.10      0.18        10
    Secrets       0.80      0.73      0.76        59
      Happy       0.50      0.03      0.06        32
        Sad       0.32      0.33      0.33        36
   Seasonal       0.00      0.00      0.00        10

avg / total       0.58      0.39      0.41       147



Unnamed: 0,Pets,Secrets,Happy,Sad,Seasonal,Sample Sentence
0,0.0,1.0,0.0,0.0,0.0,I love my life
1,0.0,0.0,0.0,0.0,0.0,I hate my life
