In [1]:
# analysis using FastText
# fasttext is used for supervised classification

In [2]:
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

In [3]:
from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'



In [4]:
import fasttext

In [5]:
#this will be supervised so lets divide the data into train and test. 

In [6]:
sentence_df=pd.read_csv("sentence_df.csv")

In [7]:
#splitting dataframe
msk = np.random.rand(len(sentence_df)) < 0.7
train=sentence_df[msk]
test=sentence_df[~msk]

In [8]:
train['sentiment'].value_counts()

1    1057
0    1056
Name: sentiment, dtype: int64

In [9]:
test['sentiment'].value_counts()

0    444
1    443
Name: sentiment, dtype: int64

In [10]:
train=train.reset_index(drop=True)
test=test.reset_index(drop=True)

In [11]:
#sentiment is a target variable

In [12]:
indon=open("train.txt","w")
n=train.shape[0]
for i in range(n):
    indon.write(train.ix[i,'Sentence']+" "+"__label__"+str(train.ix[i,'sentiment']))
    indon.write("\n")
indon.close()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [13]:
test_list=test['Sentence'].tolist()

In [14]:
classifier = fasttext.supervised('train.txt', 'model')

In [15]:
#predicting labels
labels = classifier.predict(test_list)
print("count of negative sentences")
labels.count([u'0'])

count of negative sentences


451

In [16]:
print("count of positive sentences")
labels.count([u'1'])

count of positive sentences


436

In [17]:
test['predicted']=labels

In [18]:
test['predicted']=test['predicted'].apply(lambda x: int(x[0]))

In [19]:
from sklearn.metrics import classification_report
print (classification_report(test['sentiment'],test['predicted']))

             precision    recall  f1-score   support

          0       0.65      0.66      0.66       444
          1       0.66      0.65      0.65       443

avg / total       0.65      0.65      0.65       887



In [20]:
import re

In [21]:
#Preprocessing and classification
train['Sentence']=train['Sentence'].astype(str)
train['Sentence']=train['Sentence'].apply(lambda x: re.sub(ur"[^\w\d'\s]+",'',x))
train['Sentence']=train['Sentence'].apply(lambda x: x.lower())

In [22]:
indon=open("train_new.txt","w")
n=train.shape[0]
for i in range(n):
    indon.write(train.ix[i,'Sentence']+" "+"__label__"+str(train.ix[i,'sentiment']))
    indon.write("\n")
indon.close()

In [23]:
test['Sentence']=test['Sentence'].astype(str)
test['Sentence']=test['Sentence'].apply(lambda x: re.sub(ur"[^\w\d'\s]+",'',x))
test['Sentence']=test['Sentence'].apply(lambda x: x.lower())

In [24]:
test_list=test['Sentence'].tolist()

In [25]:
classifier = fasttext.supervised('train_new.txt', 'model')

In [26]:
#predicting labels
labels = classifier.predict(test_list)
print("count of negative sentences")
labels.count([u'0'])

count of negative sentences


419

In [27]:
print("count of positive sentences")
labels.count([u'1'])

count of positive sentences


468

In [28]:
test['predicted']=labels

In [29]:
test['predicted']=test['predicted'].apply(lambda x: int(x[0]))

In [30]:
from sklearn.metrics import classification_report
print (classification_report(test['sentiment'],test['predicted']))

             precision    recall  f1-score   support

          0       0.71      0.67      0.69       444
          1       0.69      0.72      0.70       443

avg / total       0.70      0.70      0.70       887



In [31]:
#Preprocessing helped