In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import scipy.stats as st
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'



In [2]:
#importing data
data = pd.read_table('/home/sanju/Downloads/Aspect_Classification/aspect_annoated_file.txt',header = None)
data.head()

Unnamed: 0,0,1
0,Judging,NASP
1,from,NASP
2,previous,NASP
3,posts,NASP
4,this,NASP


In [3]:
#renaming columns
data.columns = ['Word','Aspect']
data.loc[data.Aspect == 'NASP','Aspect'] = 0
data.loc[data.Aspect == 'ASP','Aspect'] = 1
data.head()

Unnamed: 0,Word,Aspect
0,Judging,0
1,from,0
2,previous,0
3,posts,0
4,this,0


In [4]:
#converting data type of Aspect to int so that can be used as feature ir target later
data.Aspect = data.Aspect.apply(pd.to_numeric, errors='ignore')
data.dtypes

Word      object
Aspect     int64
dtype: object

In [5]:
# importing stopwords
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
stopset = set(stoplist)

In [6]:
stop = []
for i in range(0,len(data.Word)):
    if data.Word[i] in stopset:
        stop.append(True)
    else:
        stop.append(False)
# adding a new column to dataset which tells if a word belongs to list of stopwords or not
data['StopWords'] = pd.Series(stop)

In [7]:
data.head()

Unnamed: 0,Word,Aspect,StopWords
0,Judging,0,False
1,from,0,True
2,previous,0,False
3,posts,0,False
4,this,0,True


In [8]:
data.shape

(13845, 3)

In [9]:
# tagging the appropriate parts of speech
import nltk
from nltk import pos_tag
pos_pist = pos_tag(data.Word)
print pos_pist[:5]
pos_list = []
for i in range(0,len(pos_pist)):
    pos_list.append(pos_pist[i][1])
print pos_list[:5]

[('Judging', 'VBG'), ('from', 'IN'), ('previous', 'JJ'), ('posts', 'NNS'), ('this', 'DT')]
['VBG', 'IN', 'JJ', 'NNS', 'DT']


In [10]:
# appending POS as a column
data['POS'] = pd.Series(pos_list)

In [11]:
data.head(10)

Unnamed: 0,Word,Aspect,StopWords,POS
0,Judging,0,False,VBG
1,from,0,True,IN
2,previous,0,False,JJ
3,posts,0,False,NNS
4,this,0,True,DT
5,used,0,False,VBN
6,to,0,True,TO
7,be,0,True,VB
8,a,0,True,DT
9,good,0,False,JJ


In [15]:
# named entity tagging for each word, very useful as a feature for classification later.
from nltk import ne_chunk
ner_pist = ne_chunk(pos_pist)
ner_list = []
for i in range(0,len(ner_pist)):
    if isinstance(ner_pist[i],nltk.Tree):
        ner_list.append(ner_pist[i].label())
    else:
        ner_list.append(0)
print ner_list[:5]

[0, 0, 0, 0, 0]


In [16]:
# appending NER as a column
data['NER'] = pd.Series(ner_list)

In [17]:
# peek at how aspects relate to the present features/columns
data.groupby('Aspect').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,NER,POS,StopWords,Word
Aspect,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,count,12394,12477,12477,12477
0,unique,6,38,2,2328
0,top,0,DT,False,.
0,freq,12071,1339,7501,998
1,count,1356,1368,1368,1368
1,unique,4,15,1,232
1,top,0,NN,False,food
1,freq,1319,1159,1368,129


In [19]:
# counts of top pos_tags
data.POS.value_counts().head()

NN    1985
DT    1339
JJ    1334
.     1125
IN    1096
Name: POS, dtype: int64

In [20]:
# some rare pos_tags will add noise to model better to remove them
unwanted_pos = ['CD','JJS','WDT',':','RP',')','WRB','JJR','(','WP','EX','PDT','RBR','RBS','UH','FW','NNPS','/''']

In [22]:
useful = []
for i in range(len(data)):
    useful.append(data.POS[i] not in unwanted_pos)
data['useful'] = pd.Series(useful)
new_data = pd.DataFrame(data[data.useful == True])

In [23]:
print new_data.shape
print data.shape

(13312, 6)
(13845, 6)


In [35]:
# getting dummy variables for POS
dummies = pd.get_dummies(new_data.POS)
new_data = pd.concat([new_data, dummies], axis=1)
new_data.head()

Unnamed: 0,Word,Aspect,StopWords,POS,NER,useful,'',",",.,CC,...,PRP,PRP$,RB,TO,VB,VBD,VBG,VBN,VBP,VBZ
0,Judging,0,False,VBG,0,True,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,from,0,True,IN,0,True,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,previous,0,False,JJ,0,True,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,posts,0,False,NNS,0,True,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,this,0,True,DT,0,True,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
new_data.NER.value_counts()

0               12874
PERSON            151
GPE               115
ORGANIZATION       81
FACILITY            2
LOCATION            1
Name: NER, dtype: int64

In [40]:
# getting dummy variables for NER
dummies = pd.get_dummies(new_data.NER)
new_data = pd.concat([new_data, dummies], axis=1)
new_data.head()

Unnamed: 0,Word,Aspect,StopWords,POS,NER,useful,'',",",.,CC,...,GPE,LOCATION,ORGANIZATION,PERSON,0,FACILITY,GPE.1,LOCATION.1,ORGANIZATION.1,PERSON.1
0,Judging,0,False,VBG,0,True,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,from,0,True,IN,0,True,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,previous,0,False,JJ,0,True,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,posts,0,False,NNS,0,True,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,this,0,True,DT,0,True,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [57]:
# all the columns i.e features, target 
list(new_data.columns)

['Word',
 'Aspect',
 'StopWords',
 'POS',
 'NER',
 'useful',
 "''",
 ',',
 '.',
 'CC',
 'DT',
 'IN',
 'JJ',
 'MD',
 'NN',
 'NNP',
 'NNS',
 'PRP',
 'PRP$',
 'RB',
 'TO',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 "''",
 ',',
 '.',
 'CC',
 'DT',
 'IN',
 'JJ',
 'MD',
 'NN',
 'NNP',
 'NNS',
 'PRP',
 'PRP$',
 'RB',
 'TO',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 0,
 'FACILITY',
 'GPE',
 'LOCATION',
 'ORGANIZATION',
 'PERSON',
 0,
 'FACILITY',
 'GPE',
 'LOCATION',
 'ORGANIZATION',
 'PERSON']

In [58]:
# X contains the data for features, Y contains for target
X = new_data[['StopWords','CC','DT','IN','JJ','MD','NN','NNP','NNS','PRP','PRP$','RB','TO','VB','VBD','VBG','VBN','VBP','VBZ','FACILITY','GPE','LOCATION','ORGANIZATION','PERSON']].values
Y = new_data['Aspect']

In [59]:
# importing classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import cross_val_score
from sklearn import svm

In [60]:
# initializing classifiers
clf = GaussianNB()
svm = svm.SVC(kernel='linear', probability=True)

In [61]:
# train test split
from sklearn.cross_validation import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25)

In [62]:
# fitting and predicting
svm.fit(X_train,Y_train)
Y_pred = svm.predict(X_test)
from sklearn.metrics import accuracy_score
print accuracy_score(Y_pred,Y_test)

0.92578125


In [63]:
clf.fit(X_train,Y_train)
Y_pred_nb = clf.predict(X_test)
from sklearn.metrics import accuracy_score
print accuracy_score(Y_pred_nb,Y_test)

0.608774038462


In [64]:
# precision recall and F1 score.
from sklearn.metrics import classification_report
print(classification_report(Y_test, 
                            Y_pred, 
                            target_names=['Not Aspect', 'Aspect']))

             precision    recall  f1-score   support

 Not Aspect       0.98      0.94      0.96      2996
     Aspect       0.59      0.84      0.69       332

avg / total       0.94      0.93      0.93      3328



In [65]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, 
                            Y_pred_nb, 
                            target_names=['Not Aspect', 'Aspect']))

             precision    recall  f1-score   support

 Not Aspect       1.00      0.57      0.72      2996
     Aspect       0.20      0.99      0.34       332

avg / total       0.92      0.61      0.68      3328

