In [138]:
import pandas as pd
import seaborn as sns
import numpy as np

In [139]:
# Toy corpus of hotel reviews. Each record is (review text, label); the
# labels used are 'p' and 'n'. Columns are left unnamed at this point.
labelled_reviews = [
    ('Awesome food', 'p'),
    ('Pathatic service. Not worth at all.', 'n'),
    ('Very good service. Fantanstic Gym', 'p'),
    ('I like everything about it', 'p'),
    ('Bad room service. Costly food. Unfriendly staff.', 'n'),
    ('Untidy room. Bad room service.', 'n'),
    ('Worst food. Unfriendly staff.', 'n'),
]
df = pd.DataFrame(labelled_reviews)

In [140]:
# Name the two columns: the review text and its label.
df.columns = ['review', 'type']

In [141]:
# Display the labelled review table (only 7 rows, so showing it in full is fine).
df

Unnamed: 0,review,type
0,Awesome food,p
1,Pathatic service. Not worth at all.,n
2,Very good service. Fantanstic Gym,p
3,I like everything about it,p
4,Bad room service. Costly food. Unfriendly staff.,n
5,Untidy room. Bad room service.,n
6,Worst food. Unfriendly staff.,n


In [142]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

In [143]:
# X holds the raw review text, y the matching 'p'/'n' label.
X, y = df['review'], df['type']

In [144]:
# TF-IDF = term frequency times inverse document frequency.
# Only single words (unigrams) are used and English stop words are removed.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
# Learn the vocabulary / IDF weights and vectorise the same reviews in one call
# (equivalent to fit(X) followed by transform(X)).
features = tfidf.fit_transform(X)

In [145]:
# Shape of the TF-IDF matrix produced by tfidf.transform(X).
features.shape

(7, 16)

In [146]:
# Convert the vectoriser output to a dense array and wrap it in a DataFrame.
# Columns are positional (0..n-1) until they are renamed in a later cell.
df_features = pd.DataFrame(features.toarray())

In [147]:
# Vocabulary terms, in the same order as the columns of `features`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and only fall back on old installations.
# list(...) keeps column_names a plain list, as the old method returned.
if hasattr(tfidf, 'get_feature_names_out'):
    column_names = list(tfidf.get_feature_names_out())
else:
    column_names = tfidf.get_feature_names()

In [148]:
# Label each TF-IDF column with the vocabulary term it represents.
df_features.columns = column_names

In [149]:
# Append the label as a final 'type' column. The assignment aligns on the row
# index; both objects are expected to carry the default 0..n-1 index here.
# NOTE: this mutates df_features in place, so re-running earlier cells resets it.
df_features['type'] = y

In [150]:
# Display the per-review TF-IDF weights together with the 'type' label column.
df_features

Unnamed: 0,awesome,bad,costly,fantanstic,food,good,gym,like,pathatic,room,service,staff,unfriendly,untidy,worst,worth,type
0,0.815564,0.0,0.0,0.0,0.578667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,p
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.648275,0.0,0.39935,0.0,0.0,0.0,0.0,0.648275,n
2,0.0,0.0,0.0,0.54397,0.0,0.54397,0.54397,0.0,0.0,0.0,0.335096,0.0,0.0,0.0,0.0,0.0,p
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,p
4,0.0,0.385396,0.464284,0.0,0.329424,0.0,0.0,0.0,0.0,0.385396,0.286008,0.385396,0.385396,0.0,0.0,0.0,n
5,0.0,0.37791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75582,0.280453,0.0,0.0,0.455266,0.0,0.0,n
6,0.0,0.0,0.0,0.0,0.417984,0.0,0.0,0.0,0.0,0.0,0.0,0.489004,0.489004,0.0,0.5891,0.0,n


In [151]:
# Split the TF-IDF rows by label ('p' vs 'n'), then drop the label column so
# each resulting frame contains only the term-weight columns.
df_positives = df_features[df_features['type'] == 'p'].drop('type', axis=1)
df_negatives = df_features[df_features['type'] == 'n'].drop('type', axis=1)

In [152]:
# Display the TF-IDF rows of the reviews labelled 'p' (label column removed).
df_positives

Unnamed: 0,awesome,bad,costly,fantanstic,food,good,gym,like,pathatic,room,service,staff,unfriendly,untidy,worst,worth
0,0.815564,0.0,0.0,0.0,0.578667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.54397,0.0,0.54397,0.54397,0.0,0.0,0.0,0.335096,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
# Print every (term, weight) pair in the 'p' reviews whose TF-IDF weight is
# above 0.5. The 'type' label column was already dropped when df_positives was
# built, so every column visited here is a term weight.
# The term name is taken from the row's own column label rather than from a
# positional lookup in a separate list, so the two cannot drift out of sync.
for _, weights in df_positives.iterrows():
    for term, weight in weights.items():
        if weight > .5:
            print(term, weight)

awesome 0.8155639275319199
food 0.5786669855009094
fantanstic 0.5439701462793114
good 0.5439701462793114
gym 0.5439701462793114
like 1.0


In [154]:
# Print every (term, weight) pair in the 'n' reviews whose TF-IDF weight is
# above 0.5. Non-float cells are skipped before the comparison.
for _, weights in df_negatives.iterrows():
    for term, weight in zip(column_names, weights):
        if not isinstance(weight, float):
            continue
        if weight > .5:
            print(term, weight)

pathatic 0.6482745601805321
worth 0.6482745601805321
room 0.7558196722753507
worst 0.5891004391952713
