In [24]:
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP (NLTK + VADER sentiment analyzer)
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20, so importing it fails on any current install. Prefer `model_selection`
# and fall back to the old location only for very old environments.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split


In [25]:
# Load the raw review data into a DataFrame.
# The path is relative, so 'reviews_detail.csv' must sit in the notebook's
# working directory.
airbnb = pd.read_csv('reviews_detail.csv')

In [26]:
# Show the (rows, columns) dimensions of the loaded frame.
airbnb.shape
Output: (10000, 10)

In [27]:
# Preview the first five rows to sanity-check the columns that were read.
airbnb.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,9452127,58944174,2016-01-05,6279455,Ravi,Great place. Helena was prompt to reply and ve...
1,9452127,59446510,2016-01-10,23770684,Shiv,We had previously stayed at the apartment over...
2,9452127,59557387,2016-01-11,29070567,Jandall,Helena was a great host and extremely accommod...
3,9452127,62664325,2016-02-15,58381183,Cody,Helena was extremely accessible throughout the...
4,9452127,63410004,2016-02-22,29014080,Magnus,Awesome place! Looks just as in photos. Plenty...


In [28]:
# Summary statistics for the numeric columns.
# NOTE(review): the numeric columns here are identifiers (listing_id, id,
# reviewer_id), so only `count` is really informative.
airbnb.describe()

Unnamed: 0,listing_id,id,reviewer_id
count,801784.0,801784.0,801784.0
mean,7181406.0,99459650.0,42319550.0
std,5965983.0,58366030.0,39040140.0
min,2515.0,198.0,1.0
25%,1584167.0,49593750.0,9999482.0
50%,5918025.0,102766200.0,29678910.0
75%,12231960.0,150366000.0,64691940.0
max,21154540.0,199715000.0,152696800.0


In [29]:
# Column dtypes and non-null counts; a non-null count below the row total
# means the column has missing values.
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801784 entries, 0 to 801783
Data columns (total 6 columns):
listing_id       801784 non-null int64
id               801784 non-null int64
date             801784 non-null object
reviewer_id      801784 non-null int64
reviewer_name    801784 non-null object
comments         801007 non-null object
dtypes: int64(3), object(3)
memory usage: 36.7+ MB


In [30]:
# Check how pandas typed the review text column.
airbnb.comments.dtype

dtype('O')

In [31]:
# Convert the comments/reviews column to str for later evaluation.
# Missing comments are NaN; a bare astype(str) would turn them into the literal
# text 'nan', which would then be measured and sentiment-scored as if it were a
# real review. Replace them with the empty string first.
airbnb['comments'] = airbnb['comments'].fillna('').astype(str)


In [32]:
# Character count of every review, stored next to the text itself.
comment_text = airbnb['comments']
airbnb['text length'] = comment_text.map(len)
airbnb.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,text length
0,9452127,58944174,2016-01-05,6279455,Ravi,Great place. Helena was prompt to reply and ve...,172
1,9452127,59446510,2016-01-10,23770684,Shiv,We had previously stayed at the apartment over...,456
2,9452127,59557387,2016-01-11,29070567,Jandall,Helena was a great host and extremely accommod...,210
3,9452127,62664325,2016-02-15,58381183,Cody,Helena was extremely accessible throughout the...,426
4,9452127,63410004,2016-02-22,29014080,Magnus,Awesome place! Looks just as in photos. Plenty...,205


In [35]:
# NOTE: `x` is an alias of `airbnb`, not a copy, so the score columns added
# below show up on `airbnb` as well.
x = airbnb

# VADER analyzer shared by the helpers and the bulk scoring below.
sid = SIA()


def posit(x):
    """Return the VADER 'pos' (positive) proportion for the text `x`.

    The parameter shadows the module-level DataFrame `x` inside this function.
    """
    return sid.polarity_scores(x)['pos']


def negit(x):
    """Return the VADER 'neg' (negative) proportion for the text `x`.

    The parameter shadows the module-level DataFrame `x` inside this function.
    """
    return sid.polarity_scores(x)['neg']


def neuit(x):
    """Return the VADER 'neu' (neutral) proportion for the text `x`.

    The parameter shadows the module-level DataFrame `x` inside this function.
    """
    return sid.polarity_scores(x)['neu']


# Score each comment exactly once and fan the result out into three columns.
# Calling posit/negit/neuit column by column would run the analyzer three
# times per review, tripling the cost of the slowest step in the notebook.
scores = x['comments'].apply(sid.polarity_scores)
score_frame = pd.DataFrame(
    scores.tolist(),
    index=x.index,
    columns=['pos', 'neg', 'neu'],  # explicit columns keep an empty frame valid
)
x['pos'] = score_frame['pos']
x['neg'] = score_frame['neg']
x['neu'] = score_frame['neu']

x.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,text length,pos,neg,neu
0,9452127,58944174,2016-01-05,6279455,Ravi,Great place. Helena was prompt to reply and ve...,172,0.308,0.0,0.692
1,9452127,59446510,2016-01-10,23770684,Shiv,We had previously stayed at the apartment over...,456,0.17,0.06,0.77
2,9452127,59557387,2016-01-11,29070567,Jandall,Helena was a great host and extremely accommod...,210,0.312,0.0,0.688
3,9452127,62664325,2016-02-15,58381183,Cody,Helena was extremely accessible throughout the...,426,0.113,0.0,0.887
4,9452127,63410004,2016-02-22,29014080,Magnus,Awesome place! Looks just as in photos. Plenty...,205,0.349,0.0,0.651


In [36]:
# Sum the positive scores of all reviews per listing; sort=False keeps the
# listings in the order they first appear in `x`.
df = (
    x.groupby('listing_id', sort=False)['pos']
     .agg('sum')
     .reset_index(name='Total positive')
)
df.head()


Unnamed: 0,listing_id,Total positive
0,9452127,7.279
1,6184827,1.15
2,6921831,2.114
3,766542,1.193
4,6078066,17.814


In [37]:
# Re-display the review-level frame.
x.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,text length,pos,neg,neu
0,9452127,58944174,2016-01-05,6279455,Ravi,Great place. Helena was prompt to reply and ve...,172,0.308,0.0,0.692
1,9452127,59446510,2016-01-10,23770684,Shiv,We had previously stayed at the apartment over...,456,0.17,0.06,0.77
2,9452127,59557387,2016-01-11,29070567,Jandall,Helena was a great host and extremely accommod...,210,0.312,0.0,0.688
3,9452127,62664325,2016-02-15,58381183,Cody,Helena was extremely accessible throughout the...,426,0.113,0.0,0.887
4,9452127,63410004,2016-02-22,29014080,Magnus,Awesome place! Looks just as in photos. Plenty...,205,0.349,0.0,0.651


In [38]:
# Pull the three sentiment columns into an independent frame, so that later
# changes to `keeper` do not touch `x`.
sentiment_columns = ['pos', 'neg', 'neu']
keeper = x.loc[:, sentiment_columns].copy()
keeper.head()

Unnamed: 0,pos,neg,neu
0,0.308,0.0,0.692
1,0.17,0.06,0.77
2,0.312,0.0,0.688
3,0.113,0.0,0.887
4,0.349,0.0,0.651


In [39]:
# Persist the per-review sentiment scores to disk.
# NOTE: sep='\t' makes this a tab-delimited file even though the name ends in
# .csv, so readers must pass sep='\t' too. `index` is left at its pandas
# default, so the row index is written as the first column.
keeper.to_csv("sentiments_store.csv", sep='\t', encoding='utf-8')