In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw review export into `df`.
# NOTE(review): this is an absolute path on one specific Windows machine, so
# the cell only runs where that exact file exists.
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Selva\Desktop\Cyrus\OneDrive_1_9-4-2019\Predict rating.csv'
)

In [3]:
# Show the full text of every cell instead of truncating long review bodies.
# `None` is the supported "no limit" value; the old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by newer releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts an integer here; -1 is its "no limit" value.
    pd.set_option('display.max_colwidth', -1)

In [4]:
# Eyeball a random sample of the review text alongside the four rating columns.
# random_state pins the draw so the preview is the same on every re-run
# (7 is the seed already used for the split and the classifier).
df[['review_title','review_body','rating','staff_rating','atmos_rating','bud_rating']].sample(25, random_state=7)

Unnamed: 0,review_title,review_body,rating,staff_rating,atmos_rating,bud_rating
160891,THE best in S.D.,STAFF is amazing always ðJenið,5.0,5.0,5.0,5.0
154211,Great spot great prices,"Great service, good quality strains. Happy hour is the time to go",5.0,5.0,5.0,5.0
25429,Great new place!!!,"Ordered yesterday because I broke my bong and didn't feel like going out. Luckily they had a great selection of bongs!! I got a 16"" with two percs and an ice catcher I love it, what other place has good bongs & will deliver them same day!? I got lemon venom & durban poison both good bud & I got 2 packs of caramel edibles for free!",5.0,5.0,,5.0
36043,best,"my go to shop, good variety of bud, bud tenders give good suggestions ð",5.0,5.0,5.0,5.0
87390,,,,,,
95090,New management is great,like this plase the have some great deals on some good buds,5.0,5.0,5.0,5.0
43961,nice fast service,for a ftp I love the service and the flowers was also wonderful I will most definitely use you guys again.,5.0,5.0,,5.0
133754,I love AMG !!,It has become my favorite shop and I don't even live close to it lol. The drive is always worth it. Great meds at great prices. Thanks AMG !!,5.0,5.0,5.0,5.0
114485,always fire!,get selection and great people never have a bad time going to this place. HIGHLY SUGGESTED!,5.0,5.0,5.0,5.0
81438,Best Place Ever,"Service is always quick, security is amazing (shoutout to Ethan - he remembered me after just a couple visits... what a gentleman!) and to Austin who helped my sister and I get everything we need! Iâll never go anywhere else!",5.0,5.0,5.0,5.0


## 1. Predict Rating

In [5]:
# Keep only the two free-text fields and the overall rating used in this section.
rating_columns = ['review_title', 'review_body', 'rating']
rating_df = df[rating_columns]

In [6]:
# Preview the first five rows of the selected columns.
rating_df.iloc[:5]

Unnamed: 0,review_title,review_body,rating
0,BEST QUALITY FOR BEAUTIFUL PRICE,Best quality of bud i have ever seen every jar was filled with sticky potent weed not to mention the fire waxes! My new spot fosho,5.0
1,FTP,"Cool shop. Nice meds, prices, and staff.",5.0
2,great place,good product great staff friendly atmosphere,5.0
3,OC HOTBOX GOT ð¥,"Bomb meds and staff, headed to the hot box now and so should you!",5.0
4,Great place.,"1st time client. Easy location, great atmosphere. The staff was pleasant and I worked with Tyler who was very knowledgable about his products. I felt comfortable purchasing, definitely will return.",5.0


#### Checking null values

In [7]:
# True if at least one cell anywhere in the frame is missing.
np.any(rating_df.isnull().values)

True

In [8]:
# (rows, columns) before any rows are dropped.
(len(rating_df.index), len(rating_df.columns))

(173101, 3)

In [9]:
# Discard every row with a missing title, body or rating, then renumber the
# index 0..n-1 so later positional access lines up with the remaining rows.
rating_df = rating_df.dropna().reset_index(drop=True)

In [10]:
# (rows, columns) after dropping rows with missing values.
(len(rating_df.index), len(rating_df.columns))

(161131, 3)

In [11]:
# Confirm that no missing values remain after the drop.
rating_df.isna().values.any()

False

#### Cleaning Data

In [12]:
# Remove every non-ASCII character (emoji, mis-decoded bytes, ...) from both
# text columns. Encoding to ASCII with errors='ignore' drops exactly the
# characters whose code point is >= 128, and it works on a whole column at
# once instead of visiting each row with iterrows() and writing back via .at.
for text_col in ('review_title', 'review_body'):
    rating_df[text_col] = (
        rating_df[text_col]
        .str.encode('ascii', errors='ignore')
        .str.decode('ascii')
    )

In [13]:
# Mark empty strings (e.g. text that consisted only of stripped non-ASCII
# characters) as missing so that dropna() can remove those rows.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `rating_df[col]` mutates a column selection, which warns on recent pandas
# and leaves `rating_df` unchanged under Copy-on-Write.
rating_df['review_title'] = rating_df['review_title'].replace('', np.nan)
rating_df['review_body'] = rating_df['review_body'].replace('', np.nan)

In [14]:
# Check whether blanking the empty strings introduced any missing values.
np.any(rating_df.isna().values)

True

In [15]:
# Remove the rows whose title or body became NaN, then renumber the index
# 0..n-1 again.
rating_df = rating_df.dropna().reset_index(drop=True)

In [16]:
# (rows, columns) once the cleaned-to-empty reviews are gone.
(rating_df.index.size, rating_df.columns.size)

(157684, 3)

In [17]:
# First five rows after cleaning; non-ASCII characters should no longer appear.
rating_df.head(n=5)

Unnamed: 0,review_title,review_body,rating
0,BEST QUALITY FOR BEAUTIFUL PRICE,Best quality of bud i have ever seen every jar was filled with sticky potent weed not to mention the fire waxes! My new spot fosho,5.0
1,FTP,"Cool shop. Nice meds, prices, and staff.",5.0
2,great place,good product great staff friendly atmosphere,5.0
3,OC HOTBOX GOT,"Bomb meds and staff, headed to the hot box now and so should you!",5.0
4,Great place.,"1st time client. Easy location, great atmosphere. The staff was pleasant and I worked with Tyler who was very knowledgable about his products. I felt comfortable purchasing, definitely will return.",5.0


#### Generating Features

In [18]:
# Join title and body (separated by one space) into a single `review` text
# field, then keep only that text and the rating.
combined_text = rating_df['review_title'] + ' ' + rating_df['review_body']
rating_df = rating_df.assign(review=combined_text)[['review', 'rating']]

In [19]:
# Preview the combined review text next to its rating.
rating_df.iloc[:5]

Unnamed: 0,review,rating
0,BEST QUALITY FOR BEAUTIFUL PRICE Best quality of bud i have ever seen every jar was filled with sticky potent weed not to mention the fire waxes! My new spot fosho,5.0
1,"FTP Cool shop. Nice meds, prices, and staff.",5.0
2,great place good product great staff friendly atmosphere,5.0
3,"OC HOTBOX GOT Bomb meds and staff, headed to the hot box now and so should you!",5.0
4,"Great place. 1st time client. Easy location, great atmosphere. The staff was pleasant and I worked with Tyler who was very knowledgable about his products. I felt comfortable purchasing, definitely will return.",5.0


In [20]:
# Turn each review into a sparse TF-IDF vector over word unigrams and bigrams,
# with English stop words removed and sublinear (1 + log tf) term-frequency
# scaling.
# min_df=1 keeps every term, exactly as the previous min_df=0 did (a term in
# the vocabulary always occurs in at least one document), but integer 0 is
# outside the documented range and is rejected by scikit-learn's parameter
# validation in newer releases.
vect = TfidfVectorizer(input='content', stop_words='english', analyzer='word', ngram_range=(1, 2),
                       min_df=1, sublinear_tf=True)
X = vect.fit_transform(rating_df.review)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [21]:
# Size of the TF-IDF matrix: (number of reviews, number of n-gram features).
n_reviews, n_features = X.shape
print((n_reviews, n_features))

(157684, 861273)


In [22]:
# Turn the float ratings into whole-number class labels. Series.round() uses
# the same round-half-to-even rule as Python 3's built-in round().
rating_df['rating'] = rating_df['rating'].round().astype('int64')

#### Modelling

In [25]:
# A small random forest (10 trees) with a fixed seed for repeatable results.
clf = RandomForestClassifier(random_state=7, n_estimators=10)

# Hold out 25% of the reviews for evaluation, using the same fixed seed.
# NOTE(review): X comes from fit_transform on the full corpus, so the
# vocabulary and IDF weights were learned from rows that land in the test
# split as well.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    rating_df['rating'],
    random_state=7,
    test_size=0.25,
)

In [26]:
# Train the forest on the training split; the fitted estimator is the cell output.
clf.fit(X=x_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [27]:
# Predict labels for the held-out reviews and report the fraction predicted
# correctly.
# NOTE(review): the sampled rows shown earlier are almost all rated 5.0; if the
# classes are that skewed, accuracy alone says little. classification_report
# and confusion_matrix are already imported and would show per-class quality.
pred = clf.predict(X=x_test)
print(accuracy_score(y_true=y_test, y_pred=pred))

0.9277542426625403


In [41]:
# Re-print the TF-IDF matrix dimensions (reviews, features) before inspecting a row.
print(tuple(X.shape))

(157684, 861273)


In [42]:
# Print every n-gram of one review together with its TF-IDF weight.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on older installations.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

doc = 0  # row of X (i.e. which review) to inspect

# Column indices of the non-zero entries in that row, in stored order.
feature_index = X[doc, :].nonzero()[1]

# (column index, weight) pairs for the selected review.
tfidf_scores = [(i, X[doc, i]) for i in feature_index]
for i, s in tfidf_scores:
    print(feature_names[i], s)


best 0.07546898200336703
quality 0.10760262178310111
beautiful 0.11498527691088482
price 0.08211682374339449
bud 0.06043693638822543
seen 0.11480358405286836
jar 0.14528002247711122
filled 0.15768992962371342
sticky 0.13374794917480448
potent 0.12077591176826957
weed 0.07649258983818175
mention 0.12378750624955498
waxes 0.1477023284322686
new 0.08318324843150021
spot 0.0710254848200214
fosho 0.19422214424379894
best quality 0.2022251202325756
quality beautiful 0.22601745844671559
beautiful price 0.23749249359633015
price best 0.16503353441052007
quality bud 0.11715585704623686
bud seen 0.2103896222156613
seen jar 0.25170561701467703
jar filled 0.24339147279682244
filled sticky 0.25170561701467703
sticky potent 0.2208642051606209
potent weed 0.21674941188000046
weed mention 0.22917834937847553
mention waxes 0.25170561701467703
waxes new 0.25170561701467703
new spot 0.1299823033403372
spot fosho 0.25170561701467703
