In [114]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.inspection import * #permutation_importance
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.util import ngrams

In [38]:
df = pd.read_csv(r'C:\Users\Selva\Desktop\Cyrus\OneDrive_1_9-4-2019\Predict rating.csv')

In [39]:
pd.set_option('display.max_colwidth', -1)

In [40]:
df[['review_title','review_body','rating','staff_rating','atmos_rating','bud_rating']].sample(25)

Unnamed: 0,review_title,review_body,rating,staff_rating,atmos_rating,bud_rating
120997,osides best .,i wont go anywhere else in oside..staff is very cool and they got that fire for a good price..quit playin .call em..,5.0,5.0,,5.0
159595,The love is real,Live 2 Love is definitely the best of the best when its come to Oc dispensaries. The coolest and compassionate staff. The meds and deals are always amazing. You would be doing yourself wrong not to check them out. THAnks for the showing love even when I was ballin on a budget,5.0,5.0,5.0,5.0
166481,Local guy,"I've been to this dispensary a few times now and every time, I'm impressed with the staff, knowledge, product,everything is good. Thanks Keith, your always helpful and super cool. Thanks for providing your services locally. That's SUPER convenient.",5.0,5.0,5.0,5.0
107771,The only place I shop,"Seriously, it takes a killer deal to even get me THINKING of trying another shop. \nFire bud, amazing prices, and some of the nicest people you'll ever meet! \nAnd if you aren't getting their TXT deals, you're missing out!",5.0,5.0,5.0,5.0
72967,best,best n strongest and lasts long! the best from the best... cali's best!,5.0,5.0,,5.0
152418,once again,getting ready to get hooked up by my favorite people,5.0,5.0,,5.0
99750,great service,"i loved the flowers, the service is just as good. i keep coming back.",5.0,5.0,,5.0
102323,The shop with the most helpful people.,I like it cause it close and they always treat you write. I like this store cause my favorite girl is working there .,5.0,5.0,5.0,5.0
105894,,,,,,
69669,New remedy healthcare,I was a first time patient today and everything about it had good vibes a place I definitely know I will return too,5.0,5.0,5.0,5.0


## 1.Predict Rating

In [41]:
rating_df= df[['review_title','review_body','rating']]

In [42]:
rating_df.head()

Unnamed: 0,review_title,review_body,rating
0,BEST QUALITY FOR BEAUTIFUL PRICE,Best quality of bud i have ever seen every jar was filled with sticky potent weed not to mention the fire waxes! My new spot fosho,5.0
1,FTP,"Cool shop. Nice meds, prices, and staff.",5.0
2,great place,good product great staff friendly atmosphere,5.0
3,OC HOTBOX GOT ð¥,"Bomb meds and staff, headed to the hot box now and so should you!",5.0
4,Great place.,"1st time client. Easy location, great atmosphere. The staff was pleasant and I worked with Tyler who was very knowledgable about his products. I felt comfortable purchasing, definitely will return.",5.0


#### Checking null values

In [43]:
rating_df.isnull().values.any()

True

In [44]:
rating_df.shape

(173101, 3)

In [45]:
rating_df = rating_df.dropna()
rating_df.reset_index(inplace=True,drop=True)

In [46]:
rating_df.shape

(161131, 3)

In [47]:
rating_df.isnull().values.any()

False

#### Cleaning Data - Normalization, Stemming, Lematization

In [48]:
rating_df['review'] =  rating_df['review_title'] + ' ' + rating_df['review_body']
rating_df = rating_df[['review','rating']]

In [75]:
def stemSentence(sentence):
    if sentence.strip() == '':
        return ''
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        if word in set(stopwords.words('english')):
            continue
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [50]:
def lematizeSentence(sentence):
    if sentence.strip() == '':
        return ''
    lmtzr = WordNetLemmatizer()
    token_words=word_tokenize(sentence)
    token_words
    lematize_sentence=[]
    for word in token_words:
        lematize_sentence.append(lmtzr.lemmatize(word))
        lematize_sentence.append(" ")
    return "".join(lematize_sentence)

In [77]:
for index,row in rating_df.iterrows():
    rating_df.at[index,'review'] = ''.join([char.lower() if (65 <= ord(char) <= 90\
                                                  or 97 <= ord(char) <= 122 or ord(char) == 32)\
                                                  else '' for char in row['review']])
    rating_df.at[index,'review'] = stemSentence(rating_df.at[index,'review'])
    rating_df.at[index,'review'] = lematizeSentence(rating_df.at[index,'review'])

In [78]:
rating_df['review'].replace('',np.nan,inplace=True)

In [79]:
rating_df.isnull().values.any()

True

In [80]:
rating_df.dropna(inplace=True)
rating_df.reset_index(inplace=True,drop=True)

In [81]:
rating_df.shape

(161013, 2)

In [82]:
rating_df.head()

Unnamed: 0,review,rating
0,best qualiti beauti price best qualiti bud ever seen everi jar wa fill sticki potent weed mention fire wax new spot fosho,5
1,ftp cool shop nice med price staff,5
2,great place good product great staff friendli atmosph,5
3,oc hotbox got bomb med staff head hot box,5
4,great place st time client easi locat great atmosph staff wa pleasant work tyler wa veri knowledg hi product felt comfort purcha definit return,5


#### Generating Features

In [83]:
rating_df.head()

Unnamed: 0,review,rating
0,best qualiti beauti price best qualiti bud ever seen everi jar wa fill sticki potent weed mention fire wax new spot fosho,5
1,ftp cool shop nice med price staff,5
2,great place good product great staff friendli atmosph,5
3,oc hotbox got bomb med staff head hot box,5
4,great place st time client easi locat great atmosph staff wa pleasant work tyler wa veri knowledg hi product felt comfort purcha definit return,5


In [84]:
vect = TfidfVectorizer(input='content',stop_words='english',analyzer='word', ngram_range=(1,2),
                     min_df = 0, sublinear_tf=True)
X = vect.fit_transform(rating_df.review)


In [85]:
print(X.shape)

(161013, 736170)


In [86]:
rating_df['rating'] = [int(round(i)) for i in rating_df.rating]

In [87]:
rating_df['rating'].value_counts(normalize=True)

5    0.927962
4    0.032991
1    0.014955
3    0.014738
2    0.009291
0    0.000062
Name: rating, dtype: float64

#### Modelling

In [88]:
x_train, x_test, y_train, y_test = train_test_split(X, rating_df.rating, test_size=0.25, random_state = 7)
clf = RandomForestClassifier(n_estimators=10, random_state=7)

In [89]:
clf.fit(x_train,y_train )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=7, verbose=0,
                       warm_start=False)

#### Evaluation

In [90]:
pred = clf.predict(x_test)
print(accuracy_score(y_test, pred))

0.9308391712624833


In [91]:
conf_mat = confusion_matrix(y_test, pred)

In [92]:
conf_mat

array([[    0,     0,     0,     0,     0,     1],
       [    0,    46,     0,     5,     1,   545],
       [    0,    13,     2,     2,     2,   352],
       [    0,    11,     2,     4,     2,   530],
       [    0,     4,     1,     3,    10,  1281],
       [    0,    15,     2,     3,     9, 37408]], dtype=int64)

#### Printing features of the first row with tfidf score

In [93]:
feature_names = vect.get_feature_names()
doc = 0
feature_index = X[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)

best 0.07970410879633755
qualiti 0.11397090557960086
beauti 0.11995979319184388
price 0.05871159001958856
bud 0.054714256448256886
seen 0.1212934897631402
everi 0.09284001961073453
jar 0.14034718045530062
wa 0.06275798032240881
sticki 0.14098232277653636
potent 0.12784855019374938
weed 0.08046050554007003
mention 0.1277013948637019
wax 0.09287283568798532
new 0.0877718013157464
spot 0.07442126609944962
fosho 0.2041765918769364
best qualiti 0.21355428940759535
qualiti beauti 0.2382772847157027
beauti price 0.23285478220801328
price best 0.1351084658274313
qualiti bud 0.1131714647463518
bud seen 0.20687718277090517
seen everi 0.24553720034938464
everi jar 0.21356926124807063
jar wa 0.20482071632553417
wa sticki 0.21356926124807063
sticki potent 0.2382772847157027
potent weed 0.22852499757604214
weed mention 0.24160332713054977
mention wax 0.2353961357251147
wax new 0.23285478220801328
new spot 0.13777291073557857
spot fosho 0.2653076083810577


#### Extracting features based on Count 

In [94]:
rating_df.where(rating_df.rating >= 4)['review'].sample(10)

87425     best la love custom servic bud chain sell fat great price u get u pay shout bud tender                                                             
83169     chocol tai updat high order arriv smoke metal pipe im high                                                                                         
120951    veri nice great success awesom dispensari friendli knowledg staff veri help tri sell cost got want need ill return sure                            
25395     best town great bud best staff town                                                                                                                
55141     blackberri kush omg wa yummi alway fan kush strain thi one disappoint earthi tast super relax high great anxieti heavi function function permagrin 
30794     meca best ask eddi never receiv better servic clinic thi shop best place stop dank weed good servic                                                
81252     best spot got ta come check best spot town

In [101]:
rating_df.where(rating_df.rating >= 4)

Unnamed: 0,review,rating
0,best qualiti beauti price best qualiti bud ever seen everi jar wa fill sticki potent weed mention fire wax new spot fosho,5.0
1,ftp cool shop nice med price staff,5.0
2,great place good product great staff friendli atmosph,5.0
3,oc hotbox got bomb med staff head hot box,5.0
4,great place st time client easi locat great atmosph staff wa pleasant work tyler wa veri knowledg hi product felt comfort purcha definit return,5.0
...,...,...
161008,could live would favorit place best flower edibl,5.0
161009,great bud even better servic bomb weedtea fast servic get alway brand new bomb bud tri,5.0
161010,rambo purcha last week wa bomb guy courtesi profess,5.0
161011,best bud tri fast friendli first time today wa realli impress great select flower realli good prize receiv nice gift free gram top shelf edibl candi lighter st time patient realli like reward program definit buy,5.0


In [108]:
counts = Counter()
for index,row in rating_df.where(rating_df.rating >= 4).iterrows():
    if type(row.review) == str:
        counts.update(row.review.split(' '))

In [109]:
counts.most_common(50)

[('', 154726),
 ('great', 81386),
 ('best', 66237),
 ('thi', 49912),
 ('good', 41114),
 ('place', 40858),
 ('bud', 39494),
 ('wa', 37095),
 ('servic', 36778),
 ('alway', 36559),
 ('staff', 31776),
 ('love', 30679),
 ('shop', 30374),
 ('price', 30250),
 ('veri', 25762),
 ('time', 24940),
 ('go', 22481),
 ('deal', 21674),
 ('qualiti', 21043),
 ('come', 20417),
 ('friendli', 20041),
 ('awesom', 18043),
 ('get', 17642),
 ('help', 15907),
 ('got', 15829),
 ('flower', 15754),
 ('thank', 15633),
 ('spot', 15593),
 ('deliveri', 15307),
 ('product', 15009),
 ('guy', 14537),
 ('fire', 14411),
 ('nice', 14380),
 ('med', 13831),
 ('back', 13575),
 ('recommend', 13119),
 ('amaz', 12878),
 ('select', 12394),
 ('one', 11944),
 ('like', 11921),
 ('realli', 11514),
 ('first', 11503),
 ('bomb', 11255),
 ('dispensari', 11190),
 ('custom', 11111),
 ('weed', 11032),
 ('ha', 10539),
 ('super', 10412),
 ('ive', 10195),
 ('top', 10058)]

In [111]:
negative_counts = Counter()
for index,row in rating_df.where(rating_df.rating <= 2).iterrows():
    if type(row.review) == str:
        negative_counts.update(row.review.split(' '))

In [112]:
negative_counts.most_common(50)

[('', 3914),
 ('wa', 3794),
 ('thi', 2963),
 ('time', 1829),
 ('get', 1621),
 ('place', 1576),
 ('like', 1432),
 ('dont', 1383),
 ('go', 1327),
 ('bud', 1200),
 ('price', 1116),
 ('order', 990),
 ('back', 978),
 ('got', 969),
 ('even', 925),
 ('call', 906),
 ('servic', 897),
 ('never', 879),
 ('good', 865),
 ('one', 796),
 ('weed', 775),
 ('would', 761),
 ('custom', 756),
 ('look', 754),
 ('onli', 751),
 ('patient', 731),
 ('tri', 729),
 ('veri', 718),
 ('first', 704),
 ('im', 701),
 ('come', 698),
 ('shop', 688),
 ('guy', 676),
 ('gram', 675),
 ('ask', 663),
 ('know', 653),
 ('didnt', 648),
 ('qualiti', 633),
 ('staff', 618),
 ('becau', 610),
 ('product', 608),
 ('told', 601),
 ('said', 586),
 ('review', 585),
 ('say', 583),
 ('bad', 582),
 ('want', 575),
 ('peopl', 563),
 ('wait', 533),
 ('give', 524)]

#### Extracting n-grams count

In [115]:
bigrams_counts = Counter()
trigrams_counts = Counter()
for index,row in rating_df.where(rating_df.rating >= 4).iterrows():
    if type(row.review) == str:
        token = nltk.word_tokenize(row.review)
        bigrams = ngrams(token,2)
        trigrams = ngrams(token,3)
        bigrams_counts.update(bigrams)
        trigrams_counts.update(trigrams)


In [116]:
bigrams_counts.most_common(20)

[(('thi', 'place'), 18803),
 (('love', 'thi'), 8934),
 (('custom', 'servic'), 7507),
 (('best', 'shop'), 7377),
 (('great', 'servic'), 7282),
 (('first', 'time'), 6712),
 (('great', 'price'), 6411),
 (('thi', 'shop'), 5704),
 (('best', 'bud'), 5389),
 (('great', 'bud'), 5060),
 (('great', 'deal'), 4752),
 (('bud', 'tender'), 4683),
 (('deliveri', 'servic'), 4668),
 (('great', 'place'), 4572),
 (('come', 'back'), 4545),
 (('friendli', 'staff'), 4177),
 (('highli', 'recommend'), 4146),
 (('great', 'staff'), 4049),
 (('good', 'price'), 3809),
 (('best', 'place'), 3656)]

In [117]:
trigrams_counts.most_common(20)

[(('love', 'thi', 'place'), 5766),
 (('great', 'custom', 'servic'), 2459),
 (('first', 'time', 'patient'), 1562),
 (('love', 'thi', 'shop'), 1386),
 (('thi', 'place', 'ha'), 1374),
 (('go', 'anywh', 'el'), 1336),
 (('thi', 'place', 'great'), 1093),
 (('recommend', 'thi', 'place'), 1065),
 (('great', 'servic', 'great'), 1043),
 (('great', 'bud', 'great'), 1040),
 (('best', 'deliveri', 'servic'), 998),
 (('definit', 'come', 'back'), 985),
 (('cant', 'go', 'wrong'), 934),
 (('best', 'shop', 'town'), 885),
 (('keep', 'come', 'back'), 874),
 (('best', 'shop', 'around'), 809),
 (('great', 'price', 'great'), 805),
 (('bud', 'great', 'price'), 794),
 (('thi', 'place', 'best'), 789),
 (('highli', 'recommend', 'thi'), 762)]

In [120]:
neg_bigrams_counts = Counter()
neg_trigrams_counts = Counter()
for index,row in rating_df.where(rating_df.rating <= 2).iterrows():
    if type(row.review) == str:
        token = nltk.word_tokenize(row.review)
        neg_bigrams = ngrams(token,2)
        neg_trigrams = ngrams(token,3)
        neg_bigrams_counts.update(neg_bigrams)
        neg_trigrams_counts.update(neg_trigrams)


In [121]:
neg_bigrams_counts.most_common(20)

[(('thi', 'place'), 919),
 (('first', 'time'), 362),
 (('custom', 'servic'), 324),
 (('wast', 'time'), 234),
 (('top', 'shelf'), 198),
 (('go', 'back'), 192),
 (('come', 'back'), 184),
 (('wa', 'told'), 178),
 (('bud', 'tender'), 175),
 (('thi', 'shop'), 160),
 (('dont', 'know'), 138),
 (('time', 'patient'), 137),
 (('look', 'like'), 129),
 (('dont', 'wast'), 120),
 (('call', 'back'), 117),
 (('wa', 'veri'), 115),
 (('deliveri', 'servic'), 108),
 (('place', 'order'), 106),
 (('last', 'time'), 105),
 (('never', 'go'), 105)]

In [122]:
neg_trigrams_counts.most_common(20)

[(('first', 'time', 'patient'), 121),
 (('dont', 'wast', 'time'), 84),
 (('never', 'go', 'back'), 67),
 (('thi', 'place', 'ha'), 51),
 (('wast', 'time', 'money'), 50),
 (('thi', 'place', 'suck'), 46),
 (('never', 'come', 'back'), 41),
 (('thi', 'place', 'wa'), 40),
 (('go', 'somewh', 'el'), 38),
 (('wa', 'first', 'time'), 33),
 (('recommend', 'thi', 'place'), 30),
 (('bad', 'custom', 'servic'), 29),
 (('horribl', 'custom', 'servic'), 28),
 (('dont', 'wast', 'money'), 27),
 (('thi', 'place', 'joke'), 25),
 (('time', 'patient', 'deal'), 23),
 (('get', 'free', 'gram'), 22),
 (('free', 'pre', 'roll'), 22),
 (('fuck', 'thi', 'place'), 22),
 (('wont', 'go', 'back'), 21)]