In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split
from textblob import TextBlob,Word
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd



In [8]:
train = pd.read_csv('files/restaurant_review1.csv', encoding = "ISO-8859-1")

In [9]:
train=train.dropna(axis=0)

In [10]:
def stemming_tokenizer(text):
    stopwords = []
    with open('files/new_sw.txt','r+') as f:
        stopword = f.readlines()
        for sw in stopword:
            sw = sw[:-1]
            stopwords.append(sw)
    f.close()
    text = text.lower()
    words = TextBlob(text).correct().words
    words = [Word(w).lemmatize("v") for w in words if not w in stopwords]
    return words

## Classifier for food

In [31]:
food_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Food'], test_size=0.25, random_state=33)

In [37]:
classifier=food_classifier.fit(X_train, y_train)
print("Accuracy: %s" % classifier.score(X_test, y_test))

Accuracy: 0.697674418605


## Classifier for service

In [38]:
service_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Service'], test_size=0.25, random_state=33)

classifier=service_classifier.fit(X_train, y_train)
print("Accuracy: %s" % classifier.score(X_test, y_test))


Accuracy: 0.697674418605


## Classifier for ambience

In [39]:
amb_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Ambience'], test_size=0.25, random_state=33)

classifier=amb_classifier.fit(X_train, y_train)
print("Accuracy: %s" % classifier.score(X_test, y_test))


Accuracy: 0.720930232558


## Classifier for deals

In [40]:
deal_classifier = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=stemming_tokenizer,ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', KNeighborsClassifier(n_neighbors=5)),
        ])

X_train, X_test, y_train, y_test = train_test_split(train['review'],train['deal'], test_size=0.25, random_state=33)

classifier=deal_classifier.fit(X_train, y_train)

print("Accuracy: %s" % classifier.score(X_test, y_test))


Accuracy: 0.860465116279


## get reviews from a restaurant with id  = res_id

In [44]:
df = pd.read_csv('files/reviews.csv', encoding = "ISO-8859-1")

In [119]:
df.head()

Unnamed: 0,funny,rating,user_id,review,restaurant_id,review_id,date,cool,useful
0,0,5,24538,My wife took me here on my birthday for breakf...,3010,1,2011-01-26,2,5
1,0,5,40413,I have no idea why some people give bad review...,1191,2,2011-07-27,0,0
2,0,4,36383,love the gyro plate. Rice is so good and I als...,1989,3,2012-06-14,0,1
3,1,4,25790,"Quiessence is, simply put, beautiful. Full wi...",3566,4,2007-12-13,4,3
4,4,5,16256,Drop what you're doing and drive here. After I...,1019,5,2010-02-12,7,7


In [120]:
def getReview(res_id):
    reviews = []
    
    rev = df[(df['restaurant_id']==res_id) & (df['useful']>0)]['review']
    for r in rev:
        reviews.append(r)
    return reviews

In [46]:
rev = getReview(3)

In [53]:
pdf= pd.read_csv('files/restaurants.csv',encoding = "ISO-8859-1")

In [56]:
pred= pdf[['new_id']]

In [63]:
pred.columns=['restaurant_id']

In [64]:
pred['food']=-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [65]:
pred['service']=-1
pred['ambience']=-1
pred['deals']=-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [3]:
import pickle

In [136]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [144]:
save_obj(food_classifier, 'food_classifier')
save_obj(service_classifier, 'service_classifier')
save_obj(amb_classifier, 'amb_classifier')
save_obj(deal_classifier, 'deal_classifier')

In [5]:
def load_obj(name ):
    with open( name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [11]:
f= load_obj('food_classifier')
s= load_obj('service_classifier')
a= load_obj('amb_classifier')
d= load_obj('deal_classifier')

In [25]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [27]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Food'], test_size=0.25, random_state=33)
print(classification_report(y_test,f.predict(X_test)))

             precision    recall  f1-score   support

        0.0       0.75      0.35      0.48        17
        1.0       0.69      0.92      0.79        26

avg / total       0.71      0.70      0.67        43



In [29]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Service'], test_size=0.25, random_state=33)
print(classification_report(y_test,s.predict(X_test)))

             precision    recall  f1-score   support

        0.0       0.76      0.83      0.79        30
        1.0       0.50      0.38      0.43        13

avg / total       0.68      0.70      0.69        43



In [30]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['Ambience'], test_size=0.25, random_state=33)
print(classification_report(y_test,a.predict(X_test)))

             precision    recall  f1-score   support

        0.0       0.70      1.00      0.82        28
        1.0       1.00      0.20      0.33        15

avg / total       0.80      0.72      0.65        43



In [31]:
X_train, X_test, y_train, y_test = train_test_split(train['review'],train['deal'], test_size=0.25, random_state=33)
print(classification_report(y_test,d.predict(X_test)))

             precision    recall  f1-score   support

        0.0       0.85      1.00      0.92        35
        1.0       1.00      0.25      0.40         8

avg / total       0.88      0.86      0.82        43



In [132]:
def getPrediction(id):
    rev= getReview(id)
    food_count = 0 
    serv_count = 0
    amb_count = 0
    deal_count = 0

    for review in rev:
        prediction = {}
        prediction["food"] = food_classifier.predict([review])[0]
        prediction["service"] = service_classifier.predict([review])[0]
        prediction["amb"] = amb_classifier.predict([review])[0]
        prediction["deals"] = deal_classifier.predict([review])[0]

        if(prediction["food"]==1):
            food_count += 1
        if(prediction["service"]==1):
            serv_count += 1
        if(prediction["amb"]==1):
            amb_count += 1
        if(prediction["deals"]==1):
            deal_count += 1

    #     print(prediction)

    rev_size = len(rev)
    if(rev_size> 0 ):
        food_per = food_count/rev_size*100
        serv_per = serv_count/rev_size*100
        amb_per = amb_count/rev_size*100
        deal_per = deal_count/rev_size*100
    else:
        food_per= 0 
        serv_per=0
        amb_per=0
        deal_per=0 

    # print(rev_size,food_per,serv_per,amb_per,deal_per)
    return (rev_size,food_per,serv_per,amb_per,deal_per)



In [127]:
for id in pred[pred['food']==-1].restaurant_id:
    i,a,b,c,d= getPrediction(id)
    pred.iloc[id-1,1]= a
    pred.iloc[id-1,2]= b 
    pred.iloc[id-1,3]= c 
    pred.iloc[id-1,4]= d 
    print(i)

KeyboardInterrupt: 

In [125]:
rev = getReview(96)

In [126]:
len(rev)

244

In [131]:
pred.head()

Unnamed: 0,restaurant_id,food,service,ambience,deals
0,1,100.0,23.076923,0.0,0.0
1,2,62.5,25.0,12.5,0.0
2,3,100.0,25.0,0.0,0.0
3,4,100.0,4.0,6.0,0.0
4,5,90.909091,31.818182,4.545455,0.0


In [130]:
pred[(pred['food']==-1)]

Unnamed: 0,restaurant_id,food,service,ambience,deals
95,96,-1.0,-1.0,-1.0,-1.0
96,97,-1.0,-1.0,-1.0,-1.0
97,98,-1.0,-1.0,-1.0,-1.0
98,99,-1.0,-1.0,-1.0,-1.0
99,100,-1.0,-1.0,-1.0,-1.0
100,101,-1.0,-1.0,-1.0,-1.0
101,102,-1.0,-1.0,-1.0,-1.0
102,103,-1.0,-1.0,-1.0,-1.0
103,104,-1.0,-1.0,-1.0,-1.0
104,105,-1.0,-1.0,-1.0,-1.0


In [129]:
pred.to_csv("prediction.csv", index=False)

In [149]:
pred[(pred['food']==-1)& (pred['restaurant_id']>=1000)]

Unnamed: 0,restaurant_id,food,service,ambience,deals
999,1000,-1.0,-1.0,-1.0,-1.0
1000,1001,-1.0,-1.0,-1.0,-1.0
1001,1002,-1.0,-1.0,-1.0,-1.0
1002,1003,-1.0,-1.0,-1.0,-1.0
1003,1004,-1.0,-1.0,-1.0,-1.0
1004,1005,-1.0,-1.0,-1.0,-1.0
1005,1006,-1.0,-1.0,-1.0,-1.0
1006,1007,-1.0,-1.0,-1.0,-1.0
1007,1008,-1.0,-1.0,-1.0,-1.0
1008,1009,-1.0,-1.0,-1.0,-1.0
