## Machine Learning to predict a new venue's popularity (star ratings) when it opens using the data available from yelp's existing venue ratings.

In [2]:
import pandas as pd
import numpy as np
import ujson

In [3]:
data = []
with open('yelp_train_academic_dataset_business.json','rb') as yelp:
    for ye in yelp:
        data.append(ujson.loads(ye))

In [6]:
df=pd.DataFrame(data)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37938 entries, 0 to 37937
Data columns (total 15 columns):
attributes       37938 non-null object
business_id      37938 non-null object
categories       37938 non-null object
city             37938 non-null object
full_address     37938 non-null object
hours            37938 non-null object
latitude         37938 non-null float64
longitude        37938 non-null float64
name             37938 non-null object
neighborhoods    37938 non-null object
open             37938 non-null bool
review_count     37938 non-null int64
stars            37938 non-null float64
state            37938 non-null object
type             37938 non-null object
dtypes: bool(1), float64(3), int64(1), object(10)
memory usage: 4.1+ MB


In [8]:
from sklearn.model_selection import train_test_split

X = df.drop('stars',axis=1)
y= df.stars

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

In [10]:
train = X_train.join(y_train)
test = X_test.join(y_test)

In [16]:
train.groupby('city')['stars'].mean()

city
Ahwatukee               3.571429
Anthem                  3.804348
Apache Junction         3.610169
Arcadia                 5.000000
Atlanta                 3.500000
Avondale                3.522727
Black Canyon City       3.000000
Bonnyrigg               3.750000
Boulder City            4.035714
Buckeye                 3.491525
C Las Vegas             3.000000
Cambridge               4.166667
Carefree                3.760000
Casa Grande             3.567568
Cave Creek              3.939252
Centennial Hills        3.000000
Central City Village    3.500000
Central Henderson       3.500000
Chandler                3.682239
Chandler-Gilbert        5.000000
Clark County            3.500000
Coolidge                3.571429
Cottage Grove           3.000000
Cramond                 3.500000
Dalkeith                4.250000
Dane                    4.500000
De Forest               3.700000
DeForest                4.100000
Deforest                5.000000
Eagan                   5.000000
     

### Custom Estimator to predict star ratings using average rating of a city

In [17]:
from sklearn.base import BaseEstimator, RegressorMixin

class cityModel(BaseEstimator,RegressorMixin):
    def __init__(self):
        pass
    def fit(self,X,y):
        self.data = pd.DataFrame({'city':X,'stars':y})
        self.avg = self.data.groupby('city')['stars'].mean()
        return self
    def predict(self,X):
        return self.avg.loc[X] if X in self.avg.index.values else 2.8

In [25]:
model =cityModel()
model.fit(train.city,train.stars)

cityModel()

In [26]:
print model.predict('Delhi')
print model.predict('Redmond')

2.8
2.8


### Custom transformer to select relevant columns in order to predict star ratings using neighborhood dynamics

In [51]:
class latlongModel(BaseEstimator, TransformerMixin):
    def __init__(self,column1,column2):
        self.column1 = column1
        self.column2 = column2
        
    def fit(self, X, y=None):
        return self
        
    def transform(self,X):
        self.result = zip(X[self.column1],X[self.column2])
        return self.result

In [52]:
#transformer = latlongModel('latitude','longitude')
#X_train_trans=transformer.fit_transform(X_train)

In [137]:
#print X_train_trans

In [54]:
# Fitting a pipeline for transforming the data and fitting the model
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

reg = Pipeline([('trans',latlongModel('latitude','longitude')),
               ('estimator',KNeighborsRegressor(n_neighbors=3))])

reg.fit(X_train,y_train)

Pipeline(steps=[('trans', latlongModel(column1='latitude', column2='longitude')), ('estimator', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='uniform'))])

In [55]:
reg.predict(X_test)

array([ 3.83333333,  3.33333333,  3.5       , ...,  3.5       ,
        4.33333333,  2.66666667])

In [56]:
reg.score(X_test,y_test)

-0.21053764500254091

In [57]:
reg.get_params().keys()

['estimator__metric_params',
 'estimator__p',
 'trans__column1',
 'trans__column2',
 'estimator__metric',
 'estimator__leaf_size',
 'estimator__n_jobs',
 'estimator',
 'steps',
 'estimator__weights',
 'estimator__n_neighbors',
 'trans',
 'estimator__algorithm']

In [58]:
# Hyper parameter tuning to predict star ratings using neighborhood dynamics.

param_grid = {'estimator__n_neighbors': (5,7,10)}

pipe = Pipeline ([
    ('transformer',latlongModel('latitude','longitude')),
    ('estimator', KNeighborsRegressor())
])


from sklearn.grid_search import GridSearchCV


grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('transformer', latlongModel(column1='latitude', column2='longitude')), ('estimator', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'estimator__n_neighbors': (5, 7, 10)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [59]:
grid.predict(X_test)

array([ 3.65,  3.5 ,  3.85, ...,  3.9 ,  3.85,  3.65])

In [60]:
print grid.best_params_

{'estimator__n_neighbors': 10}


In [61]:
print grid.best_estimator_

Pipeline(steps=[('transformer', latlongModel(column1='latitude', column2='longitude')), ('estimator', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=10, p=2,
          weights='uniform'))])


In [62]:
print grid.best_score_

-0.0413267365233


### Custom transformer to perform feature extraction on venue category in order to predict star ratings

In [None]:
'''
result =[]
value = 1
count =0
for row in X_train['categories']:
    d = dict()
    for i in range(len(row)):
        count =+1
        d.update({row[i]:value})
        print d
    result.append(d)    
'''

In [98]:
class categoryModel(BaseEstimator, TransformerMixin):
    def __init__(self,column):
        self.column = column
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        self.result =[]
        value = 1
        for row in X[self.column]:
            d = dict()
            for i in range(len(row)):
                d.update({row[i]:value})
            self.result.append(d)
        return self.result

In [102]:
#model = categoryModel('categories')
#result=model.fit_transform(X_train)

In [103]:
from sklearn.feature_extraction import DictVectorizer

#v = DictVectorizer(sparse=False)
#X = v.fit_transform(result)


In [104]:
#X

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [105]:
# Validating the shape of X
#all_keys = set().union(*(d.keys() for d in result))

#print "No. of rows in the sparse vector : ", X_train.shape[0]
#print "No. of columns in the sparse vector : ", len(all_keys)
#print "Shape of sparse vector :", X.shape

No. of rows in the sparse vector :  28453
No. of columns in the sparse vector :  694
Shape of sparse vector : (28453, 694)


In [107]:
# Pipeline to custom fit and transform the venue category, generate sparse feature vector and fit a regularized linear model

from sklearn.linear_model import Lasso
from sklearn.feature_extraction import DictVectorizer

pipeline = Pipeline([('trans',categoryModel('categories')),
                     ('vect',DictVectorizer(sparse=False)),
                     ('lasso', Lasso()),
])

pipeline.fit(X_train,y_train)

Pipeline(steps=[('trans', categoryModel(column='categories')), ('vect', DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [108]:
pipeline.predict(X_test)

array([ 3.67335606,  3.67335606,  3.67335606, ...,  3.67335606,
        3.67335606,  3.67335606])

In [109]:
pipeline.score(X_test,y_test)

-3.8927106189579064e-06

### Custom transformer to flatten venue's json format attribute in order to perform feature extraction and prediction of star rating

In [115]:
class knnModel(BaseEstimator,TransformerMixin):
    def __init__(self,column):
        self.column = column
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        
        def flatten_json(y):
            out = {}

            def flatten(x, name=''):
                        if type(x) is dict:
                            for a in x:
                                flatten(x[a], name + a + '_')
                        elif type(x) is list:
                            i = 0
                            for a in x:
                                flatten(a, name + str(i) + '_')
                                i += 1
                        elif type(x) is unicode:
                             flatten(1, name + str(x) + '_')

                        elif type(x) is bool:
                             flatten(int(x), name)
                        else:
                            out[name[:-1]] = x

            flatten(y)
            return out
    
        self.attrlist = []
        self.attr = X[self.column].tolist()
        
        for attr in self.attr:
            flat = flatten_json(attr)
            self.attrlist.append(flat)
            
        return self.attrlist

In [139]:
##knnmodel = knnModel('attributes')
##knnmodel.fit_transform(X_train)

In [138]:
##for row in X_train.attributes:
   ## print row

In [127]:
# Piple line to implement custom transformer to flatten the json column, dictvectorizer to generate sparse vector and estimator to fit a regularized linear model.

from sklearn.linear_model import LinearRegression, Lasso
pipl = Pipeline([
    ('trans',knnModel('attributes')),
    ('vect',DictVectorizer(sparse=False)),
    #('linreg',LinearRegression())
    ('lasso',Lasso())
])

pipl.fit(X_train,y_train)

Pipeline(steps=[('trans', knnModel(column='attributes')), ('vect', DictVectorizer(dtype=<type 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [128]:
pipl.predict(X_test)

array([ 3.67335606,  3.67335606,  3.67335606, ...,  3.67335606,
        3.67335606,  3.67335606])

In [129]:
pipl.score(X_test,y_test)

-3.8927106189579064e-06

### Pipeline to implement full model using Features Union to predict star ratings

In [131]:
from sklearn.pipeline import FeatureUnion


class fullModel(BaseEstimator, RegressorMixin):
    def __init__(self,model):
        self.model = model
        
    def fit(self,X,y):
        return self.model.fit(X,y)
    
    def transform(self,X):
        return [ [value] for value in self.model.predict(X)]
        


In [133]:
allmodel = FeatureUnion([
    ('latlong',Pipeline([
        ('transformer',latlongModel('latitude','longitude')),
        ('estimator', fullModel(KNeighborsRegressor())),
    ])),
    
    ('category',Pipeline([
        ('transformer',categoryModel('categories')),
        ('Vectorizer',DictVectorizer(sparse=False)),
        ('estimator',fullModel(Lasso())),
    ])),
    
    ('attribute',Pipeline([
        ('transformer',knnModel('attributes')),
        ('Vectorizer',DictVectorizer(sparse=False)),
        ('estimator',fullModel(LinearRegression()))
    ])),
])

In [134]:
allmodel.fit(X_train,y_train)

FeatureUnion(n_jobs=1,
       transformer_list=[('latlong', Pipeline(steps=[('transformer', latlongModel(column1='latitude', column2='longitude')), ('estimator', fullModel(model=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='u..., fullModel(model=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)))]))],
       transformer_weights=None)

In [136]:
# Predictions are peformed using transform method defined in fullModel transformer
allmodel.transform(X_test) 

array([[ 4.1       ,  3.67335606,  3.55464887],
       [ 3.2       ,  3.67335606,  3.33948278],
       [ 3.7       ,  3.67335606,  3.55994466],
       ..., 
       [ 3.8       ,  3.67335606,  2.95496551],
       [ 4.        ,  3.67335606,  3.24125399],
       [ 3.        ,  3.67335606,  3.66016412]])