In [1]:
import os
import pandas as pd
import pyspark
from pyspark import SparkFiles
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

seed = 100
filePath = "yelp_dataset/review_cleaned.csv"
sc = SparkContext()

In [2]:
dfPandas = pd.read_csv(filePath, index_col=0)
dfPandas.head()

  mask |= (ar1 == a)


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
2144719,qEHYU_tm4YM04a0586UPvw,0.0,2015-01-01 00:00:30,0.0,hNwleSHcvNksIQT31569Yg,5.0,Window washing inside and out done after many ...,5.0,9jdES117z1Dat4aJuGZs5w
2144722,XcWlBj5oQgzKhR7Cxovj3w,0.0,2015-01-01 00:02:20,0.0,DfZGAhAkPMJYDdXGRjhw8A,2.0,"I will admit, I do not have high expectations ...",0.0,UaUVIQweBNlE_tVBCZjYdA
2144725,O-uIEuv7JLUHajkemx_sVw,0.0,2015-01-01 00:02:35,1.0,65viXwIysYSxEyPZgRSVbQ,1.0,The chicken curry I got was extremely dry. Des...,0.0,mq5rKhLMHLbUaBeZY8mY8Q
2144726,nqgeTj6bfIMY0v2J-vZa8A,0.0,2015-01-01 00:02:42,0.0,hS3phsfoP-fAZVlMomx4Kg,5.0,Really took care of me on my trip out of state...,0.0,-ELGAON2OCSBBIbGKNiYGQ
2144728,kd1NhNWvWo5AhBUSaGeSiw,1.0,2015-01-01 00:04:13,0.0,eaDgBBrOtvFUto5pADZwQA,5.0,Gary and Chester run this 2 year old Church St...,1.0,tYxumQ3zkWje5X14LTDpcA


In [3]:
dfPandas.shape

(3045862, 9)

In [6]:
print("NaNs: \n", dfPandas.isnull().sum())
dfPandas.reset_index(drop=True, inplace=True)
dfPandas.loc[dfPandas.isnull().any(axis=1)]

NaNs: 
 business_id    0
cool           0
date           0
funny          0
review_id      0
stars          0
text           0
useful         0
user_id        0
dtype: int64


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_df=0.95, min_df=0.05)
count_vectorizer.fit(dfPandas['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.95, max_features=None, min_df=0.05,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(count_vectorizer.transform(dfPandas['text']))

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [9]:
count_vectorizer.get_feature_names()

['10',
 'about',
 'after',
 'again',
 'all',
 'also',
 'always',
 'am',
 'amazing',
 'an',
 'and',
 'another',
 'any',
 'are',
 'area',
 'around',
 'as',
 'asked',
 'at',
 'away',
 'awesome',
 'back',
 'bad',
 'bar',
 'be',
 'because',
 'been',
 'before',
 'being',
 'best',
 'better',
 'big',
 'bit',
 'both',
 'but',
 'by',
 'came',
 'can',
 'check',
 'cheese',
 'chicken',
 'clean',
 'come',
 'coming',
 'could',
 'customer',
 'day',
 'definitely',
 'delicious',
 'did',
 'didn',
 'different',
 'dinner',
 'do',
 'don',
 'done',
 'down',
 'drinks',
 'eat',
 'enough',
 'even',
 'ever',
 'every',
 'everything',
 'excellent',
 'experience',
 'favorite',
 'feel',
 'few',
 'find',
 'first',
 'food',
 'for',
 'found',
 'fresh',
 'friendly',
 'from',
 'get',
 'give',
 'go',
 'going',
 'good',
 'got',
 'great',
 'had',
 'happy',
 'has',
 'have',
 'he',
 'her',
 'here',
 'highly',
 'his',
 'home',
 'hot',
 'how',
 'if',
 'in',
 'into',
 'is',
 'it',
 'just',
 'know',
 'last',
 'like',
 'little',
 

In [10]:
tfidf = tfidf_transformer.transform(count_vectorizer.transform(dfPandas['text']))

In [11]:
tfidf

<3045862x233 sparse matrix of type '<class 'numpy.float64'>'
	with 110621481 stored elements in Compressed Sparse Row format>

In [12]:
import pickle

pickle.dump(count_vectorizer, open('yelp_dataset/countVectorizer.pkl', 'wb'))
pickle.dump(tfidf_transformer, open('yelp_dataset/tfidfTransformer.pkl', 'wb'))

In [13]:
import scipy

scipy.sparse.save_npz('yelp_dataset/textTransform.npz', tfidf)

In [14]:
scoreMatrix = pd.DataFrame(
    data=tfidf.toarray(),
    columns=count_vectorizer.get_feature_names(),
)

In [15]:
scoreMatrix.head(1)

Unnamed: 0,10,about,after,again,all,also,always,am,amazing,an,...,while,who,will,with,work,worth,would,years,you,your
0,0.0,0.0,0.262632,0.0,0.201926,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.314621,0.0,0.0,0.341729,0.0,0.0


In [16]:
scoreMatrix.to_csv('yelp_dataset/textTransformPandas.csv', index=False)

In [2]:
import pandas as pd
ratings = pd.read_csv('yelp_dataset/yelp_ratings.csv')
ratings.head()

Unnamed: 0,business_id,user_id,stars,date,TrainTest
0,qEHYU_tm4YM04a0586UPvw,9jdES117z1Dat4aJuGZs5w,5.0,2015-01-01 00:00:30,1
1,XcWlBj5oQgzKhR7Cxovj3w,UaUVIQweBNlE_tVBCZjYdA,2.0,2015-01-01 00:02:20,0
2,O-uIEuv7JLUHajkemx_sVw,mq5rKhLMHLbUaBeZY8mY8Q,1.0,2015-01-01 00:02:35,1
3,nqgeTj6bfIMY0v2J-vZa8A,-ELGAON2OCSBBIbGKNiYGQ,5.0,2015-01-01 00:02:42,0
4,kd1NhNWvWo5AhBUSaGeSiw,tYxumQ3zkWje5X14LTDpcA,5.0,2015-01-01 00:04:13,0


In [3]:
ratings.drop(['date', 'TrainTest'], axis=1, inplace=True)

In [4]:
print(ratings.shape)
print(ratings.drop_duplicates().shape)

(3045862, 3)
(3045862, 3)


In [5]:
import os
import pyspark
from pyspark import SparkFiles
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

seed = 100
sc = SparkContext()

In [6]:
filePath = "yelp_dataset/yelp_ratings.csv"

In [7]:
# read dataset into spark RDD
sc.addFile(filePath)
sqlContext = SQLContext(sc)
df = sqlContext.read.csv(
    SparkFiles.get("yelp_ratings.csv"), 
    header=True, 
    inferSchema=True
)

sqlContext.registerDataFrameAsTable(df, "df")
df = sqlContext.sql('''
    SELECT *
    FROM df
''')

In [8]:
df.show(n=10)

+--------------------+--------------------+-----+-------------------+---------+
|         business_id|             user_id|stars|               date|TrainTest|
+--------------------+--------------------+-----+-------------------+---------+
|qEHYU_tm4YM04a058...|9jdES117z1Dat4aJu...|  5.0|2015-01-01 00:00:30|        1|
|XcWlBj5oQgzKhR7Cx...|UaUVIQweBNlE_tVBC...|  2.0|2015-01-01 00:02:20|        0|
|O-uIEuv7JLUHajkem...|mq5rKhLMHLbUaBeZY...|  1.0|2015-01-01 00:02:35|        1|
|nqgeTj6bfIMY0v2J-...|-ELGAON2OCSBBIbGK...|  5.0|2015-01-01 00:02:42|        0|
|kd1NhNWvWo5AhBUSa...|tYxumQ3zkWje5X14L...|  5.0|2015-01-01 00:04:13|        0|
|JyxHvtj-syke7m9rb...|fS8z1BsG6s26wiPWF...|  3.0|2015-01-01 00:05:18|        0|
|A029GQG1S3ekPit6c...|iWEruF6zWqoVWZ1ip...|  5.0|2015-01-01 00:06:22|        1|
|KmsQdsAzOptMg9W7Z...|esmTJ_wex9xzYHCbW...|  5.0|2015-01-01 00:06:43|        1|
|yEyA7uILKG97qnyx3...|N2F0ZsiSMtNm9-NBW...|  2.0|2015-01-01 00:06:52|        0|
|ahSFUPojs9X3-1jP-...|Uwu6MCuv_YIxHL0kD.

In [9]:
df.dtypes

[('business_id', 'string'),
 ('user_id', 'string'),
 ('stars', 'double'),
 ('date', 'timestamp'),
 ('TrainTest', 'int')]

In [10]:
dftrain = df.where(col('TrainTest') == 1)
dftest = df.where(col('TrainTest') == 0)

In [11]:
dftrain.take(3)

[Row(business_id='qEHYU_tm4YM04a0586UPvw', user_id='9jdES117z1Dat4aJuGZs5w', stars=5.0, date=datetime.datetime(2015, 1, 1, 0, 0, 30), TrainTest=1),
 Row(business_id='O-uIEuv7JLUHajkemx_sVw', user_id='mq5rKhLMHLbUaBeZY8mY8Q', stars=1.0, date=datetime.datetime(2015, 1, 1, 0, 2, 35), TrainTest=1),
 Row(business_id='A029GQG1S3ekPit6cObcBA', user_id='iWEruF6zWqoVWZ1ipWRVJQ', stars=5.0, date=datetime.datetime(2015, 1, 1, 0, 6, 22), TrainTest=1)]

In [12]:
dftest.take(3)

[Row(business_id='XcWlBj5oQgzKhR7Cxovj3w', user_id='UaUVIQweBNlE_tVBCZjYdA', stars=2.0, date=datetime.datetime(2015, 1, 1, 0, 2, 20), TrainTest=0),
 Row(business_id='nqgeTj6bfIMY0v2J-vZa8A', user_id='-ELGAON2OCSBBIbGKNiYGQ', stars=5.0, date=datetime.datetime(2015, 1, 1, 0, 2, 42), TrainTest=0),
 Row(business_id='kd1NhNWvWo5AhBUSaGeSiw', user_id='tYxumQ3zkWje5X14LTDpcA', stars=5.0, date=datetime.datetime(2015, 1, 1, 0, 4, 13), TrainTest=0)]

In [13]:
import math
import operator
# using average rate as Baseline model
meanRating = df.rdd.map(lambda x: x[2]).mean()
baselineRmse = math.sqrt(
    dftest.rdd.map(lambda x: (meanRating - x[2]) ** 2).reduce(operator.add) / dftest.count()
)
print("Baseline Model (Rating Average for all users and movies) Performance on Test Set")
print("baseline performance on test set: ", baselineRmse)

Baseline Model (Rating Average for all users and movies) Performance on Test Set
baseline performance on test set:  1.5670219328182322
