In [1]:
import os
import pandas as pd
import pyspark
from pyspark import SparkFiles
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

seed = 100
filePath = "yelp_dataset/review_cleaned.csv"
sc = SparkContext()

In [4]:
dfPandas = pd.read_csv(filePath, index_col=0)
dfPandas.head()

  mask |= (ar1 == a)


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
1,NZnhc2sEQy3RmzKTZnqtwQ,0.0,2017-01-14 21:30:33,0.0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0.0,yXQM5uF2jS6es16SJzNHfg
2,WTqjgwHlXbSFevF32_DJVw,0.0,2016-11-09 20:09:03,0.0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3.0,n6-Gk65cPZL6Uz8qRm3NYw
6,3fw2X5bZYeW9xCz_zGhOHg,5.0,2016-05-07 01:21:02,4.0,G7XHMxG0bx9oBJNECG4IFg,3.0,Tracy dessert had a big name in Hong Kong and ...,5.0,jlu4CztcSxrKx56ba1a5AQ
15,YvrylyuWgbP90RgMqZQVnQ,0.0,2017-04-07 21:27:49,0.0,svK3nBU7Rk8VfGorlrN52A,5.0,You can't really find anything wrong with this...,0.0,NJlxGtouq06hhC7sS2ECYw
16,NyLYY8q1-H3hfsTwuwLPCg,0.0,2015-01-03 22:47:34,0.0,1wVA2-vQIuW_ClmXkDxqMQ,4.0,Great lunch today. Staff was very helpful in a...,0.0,86J5DwcFk4f4In1Vxe2TvA


In [5]:
dfPandas.shape

(3148043, 9)

In [6]:
print("NaNs: ", dfPandas['text'].isnull().sum())
dfPandas.loc[dfPandas.isnull().any(axis=1)]

NaNs:  0


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_df=0.95, min_df=0.05)
count_vectorizer.fit(dfPandas['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.95, max_features=None, min_df=0.05,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(count_vectorizer.transform(dfPandas['text']))

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [9]:
count_vectorizer.get_feature_names()

['10',
 'about',
 'after',
 'again',
 'all',
 'also',
 'always',
 'am',
 'amazing',
 'an',
 'and',
 'another',
 'any',
 'are',
 'area',
 'around',
 'as',
 'asked',
 'at',
 'away',
 'awesome',
 'back',
 'bad',
 'bar',
 'be',
 'because',
 'been',
 'before',
 'being',
 'best',
 'better',
 'big',
 'bit',
 'both',
 'busy',
 'but',
 'by',
 'came',
 'can',
 'check',
 'cheese',
 'chicken',
 'clean',
 'come',
 'coming',
 'could',
 'customer',
 'day',
 'definitely',
 'delicious',
 'did',
 'didn',
 'different',
 'dinner',
 'do',
 'don',
 'done',
 'down',
 'drinks',
 'eat',
 'enough',
 'even',
 'ever',
 'every',
 'everything',
 'excellent',
 'experience',
 'favorite',
 'feel',
 'few',
 'find',
 'first',
 'food',
 'for',
 'found',
 'fresh',
 'friendly',
 'from',
 'get',
 'give',
 'go',
 'going',
 'good',
 'got',
 'great',
 'had',
 'happy',
 'has',
 'have',
 'he',
 'her',
 'here',
 'highly',
 'his',
 'home',
 'hot',
 'how',
 'if',
 'in',
 'into',
 'is',
 'it',
 'just',
 'know',
 'last',
 'like',
 'l

In [10]:
tfidf = tfidf_transformer.transform(count_vectorizer.transform(dfPandas['text']))

In [12]:
tfidf

<3148043x234 sparse matrix of type '<class 'numpy.float64'>'
	with 114857471 stored elements in Compressed Sparse Row format>

In [11]:
import pickle

pickle.dump(count_vectorizer, open('yelp_dataset/countVectorizer.pkl', 'wb'))
pickle.dump(tfidf_transformer, open('yelp_dataset/tfidfTransformer.pkl', 'wb'))

In [13]:
import scipy

scipy.sparse.save_npz('yelp_dataset/textTransform.npz', tfidf)

In [14]:
scoreMatrix = pd.DataFrame(
    data=tfidf.toarray(),
    columns=count_vectorizer.get_feature_names(),
)

In [15]:
scoreMatrix.head(1)

Unnamed: 0,10,about,after,again,all,also,always,am,amazing,an,...,while,who,will,with,work,worth,would,years,you,your
0,0.0,0.0,0.0,0.0,0.051035,0.06043,0.06782,0.0,0.0,0.0,...,0.0,0.0,0.055916,0.115414,0.0,0.082211,0.0,0.0,0.16944,0.0


In [16]:
scoreMatrix.to_csv('yelp_dataset/textTransformPandas.csv', index=False)

In [1]:
import os
import pyspark
from pyspark import SparkFiles
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

seed = 100
sc = SparkContext()

In [2]:
filePath = "yelp_dataset/yelp_ratings.csv"

In [6]:
# read dataset into spark RDD
sc.addFile(filePath)
sqlContext = SQLContext(sc)
df = sqlContext.read.csv(
    SparkFiles.get("yelp_ratings.csv"), 
    header=True, 
    inferSchema=True
)

sqlContext.registerDataFrameAsTable(df, "df")
df = sqlContext.sql('''
    SELECT *
    FROM df
''')

In [7]:
df.show(n=10)

+--------------------+--------------------+-----+-------------------+---------+
|         business_id|             user_id|stars|               date|TrainTest|
+--------------------+--------------------+-----+-------------------+---------+
|NZnhc2sEQy3RmzKTZ...|yXQM5uF2jS6es16SJ...|  5.0|2017-01-14 21:30:33|        0|
|WTqjgwHlXbSFevF32...|n6-Gk65cPZL6Uz8qR...|  5.0|2016-11-09 20:09:03|        1|
|3fw2X5bZYeW9xCz_z...|jlu4CztcSxrKx56ba...|  3.0|2016-05-07 01:21:02|        1|
|YvrylyuWgbP90RgMq...|NJlxGtouq06hhC7sS...|  5.0|2017-04-07 21:27:49|        1|
|NyLYY8q1-H3hfsTwu...|86J5DwcFk4f4In1Vx...|  4.0|2015-01-03 22:47:34|        1|
|cHdJXLlKNWixBXpDw...|JSrP-dUmLlwZiI7Dp...|  3.0|2015-04-01 16:30:00|        1|
|6lj2BJ4tJeu7db5as...|6Fz_nus_OG4gar721...|  5.0|2017-05-26 01:23:19|        1|
|qx6WhZ42eDKmBchZD...|DzZ7piLBF-WsJxqos...|  5.0|2017-03-27 01:14:37|        0|
|Mem13A3C202RzT53n...|5JVY32_bmTBfIGpCC...|  5.0|2017-05-13 10:41:43|        1|
|I4Nr-MVc26qWr08-S...|3CJUJILq7CLHk_9Or.

In [8]:
df.dtypes

[('business_id', 'string'),
 ('user_id', 'string'),
 ('stars', 'double'),
 ('date', 'timestamp'),
 ('TrainTest', 'int')]

In [9]:
dftrain = df.where(col('TrainTest') == 1)
dftest = df.where(col('TrainTest') == 0)

In [13]:
dftrain.take(3)

[Row(business_id='WTqjgwHlXbSFevF32_DJVw', user_id='n6-Gk65cPZL6Uz8qRm3NYw', stars=5.0, date=datetime.datetime(2016, 11, 9, 20, 9, 3), TrainTest=1),
 Row(business_id='3fw2X5bZYeW9xCz_zGhOHg', user_id='jlu4CztcSxrKx56ba1a5AQ', stars=3.0, date=datetime.datetime(2016, 5, 7, 1, 21, 2), TrainTest=1),
 Row(business_id='YvrylyuWgbP90RgMqZQVnQ', user_id='NJlxGtouq06hhC7sS2ECYw', stars=5.0, date=datetime.datetime(2017, 4, 7, 21, 27, 49), TrainTest=1)]

In [11]:
dftest.take(3)

[Row(business_id='NZnhc2sEQy3RmzKTZnqtwQ', user_id='yXQM5uF2jS6es16SJzNHfg', stars=5.0, date=datetime.datetime(2017, 1, 14, 21, 30, 33), TrainTest=0),
 Row(business_id='qx6WhZ42eDKmBchZDax4dQ', user_id='DzZ7piLBF-WsJxqosfJgtA', stars=5.0, date=datetime.datetime(2017, 3, 27, 1, 14, 37), TrainTest=0),
 Row(business_id='d_L-rfS1vT3JMzgCUGtiow', user_id='2mxBNBeFrgDszqGS5tdEHA', stars=5.0, date=datetime.datetime(2016, 7, 25, 3, 57, 19), TrainTest=0)]

In [14]:
import math
import operator
# using average rate as Baseline model
meanRating = df.rdd.map(lambda x: x[2]).mean()
baselineRmse = math.sqrt(
    dftest.rdd.map(lambda x: (meanRating - x[2]) ** 2).reduce(operator.add) / dftest.count()
)
print("Baseline Model (Rating Average for all users and movies) Performance on Test Set")
print("baseline performance on test set: ", baselineRmse)

Baseline Model (Rating Average for all users and movies) Performance on Test Set
baseline performance on test set:  1.5669363541126484
