In [1]:
import os
import pandas as pd
import pyspark
from pyspark import SparkFiles
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Notebook-wide configuration.
# `seed` is not consumed by any cell shown here; presumably it feeds later
# randomized steps (splits / ALS) -- TODO confirm.
seed = 100
# Cleaned Yelp review dump, relative to the notebook's working directory.
filePath = "yelp_dataset/review_cleaned.csv"
# getOrCreate() reuses an already-running context, so re-executing this cell
# does not fail the way a second bare SparkContext() call would.
sc = SparkContext.getOrCreate()

In [2]:
# Load the cleaned review dump into pandas and preview the first five rows.
dfPandas = pd.read_csv(filepath_or_buffer=filePath)
dfPandas.head(n=5)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,NZnhc2sEQy3RmzKTZnqtwQ,0.0,2017-01-14 21:30:33,0.0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0.0,yXQM5uF2jS6es16SJzNHfg
1,WTqjgwHlXbSFevF32_DJVw,0.0,2016-11-09 20:09:03,0.0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3.0,n6-Gk65cPZL6Uz8qRm3NYw
2,3fw2X5bZYeW9xCz_zGhOHg,5.0,2016-05-07 01:21:02,4.0,G7XHMxG0bx9oBJNECG4IFg,3.0,Tracy dessert had a big name in Hong Kong and ...,5.0,jlu4CztcSxrKx56ba1a5AQ
3,YvrylyuWgbP90RgMqZQVnQ,0.0,2017-04-07 21:27:49,0.0,svK3nBU7Rk8VfGorlrN52A,5.0,You can't really find anything wrong with this...,0.0,NJlxGtouq06hhC7sS2ECYw
4,NyLYY8q1-H3hfsTwuwLPCg,0.0,2015-01-03 22:47:34,0.0,1wVA2-vQIuW_ClmXkDxqMQ,4.0,Great lunch today. Staff was very helpful in a...,0.0,86J5DwcFk4f4In1Vxe2TvA


In [14]:
# Show the (rows, columns) dimensions of the loaded review frame.
dfPandas.shape

(3148044, 9)

In [3]:
# Export only the (user, business, rating) triples, without the row index.
columnsRatingMatrix = ['user_id', 'business_id', 'stars']
dfPandas.to_csv(
    'yelp_dataset/yelp_ratings.csv',
    columns=columnsRatingMatrix,
    index=False,
)

In [4]:
# Report how many reviews have no text, then replace the missing entries
# with empty strings so every row in the 'text' column holds a string.
missing_text_count = dfPandas['text'].isna().sum()
print("NaNs: ", missing_text_count)
dfPandas['text'] = dfPandas['text'].fillna(value='')

NaNs:  1


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the review text, discarding terms that occur in
# more than 95% or in fewer than 5% of the reviews.
count_vectorizer = CountVectorizer(min_df=0.05, max_df=0.95)
# Kept as the cell's last expression so the fitted estimator is displayed.
count_vectorizer.fit(raw_documents=dfPandas['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.95, max_features=None, min_df=0.05,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
# Fit the TF-IDF weighting on the term-count matrix of the whole corpus.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(
    X=count_vectorizer.transform(raw_documents=dfPandas['text'])
)

In [10]:
# List the vocabulary terms retained by the fitted vectorizer.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 in
# favour of get_feature_names_out() -- confirm the installed version before
# upgrading the environment.
count_vectorizer.get_feature_names()

['10',
 'about',
 'after',
 'again',
 'all',
 'also',
 'always',
 'am',
 'amazing',
 'an',
 'and',
 'another',
 'any',
 'are',
 'area',
 'around',
 'as',
 'asked',
 'at',
 'away',
 'awesome',
 'back',
 'bad',
 'bar',
 'be',
 'because',
 'been',
 'before',
 'being',
 'best',
 'better',
 'big',
 'bit',
 'both',
 'busy',
 'but',
 'by',
 'came',
 'can',
 'check',
 'cheese',
 'chicken',
 'clean',
 'come',
 'coming',
 'could',
 'customer',
 'day',
 'definitely',
 'delicious',
 'did',
 'didn',
 'different',
 'dinner',
 'do',
 'don',
 'done',
 'down',
 'drinks',
 'eat',
 'enough',
 'even',
 'ever',
 'every',
 'everything',
 'excellent',
 'experience',
 'favorite',
 'feel',
 'few',
 'find',
 'first',
 'food',
 'for',
 'found',
 'fresh',
 'friendly',
 'from',
 'get',
 'give',
 'go',
 'going',
 'good',
 'got',
 'great',
 'had',
 'happy',
 'has',
 'have',
 'he',
 'her',
 'here',
 'highly',
 'his',
 'home',
 'hot',
 'how',
 'if',
 'in',
 'into',
 'is',
 'it',
 'just',
 'know',
 'last',
 'like',
 'l

In [11]:
# Turn every review's term counts into TF-IDF weights.
# NOTE(review): this runs count_vectorizer.transform over the full 'text'
# column again, the same call already made when fitting tfidf_transformer.
tfidf = tfidf_transformer.transform(count_vectorizer.transform(dfPandas['text']))

In [12]:
# Display the sparse matrix summary (shape, dtype, stored element count).
tfidf

<3148044x234 sparse matrix of type '<class 'numpy.float64'>'
	with 114857449 stored elements in Compressed Sparse Row format>

In [13]:
import pickle

# Persist both fitted text models for later reuse. The context managers
# guarantee each file handle is flushed and closed even if pickling raises.
with open('yelp_dataset/countVectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

with open('yelp_dataset/tfidfTransformer.pkl', 'wb') as transformer_file:
    pickle.dump(tfidf_transformer, transformer_file)

In [15]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# load `scipy.sparse`, so the attribute lookup below could otherwise depend on
# another library having imported it first.
import scipy.sparse

# Store the TF-IDF matrix in SciPy's compressed .npz sparse format.
scipy.sparse.save_npz('yelp_dataset/textTransform.npz', tfidf)

In [16]:
# Build a dense DataFrame of TF-IDF scores with one column per vocabulary
# term. toarray() materialises the whole sparse matrix as a dense array.
vocabulary_terms = count_vectorizer.get_feature_names()
scoreMatrix = pd.DataFrame(tfidf.toarray(), columns=vocabulary_terms)

In [17]:
# Write the dense TF-IDF score matrix to CSV, omitting the row index.
scoreMatrix.to_csv('yelp_dataset/textTransformPandas.csv', index=False)