# Atividade Integradora

In [1]:
import findspark as fs

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.ml.feature import StopWordsRemover
import pandas as pd
import seaborn as sns
sns.set(style="ticks", palette="pastel")
import os
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
spark_location='/Users/vivi/server/spark' # Set your own
java8_location= '/Library/Java/JavaVirtualMachines/jdk1.8.0_251.jdk/Contents/Home/' # Set your own
os.environ['JAVA_HOME'] = java8_location
fs.init(spark_home=spark_location)

In [4]:
datapath = 'data'

In [5]:
files = sorted(os.listdir(datapath))

In [6]:
files

['.DS_Store',
 'dataset list.rtf',
 'yelp_academic_dataset_business.json',
 'yelp_academic_dataset_checkin.json',
 'yelp_academic_dataset_review.json',
 'yelp_academic_dataset_tip.json',
 'yelp_academic_dataset_user.json']

In [7]:
!head data/yelp_academic_dataset_review.json

{"review_id":"xQY8N_XvtGbearJ5X4QryQ","user_id":"OwjRMXRC0KyPrIlcjaXeFQ","business_id":"-MhfebM0QIsKt87iDN-FNw","stars":2.0,"useful":5,"funny":0,"cool":0,"text":"As someone who has worked with many museums, I was eager to visit this gallery on my most recent trip to Las Vegas. When I saw they would be showing infamous eggs of the House of Faberge from the Virginia Museum of Fine Arts (VMFA), I knew I had to go!\n\nTucked away near the gelateria and the garden, the Gallery is pretty much hidden from view. It's what real estate agents would call \"cozy\" or \"charming\" - basically any euphemism for small.\n\nThat being said, you can still see wonderful art at a gallery of any size, so why the two *s you ask? Let me tell you:\n\n* pricing for this, while relatively inexpensive for a Las Vegas attraction, is completely over the top. For the space and the amount of art you can fit in there, it is a bit much.\n* it's not kid friendly at all. Seriously, don't bring them.\n* the security is n

In [8]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('Integradora Yelp') \
    .config("spark.ui.port", "4060") \
    .getOrCreate()

In [9]:
sc = spark.sparkContext

In [72]:
spark#.stop()

## Bases

In [11]:
usr_raw = spark.read.json(datapath+'/yelp_academic_dataset_user.json')

In [12]:
rv_raw = spark.read.json(datapath+'/yelp_academic_dataset_review.json')

In [13]:
bz_raw = spark.read.json(datapath+'/yelp_academic_dataset_business.json')

In [14]:
tp_raw = spark.read.json(datapath+'/yelp_academic_dataset_tip.json')

In [15]:
bz_raw.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [84]:
bz_raw.select('attributes').show()

+--------------------+
|          attributes|
+--------------------+
|[,,,,,,, True,, T...|
|[,,,,,,,,,,, True...|
|                null|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,,,,,,,,, True,,...|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,, 'none',,,,, F...|
|[,,,,,,, False,,,...|
|                null|
|[,,,,,,,,, True,,...|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,,,,,,,,, True, ...|
|[,,,,,,,,, True,,...|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,,,,,,,,, True,,...|
|[False,,,,,,, Tru...|
+--------------------+
only showing top 20 rows



In [41]:
tp_raw.show()

+--------------------+----------------+-------------------+--------------------+--------------------+
|         business_id|compliment_count|               date|                text|             user_id|
+--------------------+----------------+-------------------+--------------------+--------------------+
|UYX5zL_Xj9WEc_Wp-...|               0|2013-11-26 18:20:08|Here for a quick mtg|hf27xTME3EiCp6NL6...|
|Ch3HkwQYv1YKw_FO0...|               0|2014-06-15 22:26:45|Cucumber strawber...|uEvusDwoSymbJJ0au...|
|rDoT-MgxGRiYqCmi0...|               0|2016-07-18 22:03:42|Very nice good se...|AY-laIws3S7YXNl_f...|
|OHXnDV01gLokiX1EL...|               0|2014-06-06 01:10:34|It's a small plac...|Ue_7yUlkEbX4AhnYd...|
|GMrwDXRlAZU2zj5nH...|               0|2011-04-08 18:12:01|8 sandwiches, $24...|LltbT_fUMqZ-ZJP-v...|
|ALwAlxItASeEs2vYA...|               0|2015-05-20 20:17:38|Great ramen! Not ...|HHNBqfbDR8b1iq-QG...|
|d_L-rfS1vT3JMzgCU...|               0|2014-09-01 01:23:48|Cochinita Pibil w...|r0

In [16]:
bz_raw.createOrReplaceTempView('bz')
rv_raw.createOrReplaceTempView('rv')
usr_raw.createOrReplaceTempView('usr')
tp_raw.createOrReplaceTempView('tp')

In [17]:
print(spark.catalog.listTables())

[Table(name='bz', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='rv', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='tp', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='usr', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [18]:
bz_raw.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [19]:
usr_raw.columns

['average_stars',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool',
 'elite',
 'fans',
 'friends',
 'funny',
 'name',
 'review_count',
 'useful',
 'user_id',
 'yelping_since']

In [20]:
rv_raw.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

In [21]:
tp_raw.columns

['business_id', 'compliment_count', 'date', 'text', 'user_id']

## Joins

Juntando as informações de reviews, estabelecimentos da cidade escolhidas e usuários que frequentam esses estabelecimentos.

- Reviews + Business

In [22]:
base = spark.sql("""
        SELECT A.*,
        B.address,
        B.categories,
        B.city,
        B.hours,
        B.is_open,
        B.latitude,
        B.longitude,
        B.name AS name_bz,
        B.postal_code,
        B.review_count,
        B.stars AS stars_bz,
        B.state

        FROM rv as A 
        LEFT JOIN bz as B
        ON A.business_id = B.business_id
        
        WHERE B.city = 'Toronto'
        AND B.state = 'ON'
        AND B.review_count > 20
        
        """)

In [21]:
base.show(5)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+-------------------+--------------------+-------+-----+-------+----------+-----------+--------------------+-----------+------------+--------+-----+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|            address|          categories|   city|hours|is_open|  latitude|  longitude|             name_bz|postal_code|review_count|stars_bz|state|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+-------------------+--------------------+-------+-----+-------+----------+-----------+--------------------+-----------+------------+--------+-----+
|2OFWvbHVwvnva7GxP...|   0|2017-07-29 01:46:38|    0|7aA5gWum--OxUmk4z...|  4.0|We ordered the je...|     0|_NMdJpHLvmSnTo1Kr...|912  Bloor Street W|Caribbean, Restau...

In [23]:
base.createOrReplaceTempView('base')

- Contagem da quantidade de linhas para garantir que a integridade do dataset ser mantém ao longo do processamento.

In [24]:
#linhas na base de reviews + business
spark.sql('''
            SELECT Count(*)
            FROM base
            ''').show()

+--------+
|count(1)|
+--------+
|  490518|
+--------+



- (Reviews + Business) + Users

In [24]:
base1 = spark.sql("""
        SELECT A.*,
        B.average_stars AS stars_usr,
        B.compliment_cool,
        B.compliment_cute,
        B.compliment_funny,
        B.compliment_hot,
        B.compliment_list,
        B.compliment_more,
        B.compliment_note,
        B.compliment_photos,
        B.compliment_plain,
        B.compliment_profile,
        B.compliment_writer,
        B.cool AS cool_usr,
        B.elite AS elite_usr,
        B.fans,
        B.friends,
        B.funny AS funny_usr,
        B.name AS name_usr,
        B.review_count AS review_count_usr,
        B.useful AS useful_usr,
        B.yelping_since

        FROM base as A 
        
        LEFT JOIN usr as B
        ON A.user_id = B.user_id 
        
        """)

In [25]:
base1.createOrReplaceTempView('base1')

In [28]:
#linhas na base de reviews + business + users
spark.sql('''
            SELECT Count(*)
            FROM base1
            ''').show()

+--------+
|count(1)|
+--------+
|  490518|
+--------+



In [26]:
aux = spark.sql('''
            SELECT user_id, city, yelping_since,
            COUNT(review_id) AS city_review_counter,
            review_count_usr
            
            FROM base1
            
            GROUP BY user_id, review_count_usr, city, yelping_since
            ORDER BY city_review_counter DESC        
            
            ''')

In [27]:
aux.createOrReplaceTempView('aux')

In [31]:
aux.show()

+--------------------+-------+-------------------+-------------------+----------------+
|             user_id|   city|      yelping_since|city_review_counter|review_count_usr|
+--------------------+-------+-------------------+-------------------+----------------+
|CxDOIDnH8gp9KXzpB...|Toronto|2009-11-09 20:44:45|               1199|            6633|
|Q9mA60HnY87C1TW5k...|Toronto|2010-08-29 01:34:42|                795|            1376|
|O3pSxv1SyHpY4qi4Q...|Toronto|2010-07-04 02:52:05|                634|            1353|
|0BBUmH7Krcax1RZgb...|Toronto|2010-03-18 14:40:42|                607|            1182|
|gwIqbXEXijQNgdESV...|Toronto|2010-11-17 23:13:51|                565|            1106|
|ic-tyi1jElL_umxZV...|Toronto|2014-02-19 22:37:38|                538|            1893|
|TbhyP24zYZqZ2VJZg...|Toronto|2010-03-17 18:05:59|                495|             862|
|1fNQRju9gmoCEvbPQ...|Toronto|2013-02-24 06:35:51|                488|             710|
|FREeRQtjdJU83AFtd...|Toronto|20

Aparentemente os usuários fazem reviews em estabelecimentos não só em Toronto. Para incluir essa informação no modelo, será criada uma variável com a relação entre as quantidade de reviews do usuário na cidade pela quantidade total de reviews do usuário.

- Média de reviews por usuário na cidade e total

In [32]:
spark.sql('''
            SELECT AVG(city_review_counter), 
             AVG(review_count_usr)
            
            FROM aux       
            
            ''').show()

+------------------------+---------------------+
|avg(city_review_counter)|avg(review_count_usr)|
+------------------------+---------------------+
|       4.324411531340915|    35.71904258132769|
+------------------------+---------------------+



- Remoção de usuários com apenas 1 review na cidade

In [28]:
base2 = spark.sql('''
            SELECT A.*,
            B.city_review_counter,
            (B.city_review_counter/B.review_count_usr) AS city_review_ratio
            
            FROM base1 as A
            
            LEFT JOIN aux as B
            ON A.user_id = B.user_id
            
            WHERE B.city_review_counter > 1
            
            ''')

In [29]:
base2.createOrReplaceTempView('base2')

In [35]:
#linhas na base de reviews + business + users
spark.sql('''
            SELECT Count(*)
            FROM base2
            ''').show()

+--------+
|count(1)|
+--------+
|  429793|
+--------+



- Classificação das avaliações em Boa (1 - maior do que 4) e Ruim ou inexistente (0 - menor do que 4).

In [30]:
base3 = spark.sql("""
        SELECT *,
        (CASE WHEN stars >=4 THEN 1 ELSE 0 END) as class_rv,
        (CASE WHEN stars_bz >=4 THEN 1 ELSE 0 END) as class_bz,
        (CASE WHEN stars_usr >=4 THEN 1 ELSE 0 END) as class_usr
        
        FROM base2
        
        """)

In [37]:
base3.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id',
 'address',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name_bz',
 'postal_code',
 'review_count',
 'stars_bz',
 'state',
 'stars_usr',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool_usr',
 'elite_usr',
 'fans',
 'friends',
 'funny_usr',
 'name_usr',
 'review_count_usr',
 'useful_usr',
 'yelping_since',
 'city_review_counter',
 'city_review_ratio',
 'class_rv',
 'class_bz',
 'class_usr']

In [31]:
base3.createOrReplaceTempView('base3')

In [39]:
spark.sql('''
            SELECT Count(*)
            FROM base2
            ''').show()

+--------+
|count(1)|
+--------+
|  429793|
+--------+



- Base + Tips

In [32]:
spark.sql('''
            SELECT business_id, user_id, 
            count(text) AS tips_counter,
            sum(compliment_count) as total_compliments
            
            FROM tp
            
            GROUP BY business_id, user_id
            ORDER BY total_compliments DESC
            ''').show()

+--------------------+--------------------+------------+-----------------+
|         business_id|             user_id|tips_counter|total_compliments|
+--------------------+--------------------+------------+-----------------+
|BQqwIYQuo2W94smjr...|mkbx55W8B8aPLgDqe...|          76|               65|
|OMRYQihVjqqzjoNoQ...|mkbx55W8B8aPLgDqe...|          73|               30|
|QhXBIQWUmQxuVErdw...|mkbx55W8B8aPLgDqe...|          34|               30|
|QsKhwKYB3YeWXqpIP...|mkbx55W8B8aPLgDqe...|          38|               30|
|55E0-qUHa7Kzqz8rO...|mkbx55W8B8aPLgDqe...|          45|               28|
|gWeYW0E5Tfmmj_9fu...|mkbx55W8B8aPLgDqe...|          14|               19|
|YILyHegzhy1vlc_LN...|mkbx55W8B8aPLgDqe...|          29|               16|
|qpdMFF6Y31ZAtE1R1...|mkbx55W8B8aPLgDqe...|          16|               16|
|gwdQwe1JHLe-vPY9P...|wTfb2nfzPIyFcYQAr...|           1|               15|
|vMRWkz0wb3g4HjW_O...|tQPk4JiBPsx7NSIDb...|         164|               15|
|0QzCeORfF8EY34UOD...|mkb

In [33]:
base4 = spark.sql('''
            SELECT A.*,
            IFNULL(B.compliment_count,0) AS compliment_count_tip,
            IFNULL(B.text,'') AS tip
            
            FROM base3 as A
            
            LEFT JOIN tp as B
            ON (A.user_id = B.user_id AND A.business_id = B.business_id)
            
            ''')

In [34]:
base4.select('business_id', 'user_id','tip','compliment_count_tip').show()

+--------------------+--------------------+--------------------+--------------------+
|         business_id|             user_id|                 tip|compliment_count_tip|
+--------------------+--------------------+--------------------+--------------------+
|BxCzy1WOVxOrr_G7V...|--Qh8yKWAvIP4V4K8...|                    |                   0|
|JgaQFKxW-Bnfc7r5E...|--Qh8yKWAvIP4V4K8...|                    |                   0|
|U_jPOTd-Z5KATbTYn...|-3PTUP443q6hQESLK...|                    |                   0|
|b6sUcJj_xbeYrKKOs...|-3PTUP443q6hQESLK...|                    |                   0|
|I_3P0ns6_3najPBxn...|-9RU4LuI_TfYgv9rB...|If you get gravy ...|                   0|
|ar3FKybOl9yazviTB...|-B4Cf2XLkPr9qMlLP...|                    |                   0|
|fe6YW5wLAeWQBATPf...|-EWgEX_dc1Xv83dVt...|                    |                   0|
|iGEvDk6hsizigmXhD...|-Fj7FUTxUy04imKQ8...|                    |                   0|
|XdY36-Qh_DxnCaFpT...|-Iy3SMroVffvis2KZ...|           

In [61]:
base4.select('text','tip').show()

+--------------------+--------------------+
|                text|                 tip|
+--------------------+--------------------+
|I came here for b...|                   0|
|We recently had a...|                   0|
|I got a small ste...|                   0|
|- The only thing ...|                   0|
|I have been comin...|If you get gravy ...|
|You really can't ...|                   0|
|Cutest cafe in Qu...|                   0|
|Probably one if m...|                   0|
|I just went to To...|                   0|
|My mom and I had ...|                   0|
|I was impressed w...|                   0|
|Service is good a...|Have the wings. T...|
|Horrible service....|                   0|
|Stayed here few w...|                   0|
|the shop is cute,...|                   0|
|This place makes ...|                   0|
|Came here and imm...|                   0|
|Wow! I was in the...|                   0|
|I do honestly wan...|                   0|
|This little basem...|          

## Tratamento de texto

In [35]:
def word_clean(sdf,col,new_col):
    rv1 = sdf.withColumn(new_col,f.regexp_replace(f.col(col), "'d", " would"))
    rv2 = rv1.withColumn(new_col,f.regexp_replace(f.col(new_col), "'ve", " have"))
    rv3 = rv2.withColumn(new_col,f.regexp_replace(f.col(new_col), "'s", " is"))
    rv4 = rv3.withColumn(new_col,f.regexp_replace(f.col(new_col), "'re", " are"))
    rv5 = rv4.withColumn(new_col,f.regexp_replace(f.col(new_col), "n't", " not"))
    rv6 = rv5.withColumn(new_col,f.regexp_replace(f.col(new_col), '\W+', " "))
    rv7 = rv6.withColumn(new_col,f.lower(f.col(new_col)))
    return rv7

In [36]:
base5 = word_clean(base4,'text','text_clean')
base6 = word_clean(base5,'tip','tip_clean')

In [37]:
base6.select('text_clean','tip_clean').show()

+--------------------+--------------------+
|          text_clean|           tip_clean|
+--------------------+--------------------+
|i came here for b...|                    |
|we recently had a...|                    |
|i got a small ste...|                    |
| the only thing i...|                    |
|i have been comin...|if you get gravy ...|
|you really ca not...|                    |
|cutest cafe in qu...|                    |
|probably one if m...|                    |
|i just went to to...|                    |
|my mom and i had ...|                    |
|i was impressed w...|                    |
|service is good a...|have the wings th...|
|horrible service ...|                    |
|stayed here few w...|                    |
|the shop is cute ...|                    |
|this place makes ...|                    |
|came here and imm...|                    |
|wow i was in the ...|                    |
|i do honestly wan...|                    |
|this little basem...|          

- Contagem de amigos de cada usuário

In [38]:
base7 = base6.withColumn('friends_counter', f.size(f.split(f.col('friends'),',')))

In [39]:
base7.createOrReplaceTempView('base7')

In [40]:
base8 = spark.sql('''
            SELECT *,
            (CASE WHEN friends = 'None' THEN 0 ELSE friends_counter END) as friends_count
            FROM base7
            ''')

In [89]:
df = base8.select('friends','friends_counter','friends_count').limit(10).toPandas()

In [90]:
df.dtypes

friends            object
friends_counter     int32
friends_count       int32
dtype: object

In [91]:
df

Unnamed: 0,friends,friends_counter,friends_count
0,kUWW9YR-2xC9YUSavBro8w,1,1
1,kUWW9YR-2xC9YUSavBro8w,1,1
2,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
3,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
4,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
5,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
6,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
7,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
8,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
9,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27


In [70]:
base8.select('friends','friends_counter','friends_count').show()

+--------------------+---------------+-------------+
|             friends|friends_counter|friends_count|
+--------------------+---------------+-------------+
|8lfS8bZkamISfRfWi...|             87|           87|
|8lfS8bZkamISfRfWi...|             87|           87|
|ALxRHdxbefyWvZ-7n...|             27|           27|
|ALxRHdxbefyWvZ-7n...|             27|           27|
|-A0mo_vdbn_-_omov...|             57|           57|
|                None|              1|            0|
|hD3xxj6CyunXze3sC...|             73|           73|
|5e-12hnuO5qK8loS9...|            205|          205|
|2DNajYvAMz5iV6Dma...|              3|            3|
|wVvjv0lOGFXvpXdNf...|             11|           11|
|                None|              1|            0|
|LI86K5NbwF0KDgi0_...|              1|            1|
|Ufw2jb2JCJELgCkxG...|              2|            2|
|ZxKqSQC6ccxSjYIM-...|             37|           37|
|Pp7ksD7Ccb-dmxgyW...|             85|           85|
|ZdpK8Jw_2Xi9I0X9F...|              7|        

Concatenando review e tip

In [41]:
base9 = base8.withColumn('review_tip', f.concat(f.col('text_clean'),f.lit(' '), f.col('tip_clean')))

In [42]:
base9.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id',
 'address',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name_bz',
 'postal_code',
 'review_count',
 'stars_bz',
 'state',
 'stars_usr',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool_usr',
 'elite_usr',
 'fans',
 'friends',
 'funny_usr',
 'name_usr',
 'review_count_usr',
 'useful_usr',
 'yelping_since',
 'city_review_counter',
 'city_review_ratio',
 'class_rv',
 'class_bz',
 'class_usr',
 'compliment_count_tip',
 'tip',
 'text_clean',
 'tip_clean',
 'friends_counter',
 'friends_count',
 'review_tip']

In [43]:
base9.select('text_clean','tip_clean','review_tip','stars','compliment_count_tip','funny','cool').show()

+--------------------+--------------------+--------------------+-----+--------------------+-----+----+
|          text_clean|           tip_clean|          review_tip|stars|compliment_count_tip|funny|cool|
+--------------------+--------------------+--------------------+-----+--------------------+-----+----+
|i came here for b...|                    |i came here for b...|  3.0|                   0|    0|   2|
|we recently had a...|                    |we recently had a...|  2.0|                   0|    1|   0|
|i got a small ste...|                    |i got a small ste...|  5.0|                   0|    0|   0|
| the only thing i...|                    | the only thing i...|  4.0|                   0|    0|   1|
|i have been comin...|if you get gravy ...|i have been comin...|  5.0|                   0|    0|   1|
|you really ca not...|                    |you really ca not...|  4.0|                   0|    0|   0|
|cutest cafe in qu...|                    |cutest cafe in qu...|  5.0|   

In [44]:
base9.createOrReplaceTempView('base9')

In [45]:
spark.sql('''
        SELECT stars, count(tip_clean) as tip_counter
        FROM base9
        GROUP BY stars
        ORDER BY tip_counter DESC
        ''').show()

+-----+-----------+
|stars|tip_counter|
+-----+-----------+
|  4.0|     146358|
|  5.0|     131097|
|  3.0|      79401|
|  2.0|      42340|
|  1.0|      36164|
+-----+-----------+



In [64]:
base10 = base9.withColumn('id',f.row_number())

In [66]:
base10.select('id','review_id','stars').show()

Py4JJavaError: An error occurred while calling o270.showString.
: java.lang.UnsupportedOperationException: Cannot evaluate expression: row_number()
	at org.apache.spark.sql.catalyst.expressions.Unevaluable$class.doGenCode(Expression.scala:261)
	at org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate.doGenCode(interfaces.scala:348)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:108)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:105)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:105)
	at org.apache.spark.sql.catalyst.expressions.Cast.doGenCode(Cast.scala:660)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:108)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:105)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:105)
	at org.apache.spark.sql.catalyst.expressions.Cast.genCode(Cast.scala:655)
	at org.apache.spark.sql.catalyst.expressions.Alias.genCode(namedExpressions.scala:155)
	at org.apache.spark.sql.execution.ProjectExec$$anonfun$6.apply(basicPhysicalOperators.scala:62)
	at org.apache.spark.sql.execution.ProjectExec$$anonfun$6.apply(basicPhysicalOperators.scala:62)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.AbstractTraversable.map(Traversable.scala:104)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:62)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:403)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:47)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:37)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:544)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:598)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


- Remoção de colunas que não serão utilizadas na primeira modelagem

In [54]:
base_final = base10.drop('friends','friends_counter','name_usr','city','review_id', 'address','state', 'hours','text_clean','text','tip','tip_clean','elite_usr')

In [47]:
base_final.columns

['business_id',
 'cool',
 'date',
 'funny',
 'stars',
 'useful',
 'user_id',
 'categories',
 'is_open',
 'latitude',
 'longitude',
 'name_bz',
 'postal_code',
 'review_count',
 'stars_bz',
 'stars_usr',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool_usr',
 'fans',
 'funny_usr',
 'review_count_usr',
 'useful_usr',
 'yelping_since',
 'city_review_counter',
 'city_review_ratio',
 'class_rv',
 'class_bz',
 'class_usr',
 'compliment_count_tip',
 'friends_count',
 'review_tip']

- Gravar base final em csv

In [48]:
base_final.limit(50000).write \
    .format('csv') \
    .mode('overwrite') \
    .option('sep', ',') \
    .option('header', True) \
    .save('output/yelp.csv')

## Base para modelo de tópicos

In [67]:
words = base_final.select('review_id','user_id','business_id','categories','stars','review_tip')

In [68]:
words2 = words.withColumn('category', f.explode(f.split(f.col('categories'),', ')))

In [69]:
words3 = words2.drop('categories')

In [70]:
words3.show()

+--------------------+--------------------+--------------------+-----+--------------------+--------------------+
|           review_id|             user_id|         business_id|stars|          review_tip|            category|
+--------------------+--------------------+--------------------+-----+--------------------+--------------------+
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|  3.0|i came here for b...|        Comfort Food|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|  3.0|i came here for b...|          Tapas Bars|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|  3.0|i came here for b...|               Vegan|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|  3.0|i came here for b...|  Breakfast & Brunch|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|  3.0|i came here for b...|         Gluten-Free|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|  3.0|i came here for b...|     

In [None]:
#words4 = words3.withColumn('word', f.explode(f.split(f.col('review_tip'),' ')))

In [71]:
words3.write \
    .format('csv') \
    .mode('overwrite') \
    .option('sep', ',') \
    .option('header', True) \
    .save('output/yelp_words.csv')

# Matriz de distâncias

- Preparação para criação de matriz de distâncias baseada na nota de cada avaliação.

In [112]:
dist1 = base_final.select('user_id','categories','stars')

In [97]:
dist1.show()

+--------------------+--------------------+-----+
|             user_id|          categories|stars|
+--------------------+--------------------+-----+
|-4Anvj46CWf57KWI9...|Restaurants, Japa...|  3.0|
|-4Anvj46CWf57KWI9...|Lounges, Bars, Ni...|  4.0|
|-JBB4-ALR07J6Pbx4...| Restaurants, Korean|  4.0|
|-JBB4-ALR07J6Pbx4...|Bakeries, Dessert...|  1.0|
|-JBB4-ALR07J6Pbx4...|Food, Ethnic Food...|  4.0|
|-JBB4-ALR07J6Pbx4...|Food, Restaurants...|  2.0|
|-JBB4-ALR07J6Pbx4...|Chinese, Restaura...|  4.0|
|-JBB4-ALR07J6Pbx4...|Restaurants, Cana...|  4.0|
|-JBB4-ALR07J6Pbx4...|Lebanese, Middle ...|  3.0|
|-JBB4-ALR07J6Pbx4...|Vegetarian, Resta...|  3.0|
|-JBB4-ALR07J6Pbx4...|Restaurants, Tapa...|  4.0|
|-JBB4-ALR07J6Pbx4...|Parks, Local Serv...|  4.0|
|-JBB4-ALR07J6Pbx4...|Food, Ice Cream &...|  5.0|
|-JBB4-ALR07J6Pbx4...|Italian, Caterers...|  5.0|
|-JBB4-ALR07J6Pbx4...|Restaurants, Mexican|  4.0|
|-JBB4-ALR07J6Pbx4...|Bakeries, Food, R...|  3.0|
|-JBB4-ALR07J6Pbx4...|Japanese, Sushi B...|  4.0|


In [123]:
dist2 = dist1.withColumn('category', f.explode(f.split(f.col('categories'),', ')))

In [124]:
dist2.show()

+--------------------+--------------------+-----+--------------+
|             user_id|          categories|stars|      category|
+--------------------+--------------------+-----+--------------+
|-4Anvj46CWf57KWI9...|Restaurants, Japa...|  3.0|   Restaurants|
|-4Anvj46CWf57KWI9...|Restaurants, Japa...|  3.0|      Japanese|
|-4Anvj46CWf57KWI9...|Restaurants, Japa...|  3.0|    Sushi Bars|
|-4Anvj46CWf57KWI9...|Lounges, Bars, Ni...|  4.0|       Lounges|
|-4Anvj46CWf57KWI9...|Lounges, Bars, Ni...|  4.0|          Bars|
|-4Anvj46CWf57KWI9...|Lounges, Bars, Ni...|  4.0|     Nightlife|
|-JBB4-ALR07J6Pbx4...| Restaurants, Korean|  4.0|   Restaurants|
|-JBB4-ALR07J6Pbx4...| Restaurants, Korean|  4.0|        Korean|
|-JBB4-ALR07J6Pbx4...|Bakeries, Dessert...|  1.0|      Bakeries|
|-JBB4-ALR07J6Pbx4...|Bakeries, Dessert...|  1.0|      Desserts|
|-JBB4-ALR07J6Pbx4...|Bakeries, Dessert...|  1.0|          Food|
|-JBB4-ALR07J6Pbx4...|Food, Ethnic Food...|  4.0|          Food|
|-JBB4-ALR07J6Pbx4...|Foo

In [127]:
dist2.createOrReplaceTempView('dist')

- Quantidade de usuários e estabelecimentos

In [120]:
spark.sql('''
            SELECT Count(DISTINCT user_id)
            FROM dist
            ''').show()

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                  52705|
+-----------------------+



In [121]:
spark.sql('''
            SELECT Count(DISTINCT categories)
            FROM dist
            ''').show()

+--------------------------+
|count(DISTINCT categories)|
+--------------------------+
|                      4214|
+--------------------------+



In [128]:
spark.sql('''
            SELECT Count(DISTINCT category)
            FROM dist
            ''').show()

+------------------------+
|count(DISTINCT category)|
+------------------------+
|                     613|
+------------------------+



- Aumentando o limite máximo de coluna de acordo com o número de estabelecimentos

In [129]:
#spark.conf.set('spark.sql.pivotMaxValues', u'21000')

In [130]:
dist3 = dist2.groupBy("user_id").pivot("category").mean("stars")

In [132]:
dist4 = dist3.fillna(0)

In [133]:
dist4.show()

+--------------------+----------+-----------+-----------+--------------+-----------+-----------+-------------------+------+-------+--------+---------------+----------------+--------+--------------------+--------------+----------------------+---------------+---------------+--------+----------+----------+-------------------+---------+-------+-------+-------+---------+-----------+-------------+-----------+-----------+------------+-------------+--------------------+------------+----------+--------------+-------------------+---------------------+-----------+----------+------------+---------------------+------+--------+-----------+---------------------+--------+-------+-------------+----+---------------+------+-------+-------------+---------------+----+--------+------------+---------+----------+-------+----------------+--------+------------+-----------------------+---------+----------+-----+-------+---------------------+-------------+-------+----------+-----+----------+-------+------+-------

In [134]:
dist4.write \
    .format('csv') \
    .mode('overwrite') \
    .option('sep', ',') \
    .option('header', True) \
    .save('output/yelp_dist.csv')