# Atividade Integradora

## Criando Ambiente Spark

In [1]:
import findspark as fs
from pyspark.sql import SparkSession

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.ml.feature import StopWordsRemover
import pandas as pd
import seaborn as sns
sns.set(style="ticks", palette="pastel")
import os
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#MAC Local
spark_location='/Users/vivi/server/spark' # Set your own
java8_location= '/Library/Java/JavaVirtualMachines/jdk1.8.0_251.jdk/Contents/Home/' # Set your own
os.environ['JAVA_HOME'] = java8_location
fs.init(spark_home=spark_location)

In [4]:
datapath = 'data'
#datapath = '../data/yelp'

In [5]:
files = sorted(os.listdir(datapath))

In [6]:
files

['.DS_Store',
 'dataset list.rtf',
 'yelp_academic_dataset_business.json',
 'yelp_academic_dataset_checkin.json',
 'yelp_academic_dataset_review.json',
 'yelp_academic_dataset_tip.json',
 'yelp_academic_dataset_user.json']

In [7]:
#!head data/yelp_academic_dataset_review.json

In [8]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('Integradora Yelp') \
    .config("spark.ui.port", "4060") \
    .getOrCreate()

spark = SparkSession.builder \
            .master('local[8]') \
            .appName('Yelp Integradora') \
            .getOrCreate()

In [9]:
sc = spark.sparkContext

In [10]:
spark#.stop()

## Importando as Bases Origem - Raw

In [11]:
usr_raw = spark.read.json(datapath+'/yelp_academic_dataset_user.json')
#usr_raw = spark.read.json('../data/yelp/yelp_academic_dataset_user.json')

In [12]:
rv_raw = spark.read.json(datapath+'/yelp_academic_dataset_review.json')
#rv_raw = spark.read.json('../data/yelp/yelp_academic_dataset_review.json')

In [13]:
bz_raw = spark.read.json(datapath+'/yelp_academic_dataset_business.json')
#bz_raw = spark.read.json('../data/yelp/yelp_academic_dataset_business.json')

In [14]:
tp_raw = spark.read.json(datapath+'/yelp_academic_dataset_tip.json')
#tp_raw = spark.read.json('../data/yelp/yelp_academic_dataset_tip.json')

In [15]:
bz_raw.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [16]:
bz_raw.select('attributes').show()

+--------------------+
|          attributes|
+--------------------+
|[,,,,,,, True,, T...|
|[,,,,,,,,,,, True...|
|                null|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,,,,,,,,, True,,...|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,, 'none',,,,, F...|
|[,,,,,,, False,,,...|
|                null|
|[,,,,,,,,, True,,...|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,,,,,,,,, True, ...|
|[,,,,,,,,, True,,...|
|[,,,,,,, True,, T...|
|[,,,,,,,,, True,,...|
|[,,,,,,,,, True,,...|
|[False,,,,,,, Tru...|
+--------------------+
only showing top 20 rows



In [17]:
tp_raw.show()

+--------------------+----------------+-------------------+--------------------+--------------------+
|         business_id|compliment_count|               date|                text|             user_id|
+--------------------+----------------+-------------------+--------------------+--------------------+
|UYX5zL_Xj9WEc_Wp-...|               0|2013-11-26 18:20:08|Here for a quick mtg|hf27xTME3EiCp6NL6...|
|Ch3HkwQYv1YKw_FO0...|               0|2014-06-15 22:26:45|Cucumber strawber...|uEvusDwoSymbJJ0au...|
|rDoT-MgxGRiYqCmi0...|               0|2016-07-18 22:03:42|Very nice good se...|AY-laIws3S7YXNl_f...|
|OHXnDV01gLokiX1EL...|               0|2014-06-06 01:10:34|It's a small plac...|Ue_7yUlkEbX4AhnYd...|
|GMrwDXRlAZU2zj5nH...|               0|2011-04-08 18:12:01|8 sandwiches, $24...|LltbT_fUMqZ-ZJP-v...|
|ALwAlxItASeEs2vYA...|               0|2015-05-20 20:17:38|Great ramen! Not ...|HHNBqfbDR8b1iq-QG...|
|d_L-rfS1vT3JMzgCU...|               0|2014-09-01 01:23:48|Cochinita Pibil w...|r0

In [18]:
bz_raw.createOrReplaceTempView('bz')
rv_raw.createOrReplaceTempView('rv')
usr_raw.createOrReplaceTempView('usr')
tp_raw.createOrReplaceTempView('tp')

In [19]:
print(spark.catalog.listTables())

[Table(name='bz', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='rv', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='tp', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='usr', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [20]:
bz_raw.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [21]:
usr_raw.columns

['average_stars',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool',
 'elite',
 'fans',
 'friends',
 'funny',
 'name',
 'review_count',
 'useful',
 'user_id',
 'yelping_since']

In [22]:
rv_raw.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

In [23]:
tp_raw.columns

['business_id', 'compliment_count', 'date', 'text', 'user_id']

# Criando Base Única Principal

## Unificando as Bases para contrução dos Modelos - "Joins"

Juntando as informações de reviews, estabelecimentos da cidade escolhidas e usuários que frequentam esses estabelecimentos.

### Reviews + Business

In [24]:
base = spark.sql("""
       SELECT A.business_id,
        A.cool AS cool_rv,
        A.date AS date_rv,
        A.funny AS funny_rv,
        A.review_id,
        A.stars AS stars_rv,
        A.text AS text_rv,
        A.useful AS useful_rv,
        A.user_id,        
        
        B.address AS address_bz,
        B.categories AS categories_bz,
        B.city AS city_bz,
        B.hours AS hours_bz,
        B.is_open AS is_open_bz,
        B.latitude AS latitude_bz,
        B.longitude AS longitude_bz,
        B.name AS name_bz,
        B.postal_code AS postal_code_bz,
        B.review_count AS review_count_bz,
        B.stars AS stars_bz,
        B.state AS state_bz
 
        FROM rv as A 
        LEFT JOIN bz as B
        ON A.business_id = B.business_id
        
        WHERE B.city = 'Toronto'
        AND B.state = 'ON'
        AND B.review_count > 20
        AND (B.categories like '%Restaurant%' OR B.categories like '%Food%')
        """)

In [25]:
base.show(5)

+--------------------+-------+-------------------+--------+--------------------+--------+--------------------+---------+--------------------+-------------------+--------------------+-------+--------+----------+-----------+------------+--------------------+--------------+---------------+--------+--------+
|         business_id|cool_rv|            date_rv|funny_rv|           review_id|stars_rv|             text_rv|useful_rv|             user_id|         address_bz|       categories_bz|city_bz|hours_bz|is_open_bz|latitude_bz|longitude_bz|             name_bz|postal_code_bz|review_count_bz|stars_bz|state_bz|
+--------------------+-------+-------------------+--------+--------------------+--------+--------------------+---------+--------------------+-------------------+--------------------+-------+--------+----------+-----------+------------+--------------------+--------------+---------------+--------+--------+
|2OFWvbHVwvnva7GxP...|      0|2017-07-29 01:46:38|       0|7aA5gWum--OxUmk4z...|  

In [26]:
base.createOrReplaceTempView('base')

- Contagem da quantidade de linhas para garantir que a integridade do dataset ser mantém ao longo do processamento.

In [27]:
#linhas na base de reviews + business
spark.sql('''
            SELECT Count(*)
            FROM base
            ''').show()

+--------+
|count(1)|
+--------+
|  429003|
+--------+



### (Reviews + Business) + Users

In [28]:
base1 = spark.sql("""
        SELECT A.*,
        B.average_stars AS stars_usr,
        B.compliment_cool AS compliment_cool_usr,
        B.compliment_cute AS compliment_cute_usr,
        B.compliment_funny AS compliment_funny_usr,
        B.compliment_hot AS compliment_hot_usr,
        B.compliment_list AS compliment_list_usr,
        B.compliment_more AS compliment_more_usr,
        B.compliment_note AS compliment_note_usr,
        B.compliment_photos AS compliment_photos_usr,
        B.compliment_plain AS compliment_plain_usr,
        B.compliment_profile AS compliment_profile_usr,
        B.compliment_writer AS compliment_writer_usr,
        B.cool AS cool_usr,
        B.elite AS elite_usr,
        B.fans AS fans_usr,
        B.friends AS friends_usr,
        B.funny AS funny_usr,
        B.name AS name_usr,
        B.review_count AS review_count_usr,
        B.useful AS useful_usr,
        B.yelping_since AS yelping_since_usr

        FROM base as A LEFT JOIN usr as B
        
        ON A.user_id = B.user_id 
        
        """)

In [29]:
base1.createOrReplaceTempView('base1')

In [30]:
#linhas na base de reviews + business + users
spark.sql('''
            SELECT Count(*)
            FROM base1
            ''').show()

+--------+
|count(1)|
+--------+
|  429003|
+--------+



In [31]:
aux = spark.sql('''
            SELECT user_id, city_bz, yelping_since_usr,
            COUNT(review_id) AS city_review_counter_usr,
            review_count_usr
            
            FROM base1
            
            GROUP BY user_id, review_count_usr, city_bz, yelping_since_usr
            ORDER BY city_review_counter_usr DESC        
            
            ''')

In [32]:
aux.createOrReplaceTempView('aux')

In [33]:
aux.show()

+--------------------+-------+-------------------+-----------------------+----------------+
|             user_id|city_bz|  yelping_since_usr|city_review_counter_usr|review_count_usr|
+--------------------+-------+-------------------+-----------------------+----------------+
|CxDOIDnH8gp9KXzpB...|Toronto|2009-11-09 20:44:45|                   1068|            6633|
|Q9mA60HnY87C1TW5k...|Toronto|2010-08-29 01:34:42|                    604|            1376|
|O3pSxv1SyHpY4qi4Q...|Toronto|2010-07-04 02:52:05|                    538|            1353|
|gwIqbXEXijQNgdESV...|Toronto|2010-11-17 23:13:51|                    485|            1106|
|ic-tyi1jElL_umxZV...|Toronto|2014-02-19 22:37:38|                    462|            1893|
|0BBUmH7Krcax1RZgb...|Toronto|2010-03-18 14:40:42|                    441|            1182|
|TbhyP24zYZqZ2VJZg...|Toronto|2010-03-17 18:05:59|                    421|             862|
|iRQ_YKpCBdaCwvc2X...|Toronto|2014-03-29 17:21:25|                    419|      

Aparentemente os usuários fazem reviews em estabelecimentos não só em Toronto. Para incluir essa informação no modelo, será criada uma variável com a relação entre as quantidade de reviews do usuário na cidade pela quantidade total de reviews do usuário.

- Média de reviews por usuário na cidade e total

In [34]:
spark.sql('''
            SELECT AVG(city_review_counter_usr), 
             AVG(review_count_usr)
            
            FROM aux       
            
            ''').show()

+----------------------------+---------------------+
|avg(city_review_counter_usr)|avg(review_count_usr)|
+----------------------------+---------------------+
|           4.292004321988115|    36.68627568681593|
+----------------------------+---------------------+



- Remoção de usuários com apenas 1 review na cidade

In [35]:
base2 = spark.sql('''
            SELECT A.*,
            B.city_review_counter_usr,
            (B.city_review_counter_usr/B.review_count_usr) AS city_review_ratio_usr
            
            FROM base1 as A
            
            LEFT JOIN aux as B
            ON A.user_id = B.user_id
            
            WHERE B.city_review_counter_usr > 1
            
            ''')

In [36]:
base2.createOrReplaceTempView('base2')

In [37]:
#linhas na base de reviews + business + users
spark.sql('''
            SELECT Count(*)
            FROM base2
            ''').show()

+--------+
|count(1)|
+--------+
|  375687|
+--------+



- Classificação das avaliações em Boa (1 - maior do que 4) e Ruim ou inexistente (0 - menor do que 4).

In [38]:
base3 = spark.sql("""
        SELECT *,
        (CASE WHEN stars_rv >=4 THEN 1 ELSE 0 END) as class_rv,
        (CASE WHEN stars_bz >=4 THEN 1 ELSE 0 END) as class_bz,
        (CASE WHEN stars_usr >=4 THEN 1 ELSE 0 END) as class_usr
        
        FROM base2
        
        """)

In [39]:
base3.columns

['business_id',
 'cool_rv',
 'date_rv',
 'funny_rv',
 'review_id',
 'stars_rv',
 'text_rv',
 'useful_rv',
 'user_id',
 'address_bz',
 'categories_bz',
 'city_bz',
 'hours_bz',
 'is_open_bz',
 'latitude_bz',
 'longitude_bz',
 'name_bz',
 'postal_code_bz',
 'review_count_bz',
 'stars_bz',
 'state_bz',
 'stars_usr',
 'compliment_cool_usr',
 'compliment_cute_usr',
 'compliment_funny_usr',
 'compliment_hot_usr',
 'compliment_list_usr',
 'compliment_more_usr',
 'compliment_note_usr',
 'compliment_photos_usr',
 'compliment_plain_usr',
 'compliment_profile_usr',
 'compliment_writer_usr',
 'cool_usr',
 'elite_usr',
 'fans_usr',
 'friends_usr',
 'funny_usr',
 'name_usr',
 'review_count_usr',
 'useful_usr',
 'yelping_since_usr',
 'city_review_counter_usr',
 'city_review_ratio_usr',
 'class_rv',
 'class_bz',
 'class_usr']

In [40]:
base3.createOrReplaceTempView('base3')

In [41]:
spark.sql('''
            SELECT Count(*)
            FROM base3
            ''').show()

+--------+
|count(1)|
+--------+
|  375687|
+--------+



###   ((Reviews + Business) + Users ) + Tips

In [42]:
spark.sql('''
            SELECT business_id, user_id, 
            count(text) AS tips_counter,
            sum(compliment_count) as total_compliments
            
            FROM tp
            
            GROUP BY business_id, user_id
            ORDER BY total_compliments DESC
            ''').show()

+--------------------+--------------------+------------+-----------------+
|         business_id|             user_id|tips_counter|total_compliments|
+--------------------+--------------------+------------+-----------------+
|BQqwIYQuo2W94smjr...|mkbx55W8B8aPLgDqe...|          76|               65|
|OMRYQihVjqqzjoNoQ...|mkbx55W8B8aPLgDqe...|          73|               30|
|QsKhwKYB3YeWXqpIP...|mkbx55W8B8aPLgDqe...|          38|               30|
|QhXBIQWUmQxuVErdw...|mkbx55W8B8aPLgDqe...|          34|               30|
|55E0-qUHa7Kzqz8rO...|mkbx55W8B8aPLgDqe...|          45|               28|
|gWeYW0E5Tfmmj_9fu...|mkbx55W8B8aPLgDqe...|          14|               19|
|qpdMFF6Y31ZAtE1R1...|mkbx55W8B8aPLgDqe...|          16|               16|
|YILyHegzhy1vlc_LN...|mkbx55W8B8aPLgDqe...|          29|               16|
|0QzCeORfF8EY34UOD...|mkbx55W8B8aPLgDqe...|          42|               15|
|gwdQwe1JHLe-vPY9P...|wTfb2nfzPIyFcYQAr...|           1|               15|
|vMRWkz0wb3g4HjW_O...|tQP

In [43]:
base4 = spark.sql('''
            SELECT A.*,
            IFNULL(B.compliment_count, 0) AS compliment_count_tip,
            IFNULL(B.text,'') AS tip
            
            FROM base3 as A
            
            LEFT JOIN tp as B
            ON (A.user_id = B.user_id AND A.business_id = B.business_id)
            
            ''')

In [44]:
base4.select('business_id', 'user_id','tip','compliment_count_tip').show()

+--------------------+--------------------+--------------------+--------------------+
|         business_id|             user_id|                 tip|compliment_count_tip|
+--------------------+--------------------+--------------------+--------------------+
|BxCzy1WOVxOrr_G7V...|--Qh8yKWAvIP4V4K8...|                    |                   0|
|JgaQFKxW-Bnfc7r5E...|--Qh8yKWAvIP4V4K8...|                    |                   0|
|U_jPOTd-Z5KATbTYn...|-3PTUP443q6hQESLK...|                    |                   0|
|b6sUcJj_xbeYrKKOs...|-3PTUP443q6hQESLK...|                    |                   0|
|I_3P0ns6_3najPBxn...|-9RU4LuI_TfYgv9rB...|If you get gravy ...|                   0|
|ar3FKybOl9yazviTB...|-B4Cf2XLkPr9qMlLP...|                    |                   0|
|fe6YW5wLAeWQBATPf...|-EWgEX_dc1Xv83dVt...|                    |                   0|
|iGEvDk6hsizigmXhD...|-Fj7FUTxUy04imKQ8...|                    |                   0|
|nxbVyDfknWtycymHl...|-M-VBIeY_tzxtW6vC...|           

In [45]:
base4.select('text_rv','tip').show()

+--------------------+--------------------+
|             text_rv|                 tip|
+--------------------+--------------------+
|I came here for b...|                    |
|We recently had a...|                    |
|I got a small ste...|                    |
|- The only thing ...|                    |
|I have been comin...|If you get gravy ...|
|You really can't ...|                    |
|Cutest cafe in Qu...|                    |
|Probably one if m...|                    |
|My mom and I had ...|                    |
|I was impressed w...|                    |
|Service is good a...|Have the wings. T...|
|Horrible service....|                    |
|the shop is cute,...|                    |
|This place makes ...|                    |
|Came here and imm...|                    |
|Wow! I was in the...|                    |
|I do honestly wan...|                    |
|This little basem...|                    |
|Yummy is my fav g...|                    |
|I love the ambian...|          

In [46]:
base4.createOrReplaceTempView('base4')

In [47]:
spark.sql('''
            SELECT Count(*)
            FROM base4
            ''').show()

+--------+
|count(1)|
+--------+
|  380584|
+--------+



## Tratamento do Texto

In [48]:
def word_clean(sdf,col,new_col):
    rv1 = sdf.withColumn(new_col,f.regexp_replace(f.col(col), "'d", " would"))
    rv2 = rv1.withColumn(new_col,f.regexp_replace(f.col(new_col), "'ve", " have"))
    rv3 = rv2.withColumn(new_col,f.regexp_replace(f.col(new_col), "'s", " is"))
    rv4 = rv3.withColumn(new_col,f.regexp_replace(f.col(new_col), "'re", " are"))
    rv5 = rv4.withColumn(new_col,f.regexp_replace(f.col(new_col), "n't", " not"))
    rv6 = rv5.withColumn(new_col,f.regexp_replace(f.col(new_col), '\W+', " "))
    rv7 = rv6.withColumn(new_col,f.lower(f.col(new_col)))
    return rv7

In [49]:
base5 = word_clean(base4,'text_rv','text_clean')
base6 = word_clean(base5,'tip','tip_clean')

In [94]:
base6.select('text_clean','tip_clean').show()

+--------------------+--------------------+
|          text_clean|           tip_clean|
+--------------------+--------------------+
|i came here for b...|                    |
|we recently had a...|                    |
|i got a small ste...|                    |
| the only thing i...|                    |
|i have been comin...|if you get gravy ...|
|you really ca not...|                    |
|cutest cafe in qu...|                    |
|probably one if m...|                    |
|my mom and i had ...|                    |
|i was impressed w...|                    |
|service is good a...|have the wings th...|
|horrible service ...|                    |
|the shop is cute ...|                    |
|this place makes ...|                    |
|came here and imm...|                    |
|wow i was in the ...|                    |
|i do honestly wan...|                    |
|this little basem...|                    |
|yummy is my fav g...|                    |
|i love the ambian...|          

- Contagem de amigos de cada usuário

In [50]:
base7 = base6.withColumn('friends_counter_usr', f.size(f.split(f.col('friends_usr'),',')))

In [51]:
base7.createOrReplaceTempView('base7')

In [52]:
base8 = spark.sql('''
            SELECT *,
            (CASE WHEN friends_usr = 'None' THEN 0 ELSE friends_counter_usr END) as friends_count_usr
            FROM base7
            ''')

In [99]:
df = base8.select('friends_usr','friends_counter_usr','friends_count_usr').limit(10).toPandas()

In [100]:
df.dtypes

friends_usr            object
friends_counter_usr     int32
friends_count_usr       int32
dtype: object

In [53]:
df

Unnamed: 0,friends,friends_counter,friends_count
0,"8lfS8bZkamISfRfWim5CKw, yixPLGK98f9uH1_QTRbwvg...",87,87
1,"8lfS8bZkamISfRfWim5CKw, yixPLGK98f9uH1_QTRbwvg...",87,87
2,"ALxRHdxbefyWvZ-7nbmy7Q, efZ2H4mRruo4STvSCPPqYA...",27,27
3,"ALxRHdxbefyWvZ-7nbmy7Q, efZ2H4mRruo4STvSCPPqYA...",27,27
4,"-A0mo_vdbn_-_omovpr3HA, 4jh8x7Xv6aKAiZmBRlV8OA...",57,57
5,,1,0
6,"hD3xxj6CyunXze3sCAnDrw, QV8xDVk5AYZEpiOTI6_E0g...",73,73
7,"5e-12hnuO5qK8loS9D8o5Q, ahHKqP2-5xx2QlcJDAc3qQ...",205,205
8,"2DNajYvAMz5iV6Dma2K-MQ, avmumBw9vqdXLrBLfb1KZQ...",3,3
9,"wVvjv0lOGFXvpXdNfA2WHA, lGKDN8lSn6RhTkKzQ8wwig...",11,11


In [101]:
base8.select('friends_usr','friends_counter_usr','friends_count_usr').show()

+--------------------+-------------------+-----------------+
|         friends_usr|friends_counter_usr|friends_count_usr|
+--------------------+-------------------+-----------------+
|8lfS8bZkamISfRfWi...|                 87|               87|
|8lfS8bZkamISfRfWi...|                 87|               87|
|ALxRHdxbefyWvZ-7n...|                 27|               27|
|ALxRHdxbefyWvZ-7n...|                 27|               27|
|-A0mo_vdbn_-_omov...|                 57|               57|
|                None|                  1|                0|
|hD3xxj6CyunXze3sC...|                 73|               73|
|5e-12hnuO5qK8loS9...|                205|              205|
|wVvjv0lOGFXvpXdNf...|                 11|               11|
|                None|                  1|                0|
|LI86K5NbwF0KDgi0_...|                  1|                1|
|Ufw2jb2JCJELgCkxG...|                  2|                2|
|Pp7ksD7Ccb-dmxgyW...|                 85|               85|
|ZdpK8Jw_2Xi9I0X9F...|  

## Concatenando Comentários por Usuário - Review + Tips

In [103]:
base9 = base8.withColumn('rv_tip', f.concat(f.col('text_clean'),f.lit(' '), f.col('tip_clean')))

In [105]:
base9.select('text_clean','tip_clean','rv_tip','stars_rv','compliment_count_tip','funny_rv','cool_rv').show()

+--------------------+--------------------+--------------------+--------+--------------------+--------+-------+
|          text_clean|           tip_clean|              rv_tip|stars_rv|compliment_count_tip|funny_rv|cool_rv|
+--------------------+--------------------+--------------------+--------+--------------------+--------+-------+
|i came here for b...|                    |i came here for b...|     3.0|                   0|       0|      2|
|we recently had a...|                    |we recently had a...|     2.0|                   0|       1|      0|
|i got a small ste...|                    |i got a small ste...|     5.0|                   0|       0|      0|
| the only thing i...|                    | the only thing i...|     4.0|                   0|       0|      1|
|i have been comin...|if you get gravy ...|i have been comin...|     5.0|                   0|       0|      1|
|you really ca not...|                    |you really ca not...|     4.0|                   0|       0| 

In [107]:
base9.createOrReplaceTempView('base9')

In [108]:
spark.sql('''
        SELECT stars_rv, count(tip_clean) as tip_counter
        FROM base9
        GROUP BY stars_rv
        ORDER BY tip_counter DESC
        ''').show()

+--------+-----------+
|stars_rv|tip_counter|
+--------+-----------+
|     4.0|     131667|
|     5.0|     112126|
|     3.0|      71577|
|     2.0|      37246|
|     1.0|      27968|
+--------+-----------+



- Remoção de colunas que não serão utilizadas na primeira modelagem

In [None]:
base_final = base9.drop('friends_usr','friends_counter_usr','name_usr','city_bz', 'address_bz','state_bz', 'hours_bz','tip','tip_clean','elite_usr')#,'review_id')

In [None]:
base_final.columns

## Salva Base analítica em CSV

In [136]:
base_final.write \
    .format('csv') \
    .mode('overwrite') \
    .option('sep', ',') \
    .option('header', True) \
    .save('output/yelp.csv')

# Base para Modelo de Tópicos

Informações de Texto que serão tratadas em Modelos de Tópicos no R

In [114]:
words = base_final.select('review_id','user_id','business_id','categories_bz','stars_rv','rv_tip')

In [116]:
words2 = words.withColumn('category_bz', f.explode(f.split(f.col('categories_bz'),', ')))

In [117]:
words3 = words2.drop('categories_bz')

In [118]:
words3.show()

+--------------------+--------------------+--------------------+--------+--------------------+--------------------+
|           review_id|             user_id|         business_id|stars_rv|              rv_tip|         category_bz|
+--------------------+--------------------+--------------------+--------+--------------------+--------------------+
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|     3.0|i came here for b...|        Comfort Food|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|     3.0|i came here for b...|          Tapas Bars|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|     3.0|i came here for b...|               Vegan|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|     3.0|i came here for b...|  Breakfast & Brunch|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|     3.0|i came here for b...|         Gluten-Free|
|Xx0-_hoLgjsEanmKX...|--Qh8yKWAvIP4V4K8...|BxCzy1WOVxOrr_G7V...|     3.0

In [None]:
#words4 = words3.withColumn('word', f.explode(f.split(f.col('review_tip'),' ')))

## Salva Base Auxiliar para Modelo de Tópicos - "Reviews + Tips"

In [120]:
words3.write \
    .format('csv') \
    .mode('overwrite') \
    .option('sep', ',') \
    .option('header', True) \
    .save('output/yelp_words.csv')

# Matriz de distâncias

Estruturação de Dados para Clusterização Hierárquica

- Preparação para criação de matriz de distâncias baseada na nota de cada avaliação.

In [122]:
dist1 = base_final.select('review','user_id','business_id','categories_bz','stars_rv')

In [123]:
dist1.show()

+--------------------+--------------------+--------+
|             user_id|       categories_bz|stars_rv|
+--------------------+--------------------+--------+
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|
|--Qh8yKWAvIP4V4K8...|Bars, Pubs, Night...|     2.0|
|-3PTUP443q6hQESLK...|Tex-Mex, Mexican,...|     5.0|
|-3PTUP443q6hQESLK...|Bubble Tea, Juice...|     4.0|
|-9RU4LuI_TfYgv9rB...|Mediterranean, Pi...|     5.0|
|-B4Cf2XLkPr9qMlLP...|Japanese, Restaur...|     4.0|
|-EWgEX_dc1Xv83dVt...|Coffee & Tea, Caf...|     5.0|
|-Fj7FUTxUy04imKQ8...|Restaurants, Seaf...|     5.0|
|-M-VBIeY_tzxtW6vC...|Asian Fusion, Bre...|     1.0|
|-UAW240-h-P9-Zxat...| Restaurants, Korean|     2.0|
|-UHenwWwPB47tN28x...|American (Traditi...|     5.0|
|-VWzGBoycRE3_FOYt...|   Restaurants, Thai|     1.0|
|-d2daWmftYumOaYpb...|Ethiopian, Restau...|     3.0|
|-fB_TMhiaMF6zx7Wf...|Food, Bakeries, D...|     5.0|
|-ga7pQvnJcMB1_pIa...|Restaurants, Japa...|     1.0|
|-iRvRGqPEZWWOp2pk...|Burgers, Greek, R...|   

In [125]:
dist2 = dist1.withColumn('category_bz', f.explode(f.split(f.col('categories_bz'),', ')))

In [126]:
dist2.show()

+--------------------+--------------------+--------+--------------------+
|             user_id|       categories_bz|stars_rv|         category_bz|
+--------------------+--------------------+--------+--------------------+
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|        Comfort Food|
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|          Tapas Bars|
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|               Vegan|
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|  Breakfast & Brunch|
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|         Gluten-Free|
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|      Canadian (New)|
|--Qh8yKWAvIP4V4K8...|Comfort Food, Tap...|     3.0|         Restaurants|
|--Qh8yKWAvIP4V4K8...|Bars, Pubs, Night...|     2.0|                Bars|
|--Qh8yKWAvIP4V4K8...|Bars, Pubs, Night...|     2.0|                Pubs|
|--Qh8yKWAvIP4V4K8...|Bars, Pubs, Night...|     2.0|           Nightlife|
|--Qh8yKWAvIP4V4K8...|Bars, Pubs, Nigh

In [127]:
dist2.createOrReplaceTempView('dist')

- Quantidade de usuários e estabelecimentos

In [128]:
spark.sql('''
            SELECT Count(DISTINCT user_id)
            FROM dist
            ''').show()

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                  46638|
+-----------------------+



In [129]:
spark.sql('''
            SELECT Count(DISTINCT categories_bz)
            FROM dist
            ''').show()

+-----------------------------+
|count(DISTINCT categories_bz)|
+-----------------------------+
|                         3269|
+-----------------------------+



In [130]:
spark.sql('''
            SELECT Count(DISTINCT category_bz)
            FROM dist
            ''').show()

+---------------------------+
|count(DISTINCT category_bz)|
+---------------------------+
|                        328|
+---------------------------+



- Aumentando o limite máximo de coluna de acordo com o número de estabelecimentos

In [131]:
#spark.conf.set('spark.sql.pivotMaxValues', u'21000')

In [132]:
dist3 = dist2.groupBy("business_id").pivot("category_bz").mean("stars_rv")

In [133]:
dist4 = dist3.fillna(0)

In [134]:
dist4.show()

+--------------------+----------+-----------+-----------+-------------------+------+------------------+-----------------+----------------------+---------------+---------------+--------+-------+-------+---------+-----------+-------------+-----------+-------------+--------------------+------------------+----------+----------+------+--------+-----------+--------+------------------+------+-------------+----+--------+------------+---------+----------+-------+------------+-------+-----+----------+----------+---------+------------------+---------+--------+-------+----------+-------+-------+-------+-------+-----+---------+------------+---------+------------------+------------+----------------+---------+------------------+---------+-------+--------+--------------+------------+------------+------------+-------------+-------------------+------------------+--------------------+--------+------+-------------+------------------+---------------------+-----------------+---------+------------+----------

## Salva Base Auxiliar para Matriz de Distâncias - "Category"

In [135]:
dist4.write \
    .format('csv') \
    .mode('overwrite') \
    .option('sep', ',') \
    .option('header', True) \
    .save('output/yelp_dist.csv')

# Análise Gráfica

## Heatmap

- Criando mapa de calor da concentração de reviews

In [73]:
base_mapas = base_final#.limit(1000)

In [74]:
base_mapas.createOrReplaceTempView('base_mapas')

In [75]:
mapa1 = spark.sql("""
        SELECT latitude,
        longitude
        
        FROM base_mapas
        
        WHERE latitude is not null
        AND longitude is not null
        
        """)

In [76]:
mapa1.show(10)

+-------------+--------------+
|     latitude|     longitude|
+-------------+--------------+
|   43.6697687|    -79.382838|
|43.6386597113|   -79.3806966|
|43.6630940441|-79.3840069721|
|    43.656838|    -79.399237|
|43.6599496025| -79.479805281|
|   43.6547562|   -79.3874925|
|   43.6376269|    -79.393259|
|43.6543411559|-79.4004796073|
|   43.6721532|   -79.2903522|
|43.6729833023|-79.2866801843|
+-------------+--------------+
only showing top 10 rows



Decobrindo o ponto central de Latitude e Longetude do Mapa

In [77]:
spark.sql("""
        SELECT avg(latitude) as avg_lat,
        avg(longitude) as avg_long
        
        FROM base_mapas
        
        """).show()

+------------------+------------------+
|           avg_lat|          avg_long|
+------------------+------------------+
|43.671049211472905|-79.39089935548205|
+------------------+------------------+



In [124]:
import folium
from folium import plugins

mapa = folium.Map(location=[43.6732, -79.3919], 
                  zoom_start=11,
                  tiles='Stamen Toner')
# OpenStreetMap, Stamen Terrain, Stamen Toner

In [125]:
mapa

lat = mapa1.toPandas()['latitude'].values
lon = mapa1.toPandas()['longitude'].values

In [126]:
coordenadas = []
for la, lo in zip(lat, lon):
    coordenadas.append([la,lo])
mapa.add_child(plugins.HeatMap(coordenadas))

In [127]:
lat_lon3 = spark.sql("""
            SELECT 'ON' as state,
            (SUM(review_count) / (select SUM(review_count) from base_mapas))*100 as review_perc
            
            FROM base_mapas
        
            WHERE latitude is not null
            AND longitude is not null
        
            GROUP BY state
            
            """)

## Geo-Json do Canada - https://geojson-maps.ash.ms/

#url = 'https://raw.githubusercontent.com/AshKyd/geojson-regions/master/countries/110m/'
# state_geo = f'{url}/CAN.geojson'

url = 'https://raw.githubusercontent.com/jasonicarter/toronto-geojson/master/'
state_geo = f'{url}/toronto_crs84.geojson'

df = lat_lon3.toPandas()

m = folium.Map(location=[43, -79], zoom_start=10)
bins = list(df['review_perc'].quantile([0, 0.25, 0.5, 0.75, 1]))

folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=df,
    columns=['state', 'review_perc'],
    key_on='feature.properties.name',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.2,
    bins=bins,
    legend_name='Reviews (%)',
    reset=True
).add_to(m)

m