# Atividade Integradora

In [1]:
import findspark as fs

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.ml.feature import StopWordsRemover
import pandas as pd
import seaborn as sns
sns.set(style="ticks", palette="pastel")
import os
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Machine-specific install locations. An already exported SPARK_HOME / JAVA_HOME
# takes precedence; otherwise the literals below are used -- edit them to match
# your machine.
# NOTE(review): the fallback points at a JDK 8 install; an exported JAVA_HOME is
# used as-is, whatever its version -- confirm it is compatible with your Spark.
spark_location = os.environ.get('SPARK_HOME', '/Users/vivi/server/spark')
java8_location = os.environ.get(
    'JAVA_HOME',
    '/Library/Java/JavaVirtualMachines/jdk1.8.0_251.jdk/Contents/Home/',
)
os.environ['JAVA_HOME'] = java8_location
# Make the pyspark package of that Spark install importable from this kernel.
fs.init(spark_home=spark_location)

In [4]:
# Directory, relative to the working directory, that holds the Yelp dataset files.
datapath = 'data'

In [5]:
# Sorted names of every entry found in the data directory.
files = sorted(os.listdir(datapath))

In [6]:
# Show the directory listing.
files

['.DS_Store',
 'dataset list.rtf',
 'yelp_academic_dataset_business.json',
 'yelp_academic_dataset_checkin.json',
 'yelp_academic_dataset_review.json',
 'yelp_academic_dataset_tip.json',
 'yelp_academic_dataset_user.json']

In [7]:
# Peek at the first lines of the raw review file.
!head data/yelp_academic_dataset_review.json

{"review_id":"xQY8N_XvtGbearJ5X4QryQ","user_id":"OwjRMXRC0KyPrIlcjaXeFQ","business_id":"-MhfebM0QIsKt87iDN-FNw","stars":2.0,"useful":5,"funny":0,"cool":0,"text":"As someone who has worked with many museums, I was eager to visit this gallery on my most recent trip to Las Vegas. When I saw they would be showing infamous eggs of the House of Faberge from the Virginia Museum of Fine Arts (VMFA), I knew I had to go!\n\nTucked away near the gelateria and the garden, the Gallery is pretty much hidden from view. It's what real estate agents would call \"cozy\" or \"charming\" - basically any euphemism for small.\n\nThat being said, you can still see wonderful art at a gallery of any size, so why the two *s you ask? Let me tell you:\n\n* pricing for this, while relatively inexpensive for a Las Vegas attraction, is completely over the top. For the space and the amount of art you can fit in there, it is a bit much.\n* it's not kid friendly at all. Seriously, don't bring them.\n* the security is n

In [8]:
# Start (or reuse) a Spark session running in local mode on all available
# cores, with the Spark UI pinned to port 4060.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Integradora Yelp')
    .config("spark.ui.port", "4060")
    .getOrCreate()
)

In [9]:
# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [10]:
spark  # display the session summary

## Bases

In [11]:
# Raw Yelp user records; the schema is inferred from the JSON file.
usr_raw = spark.read.json(datapath+'/yelp_academic_dataset_user.json')

In [12]:
# Raw Yelp review records; the schema is inferred from the JSON file.
rv_raw = spark.read.json(datapath+'/yelp_academic_dataset_review.json')

In [13]:
# Raw Yelp business records; the schema is inferred from the JSON file.
bz_raw = spark.read.json(datapath+'/yelp_academic_dataset_business.json')

In [14]:
# Inspect the inferred schema of the business table.
bz_raw.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [15]:
# Register the three raw tables as temporary views so they can be queried
# with Spark SQL under the short names bz / rv / usr.
for view_name, frame in (('bz', bz_raw), ('rv', rv_raw), ('usr', usr_raw)):
    frame.createOrReplaceTempView(view_name)

In [16]:
# List the tables/views currently registered in the session catalog.
print(spark.catalog.listTables())

[Table(name='bz', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='rv', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='usr', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [17]:
# Column names of the business table.
bz_raw.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [18]:
# Column names of the user table.
usr_raw.columns

['average_stars',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool',
 'elite',
 'fans',
 'friends',
 'funny',
 'name',
 'review_count',
 'useful',
 'user_id',
 'yelping_since']

In [19]:
# Column names of the review table.
rv_raw.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

## Joins

Juntando as informações de reviews, dos estabelecimentos da cidade escolhida e dos usuários que frequentam esses estabelecimentos.

In [20]:
# Reviews joined to the business they refer to, restricted to businesses in
# Toronto, ON. Business columns whose names clash with review columns are
# renamed (name_bz, stars_bz).
# The WHERE clause filters on B's columns, which discards every review without
# a matching business, so the join is written as the INNER JOIN it behaves as
# (same result as the former LEFT JOIN + WHERE, without the misleading intent).
base = spark.sql("""
        SELECT A.*,
        B.address,
        B.categories,
        B.city,
        B.hours,
        B.is_open,
        B.latitude,
        B.longitude,
        B.name AS name_bz,
        B.postal_code,
        B.review_count,
        B.stars AS stars_bz,
        B.state

        FROM rv AS A
        INNER JOIN bz AS B
        ON A.business_id = B.business_id

        WHERE B.city = 'Toronto'
        AND B.state = 'ON'
        """)

In [25]:
# Preview the first five rows of the review/business join.
base.show(5)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+--------------------+----------+--------------------+--------------------+-------+--------------------+-------+----------+-----------+--------------------+-----------+------------+-----+-----+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|             address|attributes|         business_id|          categories|   city|               hours|is_open|  latitude|  longitude|                name|postal_code|review_count|stars|state|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+--------------------+----------+--------------------+--------------------+-------+--------------------+-------+----------+-----------+--------------------+-----------+------------+-----+-----+
|-6c_bJblLXUwoWfmY...|   0|2017-11

In [21]:
# Expose the review/business join to Spark SQL as the view 'base'.
base.createOrReplaceTempView('base')

- Contagem da quantidade de linhas para garantir que a integridade do dataset se mantém ao longo do processamento.

In [22]:
#linhas na base de reviews + business
spark.sql('''
            SELECT Count(*)
            FROM base
            ''').show()

+--------+
|count(1)|
+--------+
|  600555|
+--------+



In [23]:
# Attach each reviewer's profile to the joined reviews. The LEFT JOIN keeps
# every row of 'base' even when no user record matches. User columns whose
# names clash with review/business columns are renamed (stars_usr, cool_usr,
# elite_usr, funny_usr, name_usr, review_count_usr, useful_usr).
base1 = spark.sql("""
        SELECT A.*,
        B.average_stars AS stars_usr,
        B.compliment_cool,
        B.compliment_cute,
        B.compliment_funny,
        B.compliment_hot,
        B.compliment_list,
        B.compliment_more,
        B.compliment_note,
        B.compliment_photos,
        B.compliment_plain,
        B.compliment_profile,
        B.compliment_writer,
        B.cool AS cool_usr,
        B.elite AS elite_usr,
        B.fans,
        B.friends,
        B.funny AS funny_usr,
        B.name AS name_usr,
        B.review_count AS review_count_usr,
        B.useful AS useful_usr,
        B.yelping_since

        FROM base as A 
        LEFT JOIN usr as B
        ON A.user_id = B.user_id    
        """)

In [24]:
# Expose the review + business + user join to Spark SQL as the view 'base1'.
base1.createOrReplaceTempView('base1')

In [25]:
#linhas na base de reviews + business + users
spark.sql('''
            SELECT Count(*)
            FROM base1
            ''').show()

+--------+
|count(1)|
+--------+
|  600555|
+--------+



In [65]:
# Preview the fully joined table. This action has to execute both joins;
# the recorded run of this cell was interrupted (KeyboardInterrupt).
base1.show()

KeyboardInterrupt: 

- Classificação das avaliações em Boa (1 - nota maior ou igual a 4) e Ruim ou inexistente (0 - nota menor do que 4 ou ausente).

In [26]:
# Add three binary quality flags: 1 when the rating is at least 4, otherwise 0
# (a NULL rating does not satisfy the condition and therefore also yields 0).
#   class_rv  - from the review's own stars
#   class_bz  - from the business's stars (stars_bz)
#   class_usr - from the user's average stars (stars_usr)
base2 = spark.sql("""
        SELECT *,
        (CASE WHEN stars >=4 THEN 1 ELSE 0 END) as class_rv,
        (CASE WHEN stars_bz >=4 THEN 1 ELSE 0 END) as class_bz,
        (CASE WHEN stars_usr >=4 THEN 1 ELSE 0 END) as class_usr
        
        FROM base1 
        
        """)

In [27]:
# Column names after adding the class flags.
base2.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id',
 'address',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name_bz',
 'postal_code',
 'review_count',
 'stars_bz',
 'state',
 'stars_usr',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool_usr',
 'elite_usr',
 'fans',
 'friends',
 'funny_usr',
 'name_usr',
 'review_count_usr',
 'useful_usr',
 'yelping_since',
 'class_rv',
 'class_bz',
 'class_usr']

In [28]:
# Expose the flagged dataset to Spark SQL as the view 'base2'.
base2.createOrReplaceTempView('base2')

In [29]:
# Row count after adding the class flags
spark.sql('''
            SELECT Count(*)
            FROM base2
            ''').show()

+--------+
|count(1)|
+--------+
|  600555|
+--------+



In [30]:
# Preview the raw review text (first 20 rows, truncated for display).
base2.select('text').show()

+--------------------+
|                text|
+--------------------+
|My friend and I s...|
|Relaxed understat...|
|This hostel is a ...|
|Horrible customer...|
|Horrible service....|
|My favourite food...|
|Solid place! Love...|
|Ah.... I was told...|
|Very nice staff! ...|
|What a disappoint...|
|Sorry Fugo.... I ...|
|I have been going...|
|The portions are ...|
|3.5 stars. I enjo...|
|I want to love th...|
|Solid place to go...|
|Evergreen is a re...|
|YUM! Love these c...|
|This place has a ...|
|Good. The first t...|
+--------------------+
only showing top 20 rows



- Tratamento do texto das reviews

In [31]:
def word_clean(sdf):
    """Return ``sdf`` with an extra ``text_clean`` column derived from ``text``.

    The text is lower-cased, the contractions ``'d``, ``'ve``, ``'s`` and
    ``'re`` are expanded to `` would``, `` have``, `` is`` and `` are``, and
    every run of non-word characters is collapsed into a single space.

    Parameters
    ----------
    sdf : pyspark.sql.DataFrame
        Frame containing a string column named ``text``.

    Returns
    -------
    pyspark.sql.DataFrame
        A new frame with all original columns plus ``text_clean``.

    Notes
    -----
    * ``'s`` is always expanded to `` is``, so possessives ("chef's") are
      rewritten as well.
    * Each contraction pattern ends with ``\\b`` so that it only matches a
      complete suffix; without it an opening quote followed by a word
      (``'so good'``) would be rewritten to `` iso good'``.
    """
    # (regex, replacement) pairs, applied in this order.
    contractions = (
        (r"'d\b", " would"),
        (r"'ve\b", " have"),
        (r"'s\b", " is"),
        (r"'re\b", " are"),
    )

    # Lower-case first so upper-case contractions ("I'D", "IT'S") match too.
    cleaned = f.lower(f.col('text'))
    for pattern, replacement in contractions:
        cleaned = f.regexp_replace(cleaned, pattern, replacement)

    # Raw string: '\W' in a plain literal is an invalid Python escape sequence.
    cleaned = f.regexp_replace(cleaned, r'\W+', " ")
    return sdf.withColumn('text_clean', cleaned)

In [32]:
# Add the normalised review text (column text_clean) to the dataset.
base3 = word_clean(base2)

In [33]:
# Preview the cleaned review text.
base3.select('text_clean').show()

+--------------------+
|          text_clean|
+--------------------+
|my friend and i s...|
|relaxed understat...|
|this hostel is a ...|
|horrible customer...|
|horrible service ...|
|my favourite food...|
|solid place love ...|
|ah i was told the...|
|very nice staff g...|
|what a disappoint...|
|sorry fugo i was ...|
|i have been going...|
|the portions are ...|
|3 5 stars i enjoy...|
|i want to love th...|
|solid place to go...|
|evergreen is a re...|
|yum love these ch...|
|this place has a ...|
|good the first ti...|
+--------------------+
only showing top 20 rows



- Contagem de amigos de cada usuário

In [34]:
# Number of comma-separated entries in `friends`. The 'None' placeholder
# still counts as one entry here; that case is corrected when friends_count
# is derived from this column.
base4 = base3.withColumn('friends_counter', f.size(f.split(f.col('friends'),',')))

In [35]:
# Expose the dataset with the raw friend counter to Spark SQL as the view 'base4'.
base4.createOrReplaceTempView('base4')

In [36]:
# Final friend count per review row. `friends_counter` is only meaningful when
# `friends` holds a real list, so the "no friends" cases are mapped to 0:
#   * the literal placeholder 'None',
#   * NULL  (no user record matched in the LEFT JOIN; size(split(NULL)) is -1
#            under Spark's default settings, which is not a valid count),
#   * an empty / blank string (split yields one empty element, i.e. 1).
base5 = spark.sql('''
            SELECT *,
            (CASE
                WHEN friends IS NULL
                  OR trim(friends) = ''
                  OR friends = 'None' THEN 0
                ELSE friends_counter
             END) AS friends_count
            FROM base4
            ''')

In [37]:
# Pull ten rows of the friend columns into pandas to check the counts by eye.
df = base5.select('friends','friends_counter','friends_count').limit(10).toPandas()

In [38]:
# Data types of the sampled friend columns.
df.dtypes

friends            object
friends_counter     int32
friends_count       int32
dtype: object

In [39]:
# Show the sampled rows.
df

Unnamed: 0,friends,friends_counter,friends_count
0,kUWW9YR-2xC9YUSavBro8w,1,1
1,kUWW9YR-2xC9YUSavBro8w,1,1
2,,1,0
3,"nI1M9-fatJdgiSZ-v1H2YA, KA47Ih5vwcYjAVV2XEBwhg...",16,16
4,"714Mxso37tVsh33JRUGyUg, GJ3tKLqVEp4yUASpFegGBg...",15,15
5,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
6,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
7,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
8,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27
9,"CA1N8B5ZkSiUJeOXYcWImg, ngBDdwtpgtfFc9LNQT2Lzg...",27,27


- Remoção de colunas que não serão utilizadas na primeira modelagem

In [49]:
# Columns left out of the first modelling pass: free text, identifiers of
# single reviews, raw friend data and location fields.
unused_columns = [
    'friends',
    'friends_counter',
    'name_usr',
    'city',
    'address',
    'review_id',
    'state',
    'hours',
    'text_clean',
    'text',
    'elite_usr',
]
base6 = base5.drop(*unused_columns)

In [50]:
# Columns kept for modelling.
base6.columns

['business_id',
 'cool',
 'date',
 'funny',
 'stars',
 'useful',
 'user_id',
 'categories',
 'is_open',
 'latitude',
 'longitude',
 'name_bz',
 'postal_code',
 'review_count',
 'stars_bz',
 'stars_usr',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool_usr',
 'fans',
 'funny_usr',
 'review_count_usr',
 'useful_usr',
 'yelping_since',
 'class_rv',
 'class_bz',
 'class_usr',
 'friends_count']

In [51]:
# Export at most 50000 rows of the modelling table as comma-separated text
# with a header row, replacing any previous export at the same path.
(
    base6.limit(50000)
    .write
    .format('csv')
    .mode('overwrite')
    .option('sep', ',')
    .option('header', True)
    .save('output/yelp.csv')
)

# Matriz de distâncias

- Preparação para criação de matriz de distâncias baseada na nota de cada avaliação.

In [43]:
# Keep only the (user, business, review stars) triples used to build the matrix.
dist1 = base6.select('user_id','business_id','stars')

In [44]:
# Preview the rating triples.
dist1.show()

+--------------------+--------------------+-----+
|             user_id|         business_id|stars|
+--------------------+--------------------+-----+
|-4Anvj46CWf57KWI9...|478TIlfHXfT3wvww5...|  3.0|
|-4Anvj46CWf57KWI9...|vKA9sIqBcW0UlTKGh...|  4.0|
|-897i_JdWyDsXGUa8...|Qy_gJU57rAsoV__qu...|  3.0|
|-BUamlG3H-7yqpAl1...|MlKNIbEM-JL9WesSd...|  1.0|
|-CGdueQKCHM_KnHxO...|Ze4VPogvcD7inc3Qu...|  1.0|
|-JBB4-ALR07J6Pbx4...|ZmnCN-U7mhuZtlWHR...|  5.0|
|-JBB4-ALR07J6Pbx4...|lVVJMvqu4LXL5rBqj...|  4.0|
|-JBB4-ALR07J6Pbx4...|RXaxKQJtExnRZX1in...|  1.0|
|-JBB4-ALR07J6Pbx4...|87I8ba2FKYKeXyAhi...|  4.0|
|-JBB4-ALR07J6Pbx4...|jL7snnRKUYWNfq1NL...|  1.0|
|-JBB4-ALR07J6Pbx4...|EK38MXW_OsC5CZVvI...|  2.0|
|-JBB4-ALR07J6Pbx4...|SxjNgkzAlUG-wt2rS...|  4.0|
|-JBB4-ALR07J6Pbx4...|lOKgoQtMhnlf6hWvr...|  4.0|
|-JBB4-ALR07J6Pbx4...|FQr_scgz9a4sXTdRS...|  3.0|
|-JBB4-ALR07J6Pbx4...|AFF2vsQmRT4L22AvU...|  3.0|
|-JBB4-ALR07J6Pbx4...|L82O1ZFFQfjJxF0_P...|  4.0|
|-JBB4-ALR07J6Pbx4...|wcZiDpDRcFnH3hhGX...|  4.0|


- Quantidade de usuários e estabelecimentos

In [45]:
# Number of distinct users in the joined dataset (view 'base4')
spark.sql('''
            SELECT Count(DISTINCT user_id)
            FROM base4
            ''').show()

+-----------------------+
|count(DISTINCT user_id)|
+-----------------------+
|                 132328|
+-----------------------+



In [46]:
# Number of distinct businesses in the joined dataset (view 'base4')
spark.sql('''
            SELECT Count(DISTINCT business_id)
            FROM base4
            ''').show()

+---------------------------+
|count(DISTINCT business_id)|
+---------------------------+
|                      20364|
+---------------------------+



- Aumentando o limite máximo de colunas geradas pelo pivot de acordo com o número de estabelecimentos

In [55]:
# Raise the cap on distinct pivot values to 21000, above the 20364 distinct
# businesses counted in the dataset, so a pivot on business_id is allowed.
spark.conf.set('spark.sql.pivotMaxValues', u'21000')

In [47]:
# User x business matrix: one row per user, one column per business, each cell
# holding the mean of that user's stars for that business. Built from only
# 10000 review rows; limit() gives no guarantee about which rows are taken.
dist2 = dist1.limit(10000).groupBy("user_id").pivot("business_id").mean("stars")

In [48]:
# Export the user x business matrix as comma-separated text with a header
# row, replacing any previous export at the same path.
(
    dist2.write
    .format('csv')
    .mode('overwrite')
    .option('sep', ',')
    .option('header', True)
    .save('output/yelp_dist.csv')
)