# Exploratory Data Analysis and Cleaning

### 1. Importing Libraries and Data

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Load the scraped restaurant records; the first CSV column becomes the row index.
# NOTE(review): the indented echo of this statement in the cell output looks like the
# tail of a pandas warning (possibly a mixed-type DtypeWarning) — confirm, and consider
# passing explicit dtypes if so.
# NOTE(review): later outputs show index labels ending at 191 for 2415 rows, so the
# index read from column 0 appears to repeat — verify before relying on label lookups.
data = pd.read_csv('all_thefork_scrapes.csv', index_col=0)

  data = pd.read_csv('all_thefork_scrapes.csv', index_col=0)


### 2. Summary Statistics

In [11]:
# Summary statistics, transposed so that each described column is shown as one row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
averagePrice,2415.0,4.014120e+01,4.291000e+01,9.000000e+00,1.700000e+01,2.500000e+01,4.500000e+01,3.500000e+02
latitude,2415.0,4.401005e+01,7.028186e+00,3.701935e+01,3.867652e+01,4.020331e+01,5.340000e+01,5.340000e+01
longitude,2415.0,-6.674209e+00,2.753072e+00,-9.381659e+00,-9.135404e+00,-8.247880e+00,-2.983333e+00,-2.983333e+00
maxPartySize,2112.0,5.026420e+01,2.777294e+01,4.000000e+00,3.000000e+01,6.000000e+01,6.000000e+01,3.700000e+02
phone,1464.0,2.917438e+11,1.372161e+11,3.314014e+10,3.512116e+11,3.512789e+11,3.519341e+11,4.478807e+11
...,...,...,...,...,...,...,...,...
reviewList/18/serviceRatingValue,1493.0,9.407904e+00,1.372994e+00,2.000000e+00,1.000000e+01,1.000000e+01,1.000000e+01,1.000000e+01
reviewList/19/ambienceRatingValue,1473.0,9.156823e+00,1.447631e+00,2.000000e+00,8.000000e+00,1.000000e+01,1.000000e+01,1.000000e+01
reviewList/19/foodRatingValue,1473.0,9.412084e+00,1.365049e+00,2.000000e+00,1.000000e+01,1.000000e+01,1.000000e+01,1.000000e+01
reviewList/19/ratingValue,1473.0,9.355737e+00,1.174946e+00,2.000000e+00,9.000000e+00,1.000000e+01,1.000000e+01,1.000000e+01


In [12]:
# Summary (count / unique / top / freq) of the object-dtype columns, one row per column.
data.describe(include='object').T

Unnamed: 0,count,unique,top,freq
address,2415,1507,"Rua da Esperança - 42,1200-658,Lisboa",12
chefName,1197,577,Sanam Pokhrel,12
cuisine,2405,56,Portuguese,488
currency,2415,5,EUR,2164
customerPhotos/0,2298,1441,https://res.cloudinary.com/tf-lab/image/upload...,12
...,...,...,...,...
photos/101,1,1,https://res.cloudinary.com/tf-lab/image/upload...,1
photos/102,1,1,https://res.cloudinary.com/tf-lab/image/upload...,1
photos/103,1,1,https://res.cloudinary.com/tf-lab/image/upload...,1
photos/104,1,1,https://res.cloudinary.com/tf-lab/image/upload...,1


### 3. Handling Locations

In [16]:
# Independent copy of the location-related columns, so edits here never touch `data`.
locations = data.loc[
    :, ['name', 'address', 'location', 'longitude', 'latitude']
].copy()

In [17]:
# Derive comparable city names from the two free-text columns.
# `address` looks like "street,postcode,city", so the city is the text after the last
# comma; `location` looks like "city, country", so the city is the text before the
# first comma.
# The vectorized `.str` accessor keeps missing values as NaN (rather than the literal
# string 'nan' that `str(x)` produces), and stripping whitespace prevents a stray space
# after a comma from being reported as a mismatch.
locations['location2'] = locations['address'].str.rsplit(',', n=1).str[-1].str.strip()
locations['location3'] = locations['location'].str.split(',', n=1).str[0].str.strip()

In [27]:
# Rows whose city parsed from `address` differs from the city parsed from `location`.
locations.loc[locations['location2'].ne(locations['location3'])]

Unnamed: 0,name,address,location,longitude,latitude,location2,location3
20,Passionné,"17 Rue Bergère,75009,Paris",,-2.983333,53.400002,Paris,
22,To Restaurant,"34 Rue Beaurepaire,75010,Paris",,-2.983333,53.400002,Paris,
24,Le Puits du Trésor,"21 Route des 4 Châteaux,11600,Lastours",,-2.983333,53.400002,Lastours,
25,Lienzo*,"Plaça de Tetuan, 18,46003,Valencia",,-2.983333,53.400002,Valencia,
28,Seven Park Place by William Drabble,"7-8 Park Place\r,SW1A 1LS,London",,-2.983333,53.400002,London,
...,...,...,...,...,...,...,...
96,RoseMar,"Via Zara, 1,30016,Lido di Jesolo",,-2.983333,53.400002,Lido di Jesolo,
97,G Asian Canteen,"18-20 High Road,NW10 2QD,London",,-2.983333,53.400002,London,
99,La Bicicleta,"La Pl., 12,39716,Hoznayo, Cantabria",,-2.983333,53.400002,Cantabria,
100,Vinmakaren,"Regeringsgatan 111A,111 39,Stockholm",,-2.983333,53.400002,Stockholm,


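A few entries have inconsistent location values, and restaurants outside Portugal (the country the data was originally scraped for) have no `location` at all. In the rows shown above, these restaurants also share the same longitude/latitude pair, which suggests those coordinates are a placeholder rather than real positions.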

### 4. Handling Missing Values

In [38]:
# Names of every column that holds at least one missing value.
null_columns = data.columns[data.isna().any().to_numpy()]
null_columns.tolist()

['chefName',
 'cuisine',
 'customerPhotos/0',
 'customerPhotos/1',
 'customerPhotos/2',
 'customerPhotos/3',
 'customerPhotos/4',
 'customerPhotos/5',
 'customerPhotos/6',
 'customerPhotos/7',
 'customerPhotos/8',
 'customerPhotos/9',
 'customerPhotos/10',
 'customerPhotos/11',
 'customerPhotos/12',
 'customerPhotos/13',
 'customerPhotos/14',
 'customerPhotos/15',
 'customerPhotos/16',
 'customerPhotos/17',
 'customerPhotos/18',
 'customerPhotos/19',
 'description',
 'location',
 'maxPartySize',
 'offer',
 'openingHours',
 'paymentAccepted/0',
 'paymentAccepted/1',
 'paymentAccepted/2',
 'paymentAccepted/3',
 'paymentAccepted/4',
 'paymentAccepted/5',
 'paymentAccepted/6',
 'paymentAccepted/7',
 'paymentAccepted/8',
 'phone',
 'photos/1',
 'photos/2',
 'photos/3',
 'photos/4',
 'photos/5',
 'photos/6',
 'photos/7',
 'photos/8',
 'photos/9',
 'photos/10',
 'photos/11',
 'photos/12',
 'photos/13',
 'photos/14',
 'photos/15',
 'photos/16',
 'photos/17',
 'photos/18',
 'photos/19',
 'photo

In [44]:
# Restaurants without a listed offer get an explicit 'No Offer' label instead of NaN.
data['offer'] = data['offer'].fillna('No Offer')

In [45]:
# Restaurants without a named chef get an explicit 'Not Applicable' label instead of NaN.
data['chefName'] = data['chefName'].fillna('Not Applicable')

In [50]:
# Spread of the overall rating within each location (rows with a missing location are
# left out by groupby's default handling of NaN keys).
(
    data
    .groupby('location')['ratingValue']
    .agg(['min', 'max', 'mean', 'median'])
)

Unnamed: 0_level_0,min,max,mean,median
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Albufeira, Portugal",4.0,9.6,8.889744,9.0
"Almada, Portugal",6.0,9.7,8.864798,9.0
"Aveiro, Portugal",6.9,9.6,8.940625,9.0
"Braga, Portugal",6.5,9.8,9.085915,9.2
"Bragança, Portugal",9.2,9.6,9.4,9.4
"Caldas da Rainha, Portugal",8.1,9.7,8.77619,8.8
Castelo Branco,9.2,9.2,9.2,9.2
"Coimbra, Portugal",7.0,9.7,8.952174,9.0
"Faro, Portugal",7.5,10.0,8.989655,9.1
"Fundão, Portugal",4.7,9.8,8.758824,9.1


In [47]:
# Overall distribution of the rating, for comparison with the per-location figures.
data['ratingValue'].describe()

count    2365.000000
mean        9.129556
std         0.542439
min         4.000000
25%         8.900000
50%         9.200000
75%         9.500000
max        10.000000
Name: ratingValue, dtype: float64

### 5. Dealing with Schedule Format

In [53]:
def clean_openinghours(observation):
    """Parse a raw opening-hours string into a ``{day: hours}`` dictionary.

    Each non-empty line is split at its first "y": the text up to and
    including that "y" becomes the key (English day names such as "Monday"
    end in "y"), and the stripped remainder becomes the value.

    Parameters
    ----------
    observation : str or missing value
        Raw schedule text with one day per line. Both CRLF and LF line
        endings are accepted.

    Returns
    -------
    dict
        Mapping of day name to hours text. Empty when ``observation`` is not
        a string (e.g. NaN or None from a missing cell) or has no content.
    """
    # Missing cells arrive as NaN/None; stringifying them would invent a bogus
    # 'nany' key, so they are reported as "no schedule" instead.
    if not isinstance(observation, str):
        return {}

    opening_hours_dict = {}
    for line in observation.splitlines():
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no schedule entry.
            continue
        day_prefix, separator, hours = line.partition("y")
        if not separator:
            # No "y" to split on: keep the text as-is rather than appending a
            # made-up "y" to it.
            opening_hours_dict[line] = ""
            continue
        opening_hours_dict[f"{day_prefix}y"] = hours.strip()
    return opening_hours_dict


In [54]:
# Parse every raw opening-hours entry into its per-day dictionary.
data['schedule'] = data['openingHours'].apply(clean_openinghours)

In [57]:
# Peek at one of the free-text review columns; the output shows several languages
# and missing entries.
data['reviewList/10/review']

0                                                    NaN
1         Staff atencioso e simpático.\nComida saborosa.
2      I went to Odessa with my family because I love...
3      Food was excellent, the sauces were amazing. B...
4                                                    NaN
                             ...                        
187    Platos de bacalao muy ricos. Los postres estab...
188                                                  NaN
189    Faltou música. A presença do Felipe foi excele...
190    A quiet gem in the heart of Porto! Delicious a...
191                                            Muito bom
Name: reviewList/10/review, Length: 2415, dtype: object