# Combining the data for EDA
## 1. Importing and merging

### 1.1 Getting the json files and converting them to dataframes for a quick overview

In [1]:
import pandas as pd

In [2]:
# Load the Foursquare venues JSON into a DataFrame and preview the first rows
venues_data = pd.read_json(path_or_buf="../data/Foursquare_venues_data.json")
venues_data.head(n=5)

Unnamed: 0,city,Accessories Store,Adult Boutique,Airport,Antique Shop,Aquarium,Arcade,Art Gallery,Arts & Crafts Store,Athletics & Sports,...,Watch Shop,Water Park,Waterfall,Waterfront,Wine Shop,Winery,Wings Joint,Yoga Studio,Zoo,Zoo Exhibit
0,Amsterdam,,,,,,1.0,,,,...,,,,,,,,2.0,,
1,Andorra la Vella,,,,,,,,,,...,,,,,,,,,,
2,Athens,,,,,,,1.0,,,...,,,,,,,,,,
3,Belgrade,,,,,,,,,,...,,,,,,,,,,
4,Berlin,,,,,,,2.0,1.0,,...,,,,,,,,,,


In [3]:
# Load the seasonal weather JSON (column-oriented) and preview the first rows
weather_data = pd.read_json(
    path_or_buf="../data/Seasonal_weather_data.json",
    orient="columns",
)
weather_data.head(n=5)

Unnamed: 0,city,autumn_prec_mm,autumn_high,autumn_low,autumn_prec_days,autumn_sun_hrs,spring_prec_mm,spring_high,spring_low,spring_prec_days,...,summer_prec_mm,summer_high,summer_low,summer_prec_days,summer_sun_hrs,winter_prec_mm,winter_high,winter_low,winter_prec_days,winter_sun_hrs
0,Amsterdam,85,14,7,17.0,98.0,49,13,5,15.0,...,64,21,12,13.0,203.0,60,6,1,16.0,59.0
1,Athens,37,23,16,8.0,212.0,27,20,12,9.0,...,6,31,22,2.0,347.0,51,14,8,13.0,131.0
2,Belgrade,48,18,8,7.0,153.0,60,18,8,9.0,...,69,26,16,8.0,266.0,50,5,-1,8.0,74.0
3,Berlin,44,13,6,9.0,109.0,45,13,4,9.0,...,63,23,12,10.0,219.0,45,3,-2,10.0,55.0
4,Bern,79,13,5,9.0,119.0,87,13,3,12.0,...,113,23,11,11.0,213.0,64,3,-3,10.0,64.0


In [4]:
# Load the socio-economic JSON into a DataFrame and preview the first rows
soc_econ_data = pd.read_json(path_or_buf="../data/Socio_economic_data.json")
soc_econ_data.head(n=5)

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life,climate_level,cost_of_living_level,health_care_level,pollution_level,property_income_ratio_level,purchasing_power_level,safety_level,traffic_time_level,quality_of_life_level
0,Amsterdam,87.45,84.18,69.45,30.79,10.98,81.63,67.32,29.88,168.38,Very High,Moderate,High,Low,Moderate,Moderate,High,Low,Very High
1,Andorra-La-Vella,,66.69,69.44,64.08,7.44,82.46,87.16,5.0,,,Moderate,High,High,Low,Moderate,Very High,Very Low,
2,Athens,95.18,59.28,56.17,57.3,12.75,40.69,50.49,37.98,119.84,Very High,Low,Moderate,Moderate,High,Very Low,Moderate,Moderate,High
3,Belgrade,84.14,40.49,53.69,63.57,22.22,34.87,62.02,35.89,107.89,Very High,Very Low,Moderate,High,Very High,Very Low,High,Moderate,Moderate
4,Berlin,83.35,67.41,69.68,39.45,9.63,98.54,58.92,34.06,164.83,Very High,Moderate,High,Low,Moderate,High,Moderate,Low,Very High


### 1.2 Checking dtypes to make sure all columns are correct

In [5]:
# Show the dtype of every column in the venues frame
venues_data.dtypes

city                  object
Accessories Store    float64
Adult Boutique       float64
Airport              float64
Antique Shop         float64
                      ...   
Winery               float64
Wings Joint          float64
Yoga Studio          float64
Zoo                  float64
Zoo Exhibit          float64
Length: 267, dtype: object

In [6]:
# Show the dtype of every column in the weather frame
weather_data.dtypes

city                 object
autumn_prec_mm        int64
autumn_high           int64
autumn_low            int64
autumn_prec_days    float64
autumn_sun_hrs      float64
spring_prec_mm        int64
spring_high           int64
spring_low            int64
spring_prec_days    float64
spring_sun_hrs      float64
summer_prec_mm        int64
summer_high           int64
summer_low            int64
summer_prec_days    float64
summer_sun_hrs      float64
winter_prec_mm        int64
winter_high           int64
winter_low            int64
winter_prec_days    float64
winter_sun_hrs      float64
dtype: object

In [7]:
# Show the dtype of every column in the socio-economic frame
soc_econ_data.dtypes

city                            object
climate                        float64
cost_of_living                 float64
health_care                    float64
pollution                      float64
property_income_ratio          float64
purchasing_power               float64
safety                         float64
traffic_time                   float64
quality_of_life                float64
climate_level                   object
cost_of_living_level            object
health_care_level               object
pollution_level                 object
property_income_ratio_level     object
purchasing_power_level          object
safety_level                    object
traffic_time_level              object
quality_of_life_level           object
dtype: object

## 2. Putting it all together
### 2.1 Using a merge on "city" with inner join, because right now we do not mind losing a few cities for our testing data.

In [8]:
# Inner-join the three frames on the shared "city" key. Naming the key and the
# join type explicitly keeps the merge from silently switching to a different
# key set if the frames ever gain another column name in common.
# NOTE(review): an inner join matches city names exactly, so spelling variants
# (e.g. "Andorra la Vella" in the venues data vs "Andorra-La-Vella" in the
# socio-economic data) are dropped -- normalise the names first if those
# cities matter.
all_data = (
    venues_data
    .merge(weather_data, on="city", how="inner")
    .merge(soc_econ_data, on="city", how="inner")
)

In [9]:
# Preview the first rows of the merged frame
all_data.head(n=5)

Unnamed: 0,city,Accessories Store,Adult Boutique,Airport,Antique Shop,Aquarium,Arcade,Art Gallery,Arts & Crafts Store,Athletics & Sports,...,quality_of_life,climate_level,cost_of_living_level,health_care_level,pollution_level,property_income_ratio_level,purchasing_power_level,safety_level,traffic_time_level,quality_of_life_level
0,Amsterdam,,,,,,1.0,,,,...,168.38,Very High,Moderate,High,Low,Moderate,Moderate,High,Low,Very High
1,Athens,,,,,,,1.0,,,...,119.84,Very High,Low,Moderate,Moderate,High,Very Low,Moderate,Moderate,High
2,Belgrade,,,,,,,,,,...,107.89,Very High,Very Low,Moderate,High,Very High,Very Low,High,Moderate,Moderate
3,Berlin,,,,,,,2.0,1.0,,...,164.83,Very High,Moderate,High,Low,Moderate,High,Moderate,Low,Very High
4,Bratislava,,,,,,,1.0,,,...,147.54,Very High,Low,Moderate,Moderate,High,Low,High,Low,High


In [10]:
# List the merged column labels; a bare last expression is displayed by the
# notebook, so no print() is needed
all_data.columns

Index(['city', 'Accessories Store', 'Adult Boutique', 'Airport',
       'Antique Shop', 'Aquarium', 'Arcade', 'Art Gallery',
       'Arts & Crafts Store', 'Athletics & Sports',
       ...
       'quality_of_life', 'climate_level', 'cost_of_living_level',
       'health_care_level', 'pollution_level', 'property_income_ratio_level',
       'purchasing_power_level', 'safety_level', 'traffic_time_level',
       'quality_of_life_level'],
      dtype='object', length=305)


### 2.2 Changing Venues data back to 0's instead of NaN

In [11]:
# venues_data has 267 columns: "city" plus 266 venue-category columns that
# need zeroes instead of NaNs
venues_data.shape

(46, 267)

In [12]:
# Zero-fill only the venue-category columns: a NaN there is treated as a count
# of zero. The "city" label column is excluded so a missing city name can
# never be replaced by the number 0.
venue_cols = venues_data.columns.drop("city")

all_data[venue_cols] = all_data[venue_cols].fillna(0)

In [13]:
# Preview the merged frame again after the NaN -> 0 replacement
all_data.head(n=5)

Unnamed: 0,city,Accessories Store,Adult Boutique,Airport,Antique Shop,Aquarium,Arcade,Art Gallery,Arts & Crafts Store,Athletics & Sports,...,quality_of_life,climate_level,cost_of_living_level,health_care_level,pollution_level,property_income_ratio_level,purchasing_power_level,safety_level,traffic_time_level,quality_of_life_level
0,Amsterdam,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,168.38,Very High,Moderate,High,Low,Moderate,Moderate,High,Low,Very High
1,Athens,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,119.84,Very High,Low,Moderate,Moderate,High,Very Low,Moderate,Moderate,High
2,Belgrade,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,107.89,Very High,Very Low,Moderate,High,Very High,Very Low,High,Moderate,Moderate
3,Berlin,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,...,164.83,Very High,Moderate,High,Low,Moderate,High,Moderate,Low,Very High
4,Bratislava,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,147.54,Very High,Low,Moderate,Moderate,High,Low,High,Low,High


### 2.3 Exporting as json

In [14]:
# (rows, columns) of the merged frame
all_data.shape

(35, 305)

In [15]:
# Show the dtype of every column in the merged frame
all_data.dtypes

city                            object
Accessories Store              float64
Adult Boutique                 float64
Airport                        float64
Antique Shop                   float64
                                ...   
property_income_ratio_level     object
purchasing_power_level          object
safety_level                    object
traffic_time_level              object
quality_of_life_level           object
Length: 305, dtype: object

In [16]:
# The export is currently disabled; uncomment the line below to write the
# merged frame to ../data/Combined_data.json
# all_data.to_json("../data/Combined_data.json")