In [3]:
%run ../common-imports.ipynb


# Tidy Data with Pandas

In [4]:

# Reading the csv files into a pandas data frame

temperature = pd.read_csv("../../datasets/temperature.csv")
humidity = pd.read_csv("../../datasets/humidity.csv")
wind_speed = pd.read_csv("../../datasets/wind_speed.csv")

temperature.head()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,309.1,,,
1,2012-10-01 13:00:00,284.63,282.08,289.48,281.8,291.87,291.53,293.41,296.6,285.12,...,285.63,288.22,285.83,287.17,307.59,305.47,310.58,304.4,304.4,303.5
2,2012-10-01 14:00:00,284.629041,282.083252,289.474993,281.797217,291.868186,291.533501,293.403141,296.608509,285.154558,...,285.663208,288.247676,285.83465,287.186092,307.59,304.31,310.495769,304.4,304.4,303.5
3,2012-10-01 15:00:00,284.626998,282.091866,289.460618,281.789833,291.862844,291.543355,293.392177,296.631487,285.233952,...,285.756824,288.32694,285.84779,287.231672,307.391513,304.281841,310.411538,304.4,304.4,303.5
4,2012-10-01 16:00:00,284.624955,282.100481,289.446243,281.782449,291.857503,291.553209,293.381213,296.654466,285.313345,...,285.85044,288.406203,285.860929,287.277251,307.1452,304.238015,310.327308,304.4,304.4,303.5


In [5]:
# Importing the libraries
import pandas as pd
import numpy as np# Displaying the first 5 rows of the data frame

temperature.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
datetime,45253.0,45253.0,2012-10-01 12:00:00,1.0,,,,,,,
Vancouver,44458.0,,,,283.862654,6.640131,245.15,279.16,283.45,288.600785,307.0
Portland,45252.0,,,,284.992929,7.452438,262.37,279.85,284.32,289.45175,312.52
San Francisco,44460.0,,,,288.155821,5.332862,272.3,284.67,287.61,291.015167,313.62
Seattle,45250.0,,,,284.409626,6.547986,263.78,279.83,283.94,288.53,307.3
Los Angeles,45250.0,,,,290.846116,6.460823,266.503667,286.38,290.53,295.08,315.47
San Diego,45252.0,,,,290.215044,5.889992,265.783333,286.25475,290.11875,294.107542,313.36
Las Vegas,45252.0,,,,292.424887,10.829522,260.561333,283.92,292.027486,300.835,318.64
Phoenix,45250.0,,,,295.493358,9.916743,266.059,287.68,295.586667,303.05,321.22
Albuquerque,45252.0,,,,285.617856,9.853484,255.042333,277.97,286.12,292.835643,312.71


# Data Manipulation

Let us unpivot, or melt: convert from wide format to long format, as tidy-data thinking recommends.

Tidy data essentially says:
    - Each row should be an observation
    - Each column should be a variable. Roughly, each column that is not an identifier or dimension should be a measure.
    - A dataframe should represent a logical unit of observables

In [6]:
tidy_temperature = pd.melt(temperature, 
                           id_vars="datetime", 
                           var_name="city", value_name="temperature")

In [7]:
tidy_temperature.describe(include='all')

Unnamed: 0,datetime,city,temperature
count,1629108,1629108,1621078.0
unique,45253,36,
top,2012-10-01 12:00:00,Vancouver,
freq,36,45253,
mean,,,288.5958
std,,,10.35149
min,,,242.3367
25%,,,281.884
50%,,,289.58
75%,,,296.25


In [8]:
tidy_temperature.head()

Unnamed: 0,datetime,city,temperature
0,2012-10-01 12:00:00,Vancouver,
1,2012-10-01 13:00:00,Vancouver,284.63
2,2012-10-01 14:00:00,Vancouver,284.629041
3,2012-10-01 15:00:00,Vancouver,284.626998
4,2012-10-01 16:00:00,Vancouver,284.624955


In [9]:
tidy_temperature.sample(20)

Unnamed: 0,datetime,city,temperature
235398,2013-10-17 01:00:00,San Diego,297.34
648783,2014-06-28 13:00:00,Minneapolis,297.76
437353,2016-03-07 16:00:00,Denver,281.436348
685210,2013-06-25 19:00:00,Saint Louis,305.4
140608,2013-04-21 13:00:00,Seattle,280.8
771417,2012-12-28 16:00:00,Nashville,274.28
90740,2012-10-11 06:00:00,San Francisco,284.56
547338,2013-03-29 18:00:00,Houston,295.02
958165,2013-08-24 16:00:00,Jacksonville,301.49
174092,2017-02-14 17:00:00,Seattle,277.153


In [10]:
tidy_humidity = pd.melt(humidity, 
                           id_vars="datetime", 
                           var_name="city", value_name="humidity")

In [11]:
tidy_windspeed = pd.melt(wind_speed, 
                           id_vars="datetime", 
                           var_name="city", value_name="wind_speed")

In [12]:
raw_weather = tidy_temperature\
        .join(tidy_humidity.set_index(['datetime', 'city']), on=['datetime', 'city'])\
        .join(tidy_windspeed.set_index(['datetime', 'city']), on=['datetime', 'city'])
raw_weather.sample(20)

Unnamed: 0,datetime,city,temperature,humidity,wind_speed
1312712,2012-10-17 03:00:00,Boston,290.59,88.0,5.0
629111,2017-05-29 10:00:00,Kansas City,287.13,82.0,2.0
720771,2017-07-16 12:00:00,Saint Louis,290.96,100.0,2.0
344101,2015-11-14 06:00:00,Phoenix,287.174662,28.0,1.0
259074,2016-06-29 13:00:00,San Diego,292.03,88.0,2.0
772054,2013-01-24 05:00:00,Nashville,270.73,36.0,2.0
126683,2016-11-16 21:00:00,San Francisco,289.84,44.0,9.0
146124,2013-12-07 09:00:00,Seattle,267.7,46.0,1.0
680174,2012-11-27 23:00:00,Saint Louis,277.91,41.0,7.0
1027232,2016-05-12 22:00:00,Charlotte,298.09,45.0,3.0


# Let's cleanup the data 
There are many strategies to deal with NaN data. Here, since it is weather, perhaps a reasonable way would be interpolating the temperature, humidity and wind_speed. In other words, the tempeture today is reasonable between that of yesterday and tomorrow, as a good approximation.



In [13]:
raw_weather.describe()


Unnamed: 0,temperature,humidity,wind_speed
count,1621078.0,1600457.0,1621115.0
mean,288.5958,68.13581,2.790038
std,10.35149,22.44515,2.092585
min,242.3367,5.0,0.0
25%,281.884,53.0,1.0
50%,289.58,72.0,2.0
75%,296.25,87.0,4.0
max,321.22,100.0,50.0


In [14]:
# The amount of missing values
raw_weather.isna().sum()

datetime           0
city               0
temperature     8030
humidity       28651
wind_speed      7993
dtype: int64

In [15]:
weather = raw_weather.interpolate()

In [16]:
weather.isna().sum()

datetime       0
city           0
temperature    1
humidity       1
wind_speed     1
dtype: int64

This is because we could not interpolate into the first row! Therefore, let us omit it.


In [17]:
weather = weather.dropna()
weather.isna().sum()

datetime       0
city           0
temperature    0
humidity       0
wind_speed     0
dtype: int64

In [18]:
# Filter down to only San Francisco weather
sf_weather = weather[weather['city'] == 'San Francisco']
sf_weather.sample(10)

Unnamed: 0,datetime,city,temperature,humidity,wind_speed
130306,2017-04-16 20:00:00,San Francisco,288.15,66.0,5.0
108949,2014-11-08 23:00:00,San Francisco,293.224,70.0,1.0
128666,2017-02-07 12:00:00,San Francisco,288.56,82.0,15.0
95248,2013-04-17 02:00:00,San Francisco,284.72,70.0,10.0
134092,2017-09-21 14:00:00,San Francisco,286.85,88.0,5.0
92351,2012-12-17 09:00:00,San Francisco,280.71,76.0,3.0
104843,2014-05-21 21:00:00,San Francisco,295.76,55.0,3.0
95827,2013-05-11 05:00:00,San Francisco,285.1,77.0,2.0
124597,2016-08-21 23:00:00,San Francisco,297.97,63.0,9.0
126071,2016-10-22 09:00:00,San Francisco,284.205,98.0,1.0


In [19]:
# Project down to only temperature and humidity
data = weather[['datetime', 'city','temperature', 'humidity']]
data.sample(10)

Unnamed: 0,datetime,city,temperature,humidity
336989,2015-01-21 22:00:00,Phoenix,292.561,44.0
1538154,2017-11-11 09:00:00,Haifa,301.555014,76.447103
207333,2015-10-03 05:00:00,Los Angeles,292.57,73.0
155665,2015-01-08 22:00:00,Seattle,279.962333,98.0
164550,2016-01-14 03:00:00,Seattle,279.212966,91.0
703171,2015-07-14 04:00:00,Saint Louis,302.38,74.0
805484,2016-11-17 03:00:00,Nashville,285.47,66.0
1290243,2015-05-24 11:00:00,Montreal,289.383,58.0
14322,2014-05-21 06:00:00,Vancouver,287.61,70.0
1437209,2016-09-02 10:00:00,Tel Aviv District,304.15,48.0


In [20]:
# The average weather for each city
means = weather.groupby('city')['temperature', 'humidity', 'wind_speed'].mean()

means.columns = ['mean_temperature', 'mean_humidity', 'mean_speed']
means.sample(10)

Unnamed: 0_level_0,mean_temperature,mean_humidity,mean_speed
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Nashville,288.566327,68.196462,2.42406
Portland,284.992865,74.670066,2.050693
Phoenix,295.492664,37.118467,1.909133
Vancouver,283.883179,81.87311,2.425263
Toronto,281.941258,76.281805,3.830741
Indianapolis,284.773522,72.371578,3.225344
Albuquerque,285.617978,44.912779,2.76396
Miami,298.0265,75.509348,3.232349
Pittsburgh,284.053577,70.320099,2.511347
San Francisco,288.222135,76.39571,2.755486


Note the two-levels of the columns above. Let us now flatten the data:


In [21]:
means = means.reset_index()
means.head()

Unnamed: 0,city,mean_temperature,mean_humidity,mean_speed
0,Albuquerque,285.617978,44.912779,2.76396
1,Atlanta,289.773667,70.728018,2.230482
2,Beersheba,291.594656,70.409056,1.963273
3,Boston,283.779142,77.29745,3.380892
4,Charlotte,288.897054,70.190717,2.381654


Now, let's join it back with the original data


In [22]:
means.sample(10)

Unnamed: 0,city,mean_temperature,mean_humidity,mean_speed
13,Jacksonville,294.333226,76.37845,2.831724
21,Nahariyya,294.222662,78.496486,2.997569
20,Montreal,280.342729,71.840087,3.831901
31,San Francisco,288.222135,76.39571,2.755486
32,Seattle,284.410297,77.098732,2.11807
25,Phoenix,295.492664,37.118467,1.909133
4,Charlotte,288.897054,70.190717,2.381654
16,Las Vegas,292.424887,31.634046,2.46398
26,Pittsburgh,284.053577,70.320099,2.511347
11,Houston,294.204078,74.013292,2.9625


In [23]:
df = means.set_index('city')

Therefore, now we can remove the unnecessary column city.

In [24]:
means_data = means.drop(['city'], axis=1)
means_data.sample(10)

Unnamed: 0,mean_temperature,mean_humidity,mean_speed
29,293.785501,67.557819,3.022175
3,283.779142,77.29745,3.380892
34,281.941258,76.281805,3.830741
17,290.845703,62.68464,1.219566
10,295.381757,79.67769,3.267949
30,290.215111,67.72288,1.751431
28,286.675846,70.528308,3.048218
9,296.518344,53.453438,3.418392
21,294.222662,78.496486,2.997569
0,285.617978,44.912779,2.76396


In [25]:
cor = means_data.corr()
cor

Unnamed: 0,mean_temperature,mean_humidity,mean_speed
mean_temperature,1.0,-0.209743,-0.221668
mean_humidity,-0.209743,1.0,0.240435
mean_speed,-0.221668,0.240435,1.0


In [26]:
data = weather.merge(means, left_on='city', right_on='city')

In [27]:
data.sample(10)

Unnamed: 0,datetime,city,temperature,humidity,wind_speed,mean_temperature,mean_humidity,mean_speed
888791,2016-01-22 05:00:00,Atlanta,279.267491,98.0,5.0,289.773667,70.728018,2.230482
161103,2015-08-23 13:00:00,Seattle,284.7,87.0,1.0,284.410297,77.098732,2.11807
1037656,2017-07-21 07:00:00,Charlotte,296.49,94.0,2.0,288.897054,70.190717,2.381654
785855,2014-08-22 07:00:00,Nashville,298.032333,78.0,2.0,288.566327,68.196462,2.42406
1494763,2012-11-29 11:00:00,Haifa,293.85,43.0,2.0,295.381757,79.67769,3.267949
1298261,2016-04-22 14:00:00,Montreal,286.55,95.0,0.0,280.342729,71.840087,3.831901
1160943,2016-02-17 15:00:00,Toronto,271.79,74.0,3.0,281.941258,76.281805,3.830741
1055229,2014-05-24 23:00:00,Miami,301.91,67.0,0.0,298.0265,75.509348,3.232349
798267,2016-01-21 11:00:00,Nashville,272.087243,99.0,2.0,288.566327,68.196462,2.42406
1023223,2015-11-27 22:00:00,Charlotte,292.16,52.0,0.0,288.897054,70.190717,2.381654
