In [16]:
# Loading of packages
# There are new packages to look at in this chapter
# Firstly, the package sklearn, and statsmodels.api
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import random
import statsmodels.api as sm
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import datetime as dt
%matplotlib inline

In [2]:
# Display the data
df = pd.read_csv('2018-weathertest.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,Location,No. of Cases,Date,Coordinates,Closest Location,Daily Rainfall Total (mm),Highest 30 Min Rainfall (mm),Highest 60 Min Rainfall (mm),Highest 120 Min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
0,"Yishun Avenue 7 (Blk 172), Singapore",2,02-01-18,"1.4368965, 103.8317924",Yishun,9.6,7,8.6,8.8,,,,,
1,"Yishun Avenue 7 (Blk 173), Singapore",2,02-01-18,"1.4370075, 103.8309346",Yishun,9.6,7,8.6,8.8,,,,,
2,"Yishun Avenue 7 (Blk 174), Singapore",1,02-01-18,"1.4374434, 103.8318144",Yishun,9.6,7,8.6,8.8,,,,,
3,"Yishun Ring Road (Blk 166), Singapore",4,02-01-18,"1.4363075, 103.8316116",Yishun,9.6,7,8.6,8.8,,,,,
4,"Canberra Crescent (Blk 130B), Singapore",1,02-01-18,"1.4443594, 103.8320871",Yishun,9.6,7,8.6,8.8,,,,,


In [3]:
# Remove redundant columns and rename remaining columns
df_coords = df['Coordinates'].str.split(", ",expand=True,) #
drop_columns = df.iloc[:,6:].columns.delete(3).tolist() + ['Location','Coordinates']
df = df.drop(columns=drop_columns)
df[['First','Last']] = df_coords #

#columns = ['Cases','Closest Location','Rainfall','Temperature']
columns = ['Cases','Date','Closest Location','Rainfall','Temperature','Latitude','Longtitude']
df.columns = columns

In [4]:
# Remove any rows with NaN
df['Temperature'] = pd.to_numeric(df['Temperature'], errors='coerce')
#df.fillna(df['Temperature'].mean(),inplace=True)

df['Rainfall'] = pd.to_numeric(df['Rainfall'], errors='coerce')
#df.fillna(0,inplace=True)

df['Cases'] = pd.to_numeric(df['Cases'], errors='coerce')
df = df[pd.isnull(df).any(axis=1)==False]

In [5]:
# Dictionary for encoding stations
stationList = [
['S46','Upper Thomson'],
['S50','Clementi'],
['S55','Buangkok'],
['S60','Sentosa Island'],
['S61','Chai Chee'],
['S63','Boon Lay (West)'],
['S64','Bukit Panjang'],
['S66','Kranji Reservoir'],
['S69','Upper Peirce Reservoir'],
['S71','Kent Ridge'],
['S77','Queenstown'],
['S78','Tanjong Katong'],
['S79','Somerset (Road)'],
['S81','Punggol'],
['S82','Tuas West'],
['S84','Simei'],
['S86','Boon Lay (East)'],
['S88','Toa Payoh'],
['S89','Tuas'],
['S90','Bukit Timah'],
['S91','Yishun'],
['S92','Buona Vista'],
['S94','Pasir Ris (Central)'],
['S101','Jurong (North)'],
['S102','Semakau Island'],
['S104','Admiralty'],
['S105','Admiralty West'],
['S106','Pulau Ubin'],
['S107','East Coast Parkway'],
['S108','Marina Barrage'],
['S109','Ang Mo Kio'],
['S110','Serangoon North'],
['S111','Newton'],
['S112','Lim Chu Kang'],
['S113','Marine Parade'],
['S114','Choa Chu Kang (Central)'],
['S115','Tuas South'],
['S116','Pasir Panjang'],
['S117','Jurong Island'],
['S118','Dhoby Ghaut'],
['S119','Nicoll Highway'],
['S120','Botanic Garden'],
['S121','Choa Chu Kang (South)'],
['S122','Khatib'],
['S123','Whampoa']
]
station_dict = {}
for station in stationList:
    station_dict[station[1]] = stationList.index(station)

In [6]:
df.head()

Unnamed: 0,Cases,Date,Closest Location,Rainfall,Temperature,Latitude,Longtitude
39,1.0,05-01-18,Admiralty,5.0,25.9,1.4348177,103.7986944
40,1.0,05-01-18,Admiralty,5.0,25.9,1.4370026,103.7992125
41,1.0,05-01-18,Admiralty,5.0,25.9,1.4363504,103.7978542
45,1.0,05-01-18,Admiralty,5.0,25.9,1.4336298,103.7986691
46,1.0,05-01-18,Admiralty,5.0,25.9,1.4341688,103.8005255


In [7]:
# Encode stations
df['Closest Location'] = df['Closest Location'].str.rstrip()
df['Closest Location'] = df['Closest Location'].replace(station_dict)

# Encode dates
df['Date'] = pd.to_datetime(df['Date'])
df['Date']= df['Date'].map(dt.datetime.toordinal)

In [8]:
# Split data into training and testing sets
X = df.drop('Cases',1)
y = df['Cases']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=44)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 448 entries, 39 to 2613
Data columns (total 7 columns):
Cases               448 non-null float64
Date                448 non-null int64
Closest Location    448 non-null object
Rainfall            448 non-null float64
Temperature         448 non-null float64
Latitude            448 non-null object
Longtitude          448 non-null object
dtypes: float64(3), int64(1), object(3)
memory usage: 28.0+ KB


In [9]:
# Decision Tree
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train,y_train)
tree_clf.score(X_test,y_test)

0.6962962962962963

In [10]:
# Display 2019 data
df_2019 = pd.read_csv('2019-weathertest.csv', encoding='ISO-8859-1')
df_2019.head()

Unnamed: 0,Location,No. of Cases,Date,Latitude,Longtitude,Closest Location,Daily Rainfall Total (mm),Highest 30 Min Rainfall (mm),Highest 60 Min Rainfall (mm),Highest 120 Min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
0,Bedok Reservoir Road (Blk 122),1,04/01/2019,1.332052,103.910638,Chai Chee,0,0,0,0,,,,,
1,Bedok Reservoir Road (Blk 124),1,04/01/2019,1.332714,103.910094,Chai Chee,0,0,0,0,,,,,
2,Bedok Reservoir Road (Blk 128),3,04/01/2019,1.333438,103.911454,Chai Chee,0,0,0,0,,,,,
3,Bedok Reservoir Road (Blk 130),2,04/01/2019,1.333506,103.91236,Chai Chee,0,0,0,0,,,,,
4,Bedok Reservoir Road (Blk 131),3,04/01/2019,1.333145,103.911544,Chai Chee,0,0,0,0,,,,,


In [11]:
# Remove redundant columns and rename remaining columns
drop_columns = df_2019.iloc[:,7:].columns.delete(3).tolist() + ['Location']
df_2019 = df_2019.drop(columns=drop_columns)

columns = ['Cases','Date','Latitude','Longtitude','Closest Location','Rainfall','Temperature']
df_2019.columns = columns

In [12]:
# Remove any rows with NaN
df_2019['Temperature'] = pd.to_numeric(df_2019['Temperature'], errors='coerce')
#df.fillna(df['Temperature'].mean(),inplace=True)

df_2019['Rainfall'] = pd.to_numeric(df_2019['Rainfall'], errors='coerce')
#df.fillna(0,inplace=True)

df_2019['Cases'] = pd.to_numeric(df_2019['Cases'], errors='coerce')
df_2019 = df_2019[pd.isnull(df_2019).any(axis=1)==False]

In [13]:
# Encode stations
df_2019['Closest Location'] = df_2019['Closest Location'].replace(station_dict)

# Encode dates
df_2019['Date'] = pd.to_datetime(df_2019['Date'])
df_2019['Date']= df_2019['Date'].map(dt.datetime.toordinal)

In [14]:
# Split data into training and testing sets
#aggregation_functions = {'Cases':'sum','Rainfall':'mean','Temperature':'mean'}
#df_new = df_2019.groupby(['Date','Closest Location'], as_index=False).aggregate(aggregation_functions)

X_2019 = df_2019.drop('Cases',1)
y_2019 = df_2019['Cases']

X_2019_train, X_2019_test, y_2019_train, y_2019_test = train_test_split(X_2019, y_2019, test_size=0.3, random_state=44)
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2217 entries, 70 to 8257
Data columns (total 7 columns):
Cases               2217 non-null float64
Date                2217 non-null int64
Latitude            2217 non-null float64
Longtitude          2217 non-null float64
Closest Location    2217 non-null object
Rainfall            2217 non-null float64
Temperature         2217 non-null float64
dtypes: float64(5), int64(1), object(1)
memory usage: 138.6+ KB


In [15]:
# Decision Tree
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_2019_train,y_2019_train)
tree_clf.score(X_2019_test,y_2019_test)

aggregation_functions = {'Cases':'sum','Rainfall':'mean','Temperature':'mean'}
df_new = df_2019.groupby(['Date','Closest Location'], as_index=False).aggregate(aggregation_functions)

sqrt(mean_squared_error(tree_clf.predict(X_2019_test), y_2019_test))

3.673621577611866