In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the daily weather observations from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='weather/daily_weather.csv')

In [3]:
# List the column names available in the loaded data.
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [7]:
# (rows, columns) of the data as loaded.
data.shape

(1095, 11)

In [8]:
# Shape of the subset of rows that have a missing value in at least one column.
data.loc[data.isna().any(axis=1)].shape

(31, 11)

In [9]:
# Remove the 'number' column; nothing later in the notebook reads it.
# `drop(..., errors='ignore')` is used instead of `del` so the cell can be
# re-run safely: `del data['number']` raises KeyError on a second execution
# because the column is already gone.
data = data.drop(columns=['number'], errors='ignore')

In [10]:
# Preview the first rows after the 'number' column has been removed.
data.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [11]:
# Discard every row (axis=0) that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [12]:
# Re-check the shape now that rows with missing values have been dropped.
data.shape

(1064, 10)

## Convert to a Classification Task

In [13]:
# Build a new frame (leaving `data` untouched) with a binary target column:
# 1 when the 3pm relative humidity is above 24.99, otherwise 0.
clean_data = data.assign(
    high_humidity_label=lambda frame: frame['relative_humidity_3pm'].gt(24.99).astype(int)
)

In [14]:
# Preview the first rows, including the new high_humidity_label column.
clean_data.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm,high_humidity_label
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16,1
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597,0
2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46,0
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547,0
4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74,1


### Store the `high_humidity_label` column in the variable `y`

In [27]:
# Target: kept as a single-column DataFrame (list selector), copied so that
# later changes to `clean_data` cannot affect it.
y = clean_data.loc[:, ['high_humidity_label']].copy()

In [21]:
# Predictors are listed by name rather than sliced by position
# (`columns[:-3]`), so the selection cannot silently change if columns are
# reordered or another derived column is appended to `clean_data`.
# The two relative-humidity readings and the derived label are left out,
# exactly as the positional slice did.
# NOTE(review): omitting relative_humidity_9am looks deliberate — confirm.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]
morning_features

['air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am']

In [22]:
# Feature matrix: an independent copy of just the selected predictor columns.
X = clean_data.loc[:, morning_features].copy()

In [23]:
# Confirm which columns ended up in the feature matrix.
X.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')

In [30]:
# Confirm that y holds only the label column.
y.columns

Index(['high_humidity_label'], dtype='object')

## Perform Test and Train Split

In [31]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [34]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am
841,918.37,72.932,184.5,2.013246,186.7,2.773806,0.0,0.0
75,920.1,53.492,186.1,13.444009,193.8,15.367778,0.0,0.0
95,927.61,54.896,55.0,4.988376,53.4,7.202947,0.0,0.0
895,919.235153,65.951112,194.343333,2.942019,216.569792,3.65881,0.0,0.0
699,919.888128,68.687822,228.51773,3.960858,247.954028,5.185547,0.0,0.0


In [35]:
# Preview the training labels; the index values match those of X_train above.
y_train.head()

Unnamed: 0,high_humidity_label
841,0
75,1
95,0
895,0
699,0


In [36]:
# Summary statistics of the training labels. Because the label is 0/1, the
# mean is the fraction of training rows labelled 1.
y_train.describe()

Unnamed: 0,high_humidity_label
count,712.0
mean,0.494382
std,0.50032
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## Fit the Classifier on the Training Set

In [37]:
# Build a decision tree limited to 10 leaf nodes (seeded for reproducibility)
# and train it on the training split; `fit` returns the estimator itself.
humidity_classifier = DecisionTreeClassifier(
    max_leaf_nodes=10,
    random_state=0,
).fit(X_train, y_train)
humidity_classifier

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

## Predict on test set

In [38]:
# Predict a label for every row of the held-out test features.
predictions = humidity_classifier.predict(X=X_test)

In [39]:
# Show the first ten predicted labels.
predictions[:10]

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [40]:
# Show the first ten true labels for comparison with the predictions above.
y_test[:10]

Unnamed: 0,high_humidity_label
456,0
845,0
693,1
259,1
723,1
224,1
300,1
442,0
585,1
1057,1


## Measure Accuracy for Classifier

In [41]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

0.8153409090909091