In [342]:
# Dataset source: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package

<h1 style="color: purple">Predicting Rainfall for the Next Day</h1>

<p style = "font-size:18px"> This notebook predicts whether it will rain tomorrow based on today's weather conditions, using a Decision Tree Classifier. </p>

In [371]:
# importing all necessary libraries
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [372]:
# Load the weather observations from the CSV that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer='./weatherAUS.csv')

In [373]:
# Show every column label in the loaded frame (candidate features and the RainTomorrow label).
data.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')

In [374]:
# Display the rows that contain at least one missing value in any column.
data[data.isna().any(axis='columns')]

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
5,2008-12-06,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,23.0,1009.2,1005.4,,,20.6,28.9,No,0.0,No
6,2008-12-07,Albury,14.3,25.0,0.0,,,W,50.0,SW,...,19.0,1009.6,1008.2,1.0,,18.1,24.6,No,0.0,No
7,2008-12-08,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,19.0,1013.4,1010.1,,,16.3,25.5,No,0.0,No
8,2008-12-09,Albury,9.7,31.9,0.0,,,NNW,80.0,SE,...,9.0,1008.9,1003.6,,,18.3,30.2,No,1.4,Yes
9,2008-12-10,Albury,13.1,30.1,1.4,,,W,28.0,S,...,27.0,1007.0,1005.7,,,20.1,28.2,Yes,0.0,No


<p style = "font-size: 18px"> The <b>Date</b> and <b>Location</b> columns are deleted since they are not helpful in predicting rainfall for tomorrow. </p>

In [375]:
# Remove the two columns that are not used as predictors.
# `drop` returns a new frame instead of mutating in place, and errors='ignore'
# keeps the cell idempotent: re-running it no longer raises KeyError the way
# a second `del data['Date']` would.
data = data.drop(columns=['Date', 'Location'], errors='ignore')

In [376]:
# Row count before rows with missing values are removed.
len(data)

142193

In [377]:
# Keep only fully populated rows: a row is removed if ANY of its columns is null,
# including columns that are not later selected as model features.
data = data.dropna(axis=0, how='any')

In [378]:
# Row count after rows with missing values have been removed.
len(data)

56420

In [379]:
# Build a separate frame in which the two Yes/No rain flags are integers:
# 'Yes' becomes 1 and every other value becomes 0. `assign` works on a copy,
# so `data` itself is left untouched.
clean_data = data.assign(
    RainTomorrow=data['RainTomorrow'].eq('Yes').astype(int),
    RainToday=data['RainToday'].eq('Yes').astype(int),
)

In [380]:
# Sanity check: the first five encoded RainTomorrow labels.
print(clean_data['RainTomorrow'].head())

5939    0
5940    0
5942    0
5943    0
5944    0
Name: RainTomorrow, dtype: int32


In [387]:
# Target: an independent one-column frame holding the encoded RainTomorrow label.
y = clean_data.loc[:, ['RainTomorrow']].copy()

In [388]:
# Peek at the first five rows of the target frame.
y.iloc[:5]

Unnamed: 0,RainTomorrow
5939,0
5940,0
5942,0
5943,0
5944,0


In [389]:
# Columns used as model inputs: wind speed, humidity, pressure and cloud readings.
features = [
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Cloud9am',
    'Cloud3pm',
]

In [390]:
# Feature matrix: an independent copy of just the selected predictor columns.
X = clean_data.loc[:, features].copy()

In [391]:
X.columns  # column labels of the feature matrix X

Index(['WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm'],
      dtype='object')

In [392]:
y.columns  # column labels of the target frame y (built from the RainTomorrow column)

Index(['RainTomorrow'], dtype='object')

In [393]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=324,
)

In [424]:
# Decision tree capped at 50 leaf nodes, with a fixed seed so repeated fits match.
rainfall_classifier = DecisionTreeClassifier(
    random_state=0,
    max_leaf_nodes=50,
)
# Fit on the training split; the fitted estimator is the cell's displayed value.
rainfall_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=50,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [425]:
predictions = rainfall_classifier.predict(X_test)  # predict labels for the held-out test features

In [426]:
predictions[:10]  # first 10 predicted labels (the slice starts at index 0, so these are not the last 10)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [422]:
# First 10 actual labels from the test split, for comparison with the predictions.
y_test['RainTomorrow'].head(10)

74641     0
32517     0
35264     0
120570    0
119334    0
33065     0
117181    0
65181     0
118796    1
82234     1
Name: RainTomorrow, dtype: int32

In [423]:
# Fraction of test rows whose predicted label matches the actual label.
# NOTE(review): this cell's execution count (In [423]) is lower than the fit cell's
# (In [424]), so the displayed score may come from an earlier model -- re-run to confirm.
accuracy_score(y_test, predictions)

0.8453267162944582