In [1]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
rain = pd.read_csv('weatherAUS.csv')
rain.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
rain.shape

(145460, 23)

In [4]:
rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [5]:
rain['Date'] = pd.to_datetime(rain['Date'])
rain['year'] = rain['Date'].dt.year
rain['month'] = rain['Date'].dt.month
rain['day'] = rain['Date'].dt.day

In [6]:
rain.drop('Date',axis=1,inplace=True)

In [7]:
rain.head

<bound method NDFrame.head of        Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0        Albury     13.4     22.9       0.6          NaN       NaN   
1        Albury      7.4     25.1       0.0          NaN       NaN   
2        Albury     12.9     25.7       0.0          NaN       NaN   
3        Albury      9.2     28.0       0.0          NaN       NaN   
4        Albury     17.5     32.3       1.0          NaN       NaN   
...         ...      ...      ...       ...          ...       ...   
145455    Uluru      2.8     23.4       0.0          NaN       NaN   
145456    Uluru      3.6     25.3       0.0          NaN       NaN   
145457    Uluru      5.4     26.9       0.0          NaN       NaN   
145458    Uluru      7.8     27.0       0.0          NaN       NaN   
145459    Uluru     14.9      NaN       0.0          NaN       NaN   

       WindGustDir  WindGustSpeed WindDir9am WindDir3pm  ...  Pressure3pm  \
0                W           44.0          W        

In [8]:
# categorical features:

categorical_features = [feature for feature in rain.columns if rain[feature].dtype =='O']
print(categorical_features)

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


In [9]:
#numerical features:

numerical_features = [feature for feature in rain.columns if rain[feature].dtype != 'O']
print(numerical_features)

['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'year', 'month', 'day']


In [10]:
rain[categorical_features].isnull().sum()

Location            0
WindGustDir     10326
WindDir9am      10566
WindDir3pm       4228
RainToday        3261
RainTomorrow     3267
dtype: int64

In [11]:
categorical_features_withnull = [feature for feature in categorical_features if rain[feature].isnull().sum()]
for each_feature in categorical_features_withnull:
    model = rain[each_feature].mode()[0]
    rain[each_feature].fillna(model,inplace=True)

In [12]:
#checking for null values in numerical data
rain[numerical_features].isnull().sum()

MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustSpeed    10263
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
year                 0
month                0
day                  0
dtype: int64

In [13]:
features_with_outliers = [feature for feature in rain[numerical_features] if rain[feature].isnull().sum() !=0]

In [14]:
for each_feature in features_with_outliers:
    q1 = rain[each_feature].quantile(0.25)
    q3 = rain[each_feature].quantile(0.75)
    IQR = q3-q1
    lower_limit = q1-IQR*1.5
    upper_limit = q3-IQR*1.5
    rain.loc[rain[each_feature]<lower_limit,each_feature] = lower_limit
    rain.loc[rain[each_feature]>upper_limit,each_feature] = upper_limit

In [15]:
for feature in features_with_outliers:
    model = rain[feature].mean()
    rain[feature].fillna(model,inplace=True)

In [16]:
rain['RainToday'].replace({'No':0,'Yes':1},inplace=True)
rain['RainTomorrow'].replace({'No':0,'Yes':1},inplace=True)


In [21]:
def encode_data(feature_name):
    mapping_dict={}
    unique_values = list(rain[feature_name].unique())
    for idx in range(len(unique_values)):
        mapping_dict[unique_values[idx]] = idx

    return mapping_dict

In [22]:
rain['WindGustDir'].replace(encode_data('WindGustDir'),inplace = True)

In [23]:
rain['WindDir9am'].replace(encode_data('WindDir9am'),inplace = True)

In [25]:
rain['WindDir3pm'].replace(encode_data('WindDir3pm'),inplace = True)

In [26]:
rain['Location'].replace(encode_data('Location'), inplace = True)

In [None]:
X = rain.drop(['RainTomorrow'],axis=1)
y = rain['RainTomorrow']

In [29]:
# finding which features are important

from sklearn.ensemble import ExtraTreesRegressor
etr_model = ExtraTreesRegressor()
etr_model.fit(X,y)
etr_model.feature_importances_

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X,y)

In [None]:
from sklearn.model_selection import train_test_split
X_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)