### Import libraries





In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# import libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
%matplotlib inline



### Import dataset 

In [3]:
data = '../data/weatherAUS.csv'
df = pd.read_csv(data)

In [4]:
df.dropna(subset=['RainTomorrow'], inplace=True)

### Feature Engineering

#### Extract month, day and year from Date variable

In [5]:
df['Date'] = pd.to_datetime(df['Date'])

df['Year'] = df['Date'].dt.year

# extract month from date

df['Month'] = df['Date'].dt.month

# extract day from date

df['Day'] = df['Date'].dt.day

In [6]:
# drop the original Date variable

df.drop('Date', axis=1, inplace = True)

In [7]:
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,...,1007.1,8.0,,16.9,21.8,No,No,2008,12,1
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,...,1007.8,,,17.2,24.3,No,No,2008,12,2
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,...,1008.7,,2.0,21.0,23.2,No,No,2008,12,3
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,...,1012.8,,,18.1,26.5,No,No,2008,12,4
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,...,1006.0,7.0,8.0,17.8,29.7,No,No,2008,12,5


#### Detect outliers range for numerical variable

In [8]:
# find outliers for Rainfall variable

IQR = df.Rainfall.quantile(0.75) - df.Rainfall.quantile(0.25)
Lower_fence = df.Rainfall.quantile(0.25) - (IQR * 3)
Upper_fence = df.Rainfall.quantile(0.75) + (IQR * 3)
print('Rainfall outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

Rainfall outliers are values < -2.4000000000000004 or > 3.2


In [9]:
# find outliers for Evaporation variable

IQR = df.Evaporation.quantile(0.75) - df.Evaporation.quantile(0.25)
Lower_fence = df.Evaporation.quantile(0.25) - (IQR * 3)
Upper_fence = df.Evaporation.quantile(0.75) + (IQR * 3)
print('Evaporation outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

Evaporation outliers are values < -11.800000000000002 or > 21.800000000000004


In [10]:
# find outliers for WindSpeed9am variable

IQR = df.WindSpeed9am.quantile(0.75) - df.WindSpeed9am.quantile(0.25)
Lower_fence = df.WindSpeed9am.quantile(0.25) - (IQR * 3)
Upper_fence = df.WindSpeed9am.quantile(0.75) + (IQR * 3)
print('WindSpeed9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

# find outliers for WindSpeed3pm variable

IQR = df.WindSpeed3pm.quantile(0.75) - df.WindSpeed3pm.quantile(0.25)
Lower_fence = df.WindSpeed3pm.quantile(0.25) - (IQR * 3)
Upper_fence = df.WindSpeed3pm.quantile(0.75) + (IQR * 3)
print('WindSpeed3pm outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))


WindSpeed9am outliers are values < -29.0 or > 55.0
WindSpeed3pm outliers are values < -20.0 or > 57.0


#### Declare feature vector and target variable and split data into train and test set

In [11]:
X = df.drop(['RainTomorrow'], axis=1)

y = df['RainTomorrow']

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [13]:
# check the shape of X_train and X_test

X_train.shape, X_test.shape

((113754, 24), (28439, 24))

In [14]:
y.value_counts(normalize=True)

No     0.775819
Yes    0.224181
Name: RainTomorrow, dtype: float64

#### Get categorical and numerical columns

In [15]:
categorical = [col for col in X.columns if X[col].dtypes == 'O']

In [16]:
categorical

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [17]:
numerical = [col for col in X.columns if X[col].dtypes != 'O']


In [18]:
numerical

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'Year',
 'Month',
 'Day']

#### Engineering missing values in numerical variables

In [19]:
# check missing values in numerical variables in X_train

X_train[numerical].isnull().sum()

MinTemp            495
MaxTemp            264
Rainfall          1139
Evaporation      48718
Sunshine         54314
WindGustSpeed     7367
WindSpeed9am      1086
WindSpeed3pm      2094
Humidity9am       1449
Humidity3pm       2890
Pressure9am      11212
Pressure3pm      11186
Cloud9am         43137
Cloud3pm         45768
Temp9am            740
Temp3pm           2171
Year                 0
Month                0
Day                  0
dtype: int64

In [None]:
# check missing values in numerical variables in X_test

X_test[numerical].isnull().sum()

MinTemp            142
MaxTemp             58
Rainfall           267
Evaporation      12125
Sunshine         13502
WindGustSpeed     1903
WindSpeed9am       262
WindSpeed3pm       536
Humidity9am        325
Humidity3pm        720
Pressure9am       2802
Pressure3pm       2795
Cloud9am         10520
Cloud3pm         11326
Temp9am            164
Temp3pm            555
Year                 0
Month                0
Day                  0
dtype: int64

In [20]:
# impute missing values in X_train and X_test with respective column median in X_train

for df1 in [X_train, X_test]:
    for col in numerical:
        col_median=X_train[col].median()
        df1[col].fillna(col_median, inplace=True)  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [21]:
# check again missing values in numerical variables in X_train

X_train[numerical].isnull().sum()


MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
Year             0
Month            0
Day              0
dtype: int64

In [22]:
# check missing values in numerical variables in X_test

X_test[numerical].isnull().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
Year             0
Month            0
Day              0
dtype: int64

#### Engineering missing values in categorical variables

In [23]:

X_train[categorical].isnull().sum()

Location          0
WindGustDir    7407
WindDir9am     7978
WindDir3pm     3008
RainToday      1139
dtype: int64

In [24]:
# impute missing categorical variables with most frequent value

for df2 in [X_train, X_test]:
    df2.loc[:]['WindGustDir'].fillna(X_train['WindGustDir'].mode()[0], inplace=True)
    df2.loc[:]['WindDir9am'].fillna(X_train['WindDir9am'].mode()[0], inplace=True)
    df2.loc[:]['WindDir3pm'].fillna(X_train['WindDir3pm'].mode()[0], inplace=True)
    df2.loc[:]['RainToday'].fillna(X_train['RainToday'].mode()[0], inplace=True)

In [25]:
# check missing values in categorical variables in X_train

X_train[categorical].isnull().sum()

Location       0
WindGustDir    0
WindDir9am     0
WindDir3pm     0
RainToday      0
dtype: int64

In [26]:
# check missing values in categorical variables in X_test

X_test[categorical].isnull().sum()

Location       0
WindGustDir    0
WindDir9am     0
WindDir3pm     0
RainToday      0
dtype: int64

#### Engineering outliers in numerical variable

In [27]:
def max_value(df3, variable, top):
    return np.where(df3[variable]>top, top, df3[variable])

for df3 in [X_train, X_test]:
    df3.loc[:]['Rainfall'] = max_value(df3, 'Rainfall', 3.2)
    df3.loc[:]['Evaporation'] = max_value(df3, 'Evaporation', 21.8)
    df3.loc[:]['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 55)
    df3.loc[:]['WindSpeed3pm'] = max_value(df3, 'WindSpeed3pm', 57)

In [28]:
X_train.Rainfall.max(), X_test.Rainfall.max()

(3.2, 3.2)

In [29]:
X_train.Evaporation.max(), X_test.Evaporation.max()

(21.8, 21.8)

In [30]:
X_train.WindSpeed9am.max(), X_test.WindSpeed9am.max()

(55.0, 55.0)

In [31]:
X_train.WindSpeed3pm.max(), X_test.WindSpeed3pm.max()

(57.0, 57.0)

#### Encode categorical variables

In [32]:
# encode RainToday variable

import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['RainToday'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)



  elif pd.api.types.is_categorical(cols):


In [33]:
X_train.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday_0,RainToday_1,Year,Month,Day
113462,Witchcliffe,13.9,22.6,0.2,4.8,8.5,S,41.0,SSE,S,...,1013.4,5.0,5.0,18.8,20.4,0,1,2014,4,25
89638,Cairns,22.4,29.4,2.0,6.0,6.3,ENE,33.0,SSE,SE,...,1013.1,7.0,5.0,26.4,27.5,1,0,2015,11,2
138130,AliceSprings,9.7,36.2,0.0,11.4,12.3,E,31.0,NE,N,...,1013.6,1.0,1.0,28.5,35.0,0,1,2014,10,19
87898,Cairns,20.5,30.1,0.0,8.8,11.1,ESE,37.0,SSE,E,...,1010.8,2.0,3.0,27.3,29.4,0,1,2010,10,30
16484,Newcastle,16.8,29.2,0.0,4.8,8.5,W,39.0,N,SE,...,1015.2,5.0,8.0,22.2,27.0,0,1,2012,11,8


In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [35]:
# X = pd.concat([X[numerical], X[['RainToday_0', 'RainToday_1']],
#                      pd.get_dummies(X, columns=["Location", "WindGustDir", "WindDir9am", "WindDir3pm"])], 
#                      axis=1)
# X_dummy = pd.get_dummies(X,  columns=["Location"])
X_train = pd.get_dummies(X_train,  columns=["Location", "WindGustDir", "WindDir9am", "WindDir3pm"])
X_test = pd.get_dummies(X_test,  columns=["Location", "WindGustDir", "WindDir9am", "WindDir3pm"])

# X_train = pd.concat([X_train[numerical], X_train[['RainToday_0', 'RainToday_1']],
#                      pd.get_dummies(X_train.Location), 
#                      pd.get_dummies(X_train.WindGustDir),
#                      pd.get_dummies(X_train.WindDir9am),
#                      pd.get_dummies(X_train.WindDir3pm)], axis=1)


In [36]:
X_train.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
113462,13.9,22.6,0.2,4.8,8.5,41.0,20.0,28.0,65.0,55.0,...,0,0,1,0,0,0,0,0,0,0
89638,22.4,29.4,2.0,6.0,6.3,33.0,7.0,19.0,71.0,59.0,...,0,0,0,1,0,0,0,0,0,0
138130,9.7,36.2,0.0,11.4,12.3,31.0,15.0,11.0,6.0,2.0,...,0,0,0,0,0,0,0,0,0,0
87898,20.5,30.1,0.0,8.8,11.1,37.0,22.0,19.0,59.0,53.0,...,0,0,0,0,0,0,0,0,0,0
16484,16.8,29.2,0.0,4.8,8.5,39.0,0.0,7.0,72.0,53.0,...,0,0,0,1,0,0,0,0,0,0


In [37]:
X_test.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
88578,17.4,29.0,0.0,3.6,11.1,33.0,11.0,19.0,63.0,61.0,...,0,0,0,0,0,0,0,0,0,0
59016,6.8,14.4,0.8,0.8,8.5,46.0,17.0,22.0,80.0,55.0,...,0,0,1,0,0,0,0,0,0,0
127049,10.1,15.4,3.2,4.8,8.5,31.0,13.0,9.0,70.0,61.0,...,0,0,0,0,1,0,0,0,0,0
120886,14.4,33.4,0.0,8.0,11.6,41.0,9.0,17.0,40.0,23.0,...,0,0,0,0,0,0,1,0,0,0
136649,6.8,14.3,3.2,0.2,7.3,28.0,15.0,13.0,92.0,47.0,...,0,0,0,0,0,0,0,0,0,0


#### Feature scaling 

In [38]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_transformed = scaler.fit_transform(X_train)

X_test_transformed = scaler.transform(X_test)

In [39]:
X_train_transformed_split, X_val_transformed, y_train_split, y_val = train_test_split(X_train_transformed, y_train, test_size=0.2, random_state = 0)

#### Save input data and labels to csv files

In [40]:
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=X_train.columns)
y_train_df = pd.DataFrame(y_train, columns=['RainTomorrow']).reset_index(drop=True)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=X_test.columns)
y_test_df = pd.DataFrame(y_test, columns=['RainTomorrow']).reset_index(drop=True)


In [41]:
X_train_transformed_df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,0.524941,0.517958,0.0625,0.220183,0.586207,0.271318,0.363636,0.491228,0.65,0.55,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.726841,0.646503,0.625,0.275229,0.434483,0.209302,0.127273,0.333333,0.71,0.59,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.425178,0.775047,0.0,0.522936,0.848276,0.193798,0.272727,0.192982,0.06,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.68171,0.659735,0.0,0.40367,0.765517,0.24031,0.4,0.333333,0.59,0.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.593824,0.642722,0.0,0.220183,0.586207,0.255814,0.0,0.122807,0.72,0.53,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
y_train_df 

Unnamed: 0,RainTomorrow
0,0
1,0
2,0
3,0
4,1
...,...
113749,0
113750,0
113751,0
113752,0


In [51]:
df_train = X_train_transformed_df.join(y_train_df)
df_test = X_test_transformed_df.join(y_test_df)

In [46]:
SAVE_DIR = '../data'

In [52]:
df_train.to_csv(os.path.join(SAVE_DIR, 'train.csv'), index=False)
df_test.to_csv(os.path.join(SAVE_DIR, 'test.csv'), index=False)