In [None]:
import pickle
import pandas as pd
import csv
import re
from pandas import read_csv
import datetime
import numpy as np
import xgboost as xgb

##For Analysis
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.formula.api import ols
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import shap

##
## ===> Visualization <===
##
import seaborn as sns
# Plot styling: sns.set() is called first, then the style is switched to 'whitegrid'.
# The order matters: the second call overrides the style chosen by the first.
sns.set()
sns.set_style('whitegrid')
import matplotlib.pyplot as plt

# Widen pandas' rendering limits so wide/long frames are shown without truncation.
pd.options.display.width = 1000
pd.options.display.max_columns = 300
pd.options.display.max_rows = 1000
%matplotlib inline

In [None]:
#Dataset taken from: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package

In [None]:
# Load the raw weather observations for Australia from the Kaggle input directory.
weather_in_aus = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the summary.
weather_in_aus.info()

# Preview the first 100 rows.
weather_in_aus.head(100)

In [None]:
#It would be interesting to try to predict 'RainTomorrow' using all columns. It would also be interesting to add a column titled 'Rained on exact date in previous year (Yes or No)'.

In [None]:
# Drop the Rainfall column: it was identified as having a high proportion of zero values.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore' makes
# the cell safe to re-run (a second in-place drop raised KeyError because the column
# was already gone).
weather_in_aus = weather_in_aus.drop(columns=['Rainfall'], errors='ignore')

In [None]:
# Keep only the rows that have no missing value in any column.
# .copy() makes the result an independent frame: later cells assign columns on
# weather_in_aus_filtered, which can otherwise trigger SettingWithCopyWarning because
# pandas still tracks the filtered frame as derived from weather_in_aus.
weather_in_aus_filtered = weather_in_aus.dropna().copy()
weather_in_aus_filtered.head(50)

In [None]:
# Convert Date to datetime format (the source strings are year-month-day).
# The column is replaced as a whole through assign(): on pandas >= 2.0,
# `.loc[:, 'Date'] = ...` writes into the existing object-dtype column in place, so the
# column stays `object` and the later `.dt.year` / `.dt.month` accessors fail.
# `infer_datetime_format` is omitted because it is deprecated in pandas >= 2.0, where
# format inference is already the default behaviour.
weather_in_aus_filtered = weather_in_aus_filtered.assign(
    Date=pd.to_datetime(weather_in_aus_filtered['Date'])
)
weather_in_aus_filtered.info()

In [None]:
## Binary encoding of 'RainToday' and 'RainTomorrow': 1 when the status is 'Yes', else 0.
# Each flag is computed in a single vectorised comparison and attached with assign(),
# which returns a new frame. This replaces the "fill with 0, then overwrite matching
# rows" sequence of .loc writes, which can raise SettingWithCopyWarning on a frame
# produced by dropna(). The explicit int64 cast keeps the same dtype as before.
weather_in_aus_filtered = weather_in_aus_filtered.assign(
    raintoday_encoded=(weather_in_aus_filtered['RainToday'] == 'Yes').astype('int64'),
    raintomorrow_encoded=(weather_in_aus_filtered['RainTomorrow'] == 'Yes').astype('int64'),
)

In [None]:
# Report how many distinct values remain in each categorical column, one count per line,
# in the order: Location, WindGustDir, WindDir9am, WindDir3pm.
print(
    *(
        weather_in_aus_filtered[categorical_column].nunique()
        for categorical_column in ('Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm')
    ),
    sep='\n',
)

In [None]:
# Attach the columns of the region file to every observation, matched on Location.

# Locations still present after filtering. This expression is not the last one in the
# cell, so its value is computed but not displayed.
weather_in_aus_filtered['Location'].unique()

# Region file used for the join (its columns are not visible from this notebook).
aus_location_region = pd.read_csv('../input/aus-location-region/aus_location_region.csv')

# Left join: every weather row is kept, whether or not its Location has a match.
weather_in_aus_filtered = weather_in_aus_filtered.merge(
    aus_location_region,
    on="Location",
    how="left",
)

In [None]:
# Derive the calendar year and month of each observation from its Date.
# Requires Date to be a datetime column (the .dt accessor is used).
weather_in_aus_filtered = weather_in_aus_filtered.assign(
    weather_reading_year=lambda frame: frame['Date'].dt.year,
    weather_reading_month=lambda frame: frame['Date'].dt.month,
)

In [None]:
# Create placeholder columns that will hold each wind direction expressed in degrees.
# They are initialised with the float 0.0 rather than the int 0: the degree values
# written afterwards are fractional (e.g. 22.5), and assigning floats into an int64
# column forces a dtype upcast that pandas >= 2.1 flags with a FutureWarning.
new_cols_for_labelencoding = ['WindGustDir_encoded', 'WindDir9am_encoded', 'WindDir3pm_encoded']
weather_in_aus_filtered[new_cols_for_labelencoding] = 0.0

In [None]:
#Convert Wind Direction to Degrees: http://snowfence.umn.edu/Components/winddirectionanddegrees.htm
#(the value for each direction of the 16-point compass is the midrange of the degree range shown in this link)

# Single lookup table shared by all three wind-direction columns. It replaces 48
# hand-written masked assignments that repeated the same 16 values three times.
WIND_DIRECTION_DEGREES = {
    'N': 0.0,
    'NNE': 22.5,
    'NE': 45.0,
    'ENE': 67.5,
    'E': 90.0,
    'ESE': 112.5,
    'SE': 135.0,
    'SSE': 157.5,
    'S': 180.0,
    'SSW': 202.5,
    'SW': 225.0,
    'WSW': 247.5,
    'W': 270.0,
    'WNW': 292.5,
    'NW': 315.0,
    'NNW': 337.5,
}

# Map each source column to degrees and store it as '<source>_encoded'.
# Whole float columns are assigned at once, so no fractional value is written into an
# integer-initialised column (which pandas >= 2.1 warns about).
# Any value that is not one of the 16 compass points falls back to 0.0, matching the
# zero the original code left in place for unmatched rows.
# NOTE(review): that fallback is indistinguishable from 'N' (0 degrees) - confirm that
# no unexpected direction labels can reach this cell.
weather_in_aus_filtered = weather_in_aus_filtered.assign(**{
    f'{source_column}_encoded': (
        weather_in_aus_filtered[source_column].map(WIND_DIRECTION_DEGREES).fillna(0.0)
    )
    for source_column in ('WindGustDir', 'WindDir9am', 'WindDir3pm')
})

In [None]:
# Binary low-humidity indicators: 1 when relative humidity is below 30%, otherwise 0.
# One flag for the 9am reading and one for the 3pm reading, each stored as int64.
weather_in_aus_filtered = weather_in_aus_filtered.assign(
    Humidity9am_encoded=(weather_in_aus_filtered['Humidity9am'] < 30).astype('int64'),
    Humidity3pm_encoded=(weather_in_aus_filtered['Humidity3pm'] < 30).astype('int64'),
)

In [None]:
# Preview the first rows of the frame, including the columns added in the cells above.
weather_in_aus_filtered.head()

In [None]:
# Persist the cleaned dataset next to the notebook.
# The row index is left out; the column header row is written (pandas' default).
weather_in_aus_filtered.to_csv('weatherAUS_cleaned.csv', index=False)