In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
from cycler import cycler
import matplotlib as mpl

# Accent colours for light backgrounds, as 0-255 RGB triples.
raw_light_palette = [
    (0, 122, 255),    # Blue
    (255, 149, 0),    # Orange
    (52, 199, 89),    # Green
    (255, 59, 48),    # Red
    (175, 82, 222),   # Purple
    (255, 45, 85),    # Pink
    (88, 86, 214),    # Indigo
    (90, 200, 250),   # Teal
    (255, 204, 0),    # Yellow
]
# Accent colours for dark backgrounds (Indigo comes before Pink in this list).
raw_dark_palette = [
    (10, 132, 255),   # Blue
    (255, 159, 10),   # Orange
    (48, 209, 88),    # Green
    (255, 69, 58),    # Red
    (191, 90, 242),   # Purple
    (94, 92, 230),    # Indigo
    (255, 55, 95),    # Pink
    (100, 210, 255),  # Teal
    (255, 214, 10),   # Yellow
]

# Gray ramp for light backgrounds, darkest to lightest.
raw_gray_light_palette = [
    (142, 142, 147),  # Gray
    (174, 174, 178),  # Gray (2)
    (199, 199, 204),  # Gray (3)
    (209, 209, 214),  # Gray (4)
    (229, 229, 234),  # Gray (5)
    (242, 242, 247),  # Gray (6)
]
# Gray ramp for dark backgrounds, lightest to darkest.
raw_gray_dark_palette = [
    (142, 142, 147),  # Gray
    (99, 99, 102),    # Gray (2)
    (72, 72, 74),     # Gray (3)
    (58, 58, 60),     # Gray (4)
    (44, 44, 46),     # Gray (5)
    # NOTE(review): the blue channel (39) breaks the near-neutral pattern of the
    # other grays - possibly a typo for 30; confirm against the palette source.
    (28, 28, 39),     # Gray (6)
]

# Matplotlib expects RGB components in [0, 1], so rescale every palette.
light_palette, dark_palette, gray_light_palette, gray_dark_palette = (
    np.array(raw_palette) / 255
    for raw_palette in (
        raw_light_palette,
        raw_dark_palette,
        raw_gray_light_palette,
        raw_gray_dark_palette,
    )
)

# Dark plotting theme: dark-gray canvas, light-gray text and axes, and the
# dark-mode accent colours as the default line colour cycle.
canvas_color = gray_dark_palette[-2]
white_color = gray_light_palette[-2]

theme = {
    'axes.prop_cycle': cycler('color', dark_palette),
    'figure.facecolor': canvas_color,
    'figure.edgecolor': canvas_color,
    'axes.facecolor': canvas_color,
    'text.color': white_color,
    'axes.labelcolor': white_color,
    'axes.edgecolor': white_color,
    'xtick.color': white_color,
    'ytick.color': white_color,
    'figure.dpi': 200,
    # Hide the top and right spines for a cleaner frame.
    'axes.spines.top': False,
    'axes.spines.right': False,
}
for setting, value in theme.items():
    mpl.rcParams[setting] = value

In [None]:
# Load the weather observations from the CSV in the relative input directory.
WEATHER_CSV = r'../input/weather-dataset-rattle-package/weatherAUS.csv'
data = pd.read_csv(WEATHER_CSV)

In [None]:
# Preview the first rows to check the column names and value formats.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes, to see which features are numeric and which are objects.
data.dtypes

In [None]:
# Number of distinct values in each column.
data.nunique()

#### Data cleaning

In [None]:
# Missing-value count per column, before any cleaning.
data.isnull().sum()

Dropping features containing 50,000+ missing values.

In [None]:
# Remove the sparsely populated columns instead of imputing them.
sparse_columns = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
data = data.drop(columns=sparse_columns)

In [None]:
# Re-check the missing-value counts after dropping the sparsest columns.
data.isnull().sum()

In [None]:
# Summary statistics (count, mean, spread, quantiles) of the numeric columns.
data.describe()

In [None]:
# Impute missing numeric readings with the mean of their own column.
#
# The result is assigned back to ``data`` instead of calling
# ``data[col].fillna(..., inplace=True)``: that form is chained assignment on a
# column selection, which is deprecated in pandas 2.x and leaves ``data``
# unchanged once Copy-on-Write is enabled.
numeric_columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm',
]
# ``mean()`` returns one value per column; ``fillna`` aligns them by column name.
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

In [None]:
# Compare the category frequencies of the three wind-direction features side by side.
plt.figure(figsize=(18, 9))
for position, column in enumerate(['WindGustDir', 'WindDir9am', 'WindDir3pm'], start=1):
    plt.subplot(1, 3, position)
    sns.countplot(x=column, data=data)
    # Rotate the direction labels so they do not overlap.
    plt.xticks(rotation=45)

plt.show()

In [None]:
# Most frequent value of each wind-direction column.
wind_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
print(*(data[column].mode() for column in wind_columns))

In [None]:
# Fill missing wind directions with a fixed value per column.
# NOTE(review): 'W', 'N' and 'SE' are presumably the modes printed by the
# previous cell - confirm they still match if the dataset changes.
#
# A single ``DataFrame.fillna`` with a per-column mapping is used instead of
# ``data[col].replace(np.nan, ..., inplace=True)``: the latter is chained
# assignment, which is deprecated in pandas 2.x and leaves ``data`` unchanged
# once Copy-on-Write is enabled.
wind_fill_values = {'WindGustDir': 'W', 'WindDir9am': 'N', 'WindDir3pm': 'SE'}
data = data.fillna(value=wind_fill_values)

In [None]:
# Class balance of today's rain flag and of the prediction target.
plt.figure(figsize=(14, 7))
for position, column in enumerate(['RainToday', 'RainTomorrow'], start=1):
    plt.subplot(1, 2, position)
    sns.countplot(x=column, data=data)
plt.show()

In [None]:
# RainToday is a feature: impute it with its most frequent class, matching the
# mode imputation used for the wind-direction columns. Hard-coding 'Yes' is
# only correct if 'Yes' happens to be the majority class. The result is
# assigned back because ``data[col].replace(..., inplace=True)`` is chained
# assignment and does not update ``data`` under pandas Copy-on-Write.
data['RainToday'] = data['RainToday'].fillna(data['RainToday'].mode()[0])

# RainTomorrow is the prediction target: a row without a label carries no
# training signal, so drop it rather than inventing a label for it.
data = data.dropna(subset=['RainTomorrow']).reset_index(drop=True)

In [None]:
# Re-check the per-column missing-value counts after imputation.
data.isnull().sum()

Converting Date into datetime and then splitting it into Day, Month and Year columns.

In [None]:
# Parse the Date strings into datetimes and derive Year / Month / Day columns,
# so calendar position (e.g. season via Month) is available as a numeric feature.
# The Date column itself is kept: later cells plot against it and encode it.
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

In [None]:
# Encode the two rain flags as integers: 'Yes' -> 1, 'No' -> 0.
yes_no_codes = {'Yes': 1, 'No': 0}
for flag_column in ('RainToday', 'RainTomorrow'):
    data[flag_column] = data[flag_column].map(yes_no_codes)

## Date Plot

We are going to plot features against the date. Here, I am going to use data from roughly the last 3 years (the final 950 rows of the dataset).

Reference: [https://www.kaggle.com/siddheshera/rain-in-australia-with-eda-h2o-88-4-auc](https://www.kaggle.com/siddheshera/rain-in-australia-with-eda-h2o-88-4-auc)

#### MinTemp and MaxTemp

In [None]:
# Final 950 rows of the dataset, used for the date-based plots.
df_dateplot = data.iloc[-950:, :]
dates = df_dateplot['Date']

fig, ax = plt.subplots(figsize=[20, 5])
ax.plot(dates, df_dateplot['MinTemp'], color='blue', linewidth=1, label='MinTemp')
ax.plot(dates, df_dateplot['MaxTemp'], color='red', linewidth=1, label='MaxTemp')
# Shade the band between the daily minimum and maximum.
ax.fill_between(dates, df_dateplot['MinTemp'], df_dateplot['MaxTemp'], facecolor='#EBF78F')
ax.set_title('MinTemp vs MaxTemp by Date')
ax.legend(loc='lower left', frameon=False)
plt.show()

* The plot above shows that MinTemp and MaxTemp rise and fall in a regular yearly cycle.
* Weather conditions are always opposite in the two hemispheres. Because Australia is situated in the southern hemisphere, its seasons are reversed relative to the northern hemisphere.
* As you can see, December to February is summer; March to May is autumn; June to August is winter; and September to November is spring.

#### Rainfall

In [None]:
# Rainfall over the final 950 rows of the dataset.
df_dateplot = data.tail(950)

fig, ax = plt.subplots(figsize=[20, 5])
ax.plot(df_dateplot['Date'], df_dateplot['Rainfall'], color='violet', linewidth=2, label='Rainfall')
ax.set_title('Rainfall by Date')
ax.legend(loc='upper left', frameon=False)
plt.show()

* Because Australia is situated in the southern hemisphere, the majority of rainfall occurs between December and March.
* As the plot above shows, Dec-Jan does get a lot of rainfall, but there are also months like Jun-Jul when rain occurs.

#### WindGustSpeed

In [None]:
# Wind gust speed over the final 950 rows of the dataset.
df_dateplot = data.tail(950)

fig, ax = plt.subplots(figsize=[20, 5])
ax.plot(
    df_dateplot['Date'],
    df_dateplot['WindGustSpeed'],
    color='violet',
    linewidth=2,
    label='WindGustSpeed',
)
ax.set_title('WindGustSpeed by Date')
ax.legend(loc='upper left', frameon=False)
plt.show()

* In Australia, wind speed is usually moderate, but the plot above shows that Dec-Feb are the windiest months.

#### WindSpeed9am and WindSpeed3pm 

In [None]:
# Morning and afternoon wind speed in two stacked panels (final 950 rows).
df_dateplot = data.tail(950)

panels = [
    ('WindSpeed9am', 'blue', 'WindSpeed9am by Date'),
    ('WindSpeed3pm', 'green', 'WindSpeed3pm by Date'),
]
plt.figure(figsize=[20, 10])
for row, (column, line_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(2, 1, row)
    plt.plot(df_dateplot['Date'], df_dateplot[column], color=line_color, linewidth=2, label=column)
    plt.legend(loc='upper left', frameon=False)
    plt.title(panel_title)
plt.show()

* WindSpeed9am and WindSpeed3pm are relatively similar around certain months.

#### Humidity9am and Humidity3pm

In [None]:
# Morning vs afternoon humidity on a shared axis (final 950 rows).
df_dateplot = data.tail(950)

fig, ax = plt.subplots(figsize=[20, 5])
for column, line_color in (('Humidity9am', 'violet'), ('Humidity3pm', 'green')):
    ax.plot(df_dateplot['Date'], df_dateplot[column], color=line_color, linewidth=2, label=column)
ax.set_title('Humidity9am vs Humidity3pm by Date')
ax.legend(loc='upper left', frameon=False)
plt.show()

* The plot above shows that humidity is high around Jun-Jul, and during that time there is also a noticeable difference between the humidity at 9am and at 3pm.

#### Pressure9am and Pressure3pm

In [None]:
# Morning vs afternoon pressure on a shared axis (final 950 rows).
df_dateplot = data.tail(950)

fig, ax = plt.subplots(figsize=[20, 5])
for column, line_color in (('Pressure9am', 'violet'), ('Pressure3pm', 'green')):
    ax.plot(df_dateplot['Date'], df_dateplot[column], color=line_color, linewidth=2, label=column)
ax.set_title('Pressure9am vs Pressure3pm by Date')
ax.legend(loc='upper left', frameon=False)
plt.show()

* Pressure is high around the months of Jun-Aug and around Dec-Jan you can see that the pressure is low.
* In a low pressure area the rising air cools and this is likely to condense water vapour and form clouds, and consequently rain.

#### Temp9am and Temp3pm

In [None]:
# Morning vs afternoon temperature, with the gap between them shaded (final 950 rows).
df_dateplot = data.tail(950)
dates = df_dateplot['Date']
temp_morning = df_dateplot['Temp9am']
temp_afternoon = df_dateplot['Temp3pm']

fig, ax = plt.subplots(figsize=[20, 5])
ax.plot(dates, temp_morning, color='blue', linewidth=2, label='Temp9am')
ax.plot(dates, temp_afternoon, color='red', linewidth=2, label='Temp3pm')
ax.fill_between(dates, temp_morning, temp_afternoon, facecolor='#EBF78F')
ax.set_title('Temp9am vs Temp3pm by Date')
ax.legend(loc='lower left', frameon=False)
plt.show()

* As mentioned for the plots above, Dec-Jan are the months when the temperature is high, but they are also the months when the difference between the 9am and 3pm temperatures is small compared to Jun-Aug, when the difference is large.

In [None]:
# Inspect the dtypes again before encoding the remaining non-numeric columns.
data.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the date and the categorical columns so every feature is numeric.
var_mod = ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
le = LabelEncoder()
for column in var_mod:
    # The shared encoder is re-fitted on each column in turn.
    data[column] = le.fit_transform(data[column])

In [None]:
# Separate the prediction target from the feature matrix.
y = data['RainTomorrow']
X = data.drop(columns=['RainTomorrow'])

* Split the data into train and test sets using **train_test_split**.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing, stratified on the target so both sets
# keep the same class balance. A fixed random_state makes the split, and every
# metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.33, random_state=42
)

#### Using Pearson Correlation

In [None]:
# Pairwise correlations between the training features, drawn as an annotated heatmap.
corr = X_train.corr()
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(corr, annot=True, cmap=plt.cm.CMRmap_r, ax=ax)
plt.show()

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    For every pair of columns the absolute correlation coefficient is compared
    with ``threshold``. When it is exceeded, the column that appears *later*
    in ``dataset`` is flagged, so the first column of each correlated group
    is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature matrix; correlations are computed with ``dataset.corr()``.
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the flagged columns (empty if no pair exceeds the threshold).
    """
    # Use absolute values so strong negative correlations are flagged as well.
    corr_matrix = dataset.corr().abs()
    columns = corr_matrix.columns
    col_corr = set()
    for i in range(len(columns)):
        # Only look left of the diagonal: each pair is examined once and a
        # column is never compared with itself.
        if (corr_matrix.iloc[i, :i] > threshold).any():
            col_corr.add(columns[i])
    return col_corr

In [None]:
# Flag training features whose correlation with an earlier feature exceeds 0.7,
# then show how many were flagged.
corr_features = correlation(X_train, threshold=0.7)
len(corr_features)

In [None]:
# Display the names of the features flagged as highly correlated.
corr_features

In [None]:
# Remove the flagged features from both splits. ``DataFrame.drop`` returns a
# new frame, so the result has to be assigned back; otherwise the columns stay
# in place and the model is still trained on them.
columns_to_drop = list(corr_features)
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split: re-fitting on the test data
# would scale the two splits with different means and variances and would
# leak test-set statistics into the evaluation.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Model training

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted tree classifier: 1000 estimators with a maximum depth of 16.
xgb_params = {'max_depth': 16, 'n_estimators': 1000}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

In [None]:
# Hard class predictions, plus the predicted probability of class 1 (column 1).
pred = xgb.predict(X_test)
probs = xgb.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import accuracy_score , classification_report, roc_auc_score
# Score the held-out predictions: accuracy uses the hard labels, ROC AUC the
# class-1 probabilities; the report adds per-class precision, recall and F1.
accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, probs)
for template, score in (("Accuracy = {}", accuracy), ("ROC score = {}", roc_auc)):
    print(template.format(score))
print(classification_report(y_test, pred))