<img src='https://www.greenmebrasil.com/wp-content/uploads/2020/10/montanhas-chuva.jpg'>

* <a href="#import">Import</a>

* <a href="#missing">Missing Values</a>

* <a href="#functions">Functions</a>

* <a href="#eda">EDA</a>

* <a href="#h2o">H2O</a>

<a name="import"></a>

# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">Import</p>

In [None]:
import pandas as pd

import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(16,8)})
sns.set(font_scale=1.3)
plt.style.use('fivethirtyeight')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import h2o

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw weather CSV into a DataFrame. The path is relative, so it
# resolves against the notebook's working directory.
data = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

<a name="missing"></a>

# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">Missing Values</p>

In [None]:
# (rows, columns) of the frame before any cleaning.
data.shape

In [None]:
# Percentage of missing values per column (null count / row count * 100).
data.isnull().sum().div(len(data)).mul(100)

<p style='font-family: "Lucida Console", "Courier New", monospace;font-size:125%; '>The Evaporation, Sunshine, Cloud9am and Cloud3pm columns have too many missing values, so they will be deleted.</p>

In [None]:
# Drop the four columns with too many missing values.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the columns are already gone.
data = data.drop(columns=['Evaporation','Sunshine','Cloud9am','Cloud3pm'], errors='ignore')

In [None]:
# Preview the frame again after the column drop.
data.head()

<a name="functions"></a>

# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">Functions</p>

In [None]:
# Fill missing numeric values with the column median
def missing_value_number(data):
    """Impute missing values in the int/float columns of ``data`` with the median.

    Each selected column is filled with its own median. The frame is modified
    in place and the same object is returned, so both
    ``missing_value_number(df)`` and ``df = missing_value_number(df)`` work.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; columns of other dtypes are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with numeric gaps filled.
    """
    numeric_names = data.select_dtypes(['int', 'float']).columns
    for name in numeric_names:
        column = data[name]
        data[name] = column.fillna(column.median())
    return data

# Forward-fill missing values in object (string) columns
def missing_values_object(data):
    """Forward-fill missing values in the object-dtype columns of ``data``.

    Each gap takes the most recent non-missing value above it in the same
    column. A missing value in the very first row(s) has nothing to copy from
    and therefore stays missing. The frame is modified in place and the same
    object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; non-object columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with object columns forward-filled.
    """
    for col in data.select_dtypes(['object']):
        # Series.ffill() replaces the deprecated fillna(method='ffill') form.
        data[col] = data[col].ffill()
    return data

# Encoding
def encoder(data):
    """Label-encode every object-dtype column of ``data``.

    The columns are overwritten in place with the output of
    ``LabelEncoder.fit_transform`` and the same frame is returned. A single
    encoder instance is refitted for each column and is not returned, so the
    fitted mappings are not available to the caller afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object columns are to be encoded.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with object columns replaced by their codes.
    """
    label_encoder = LabelEncoder()
    object_names = data.select_dtypes('object').columns
    for name in object_names:
        data[name] = label_encoder.fit_transform(data[name])
    return data

# Plot
def plt_percente(plot, feature):
    """Label each bar of a bar/count plot with its percentage and show the figure.

    Parameters
    ----------
    plot : matplotlib Axes
        Axes whose ``patches`` (the bars) are annotated; the notebook passes
        the Axes used by ``sns.countplot``.
    feature : sized collection
        The plotted data; ``len(feature)`` is the denominator of each
        percentage, so a bar's label is ``height / len(feature)``.
    """
    total = len(feature)
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        # Anchor the label at the top of the bar, nudged slightly left of centre.
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + p.get_height()
        # Annotate the Axes that was passed in. The previous code used a global
        # `ax`, so it only worked when the caller happened to name its Axes
        # `ax`, and could silently annotate a different plot.
        plot.annotate(percentage, (x, y), size = 18)
    plt.show()

In [None]:
# Impute the numeric columns first, then the object columns.
data = missing_values_object(missing_value_number(data))

<a name="eda"></a>

# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">EDA</p>

In [None]:
# Summary statistics, one row per described column: bars on the mean,
# colour gradients on the standard deviation and on the median ('50%').
(
    data.describe().T.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Number of observations per wind-gust direction.
gust_fig, gust_ax = plt.subplots(figsize=(16, 8))
sns.countplot(data=data, x='WindGustDir', ax=gust_ax)
gust_ax.set_title('Gust of Wind');

In [None]:
# Number of observations per 9am wind direction.
dir9_fig, dir9_ax = plt.subplots(figsize=(16, 8))
sns.countplot(data=data, x='WindDir9am', ax=dir9_ax)
dir9_ax.set_title('Wind Direction 9am');

In [None]:
# Number of observations per 3pm wind direction.
dir3_fig, dir3_ax = plt.subplots(figsize=(16, 8))
sns.countplot(data=data, x='WindDir3pm', ax=dir3_ax)
dir3_ax.set_title('Wind Direction 3pm');

In [None]:
# Count of observations per RainToday value, each bar labelled with its
# percentage of all rows.
rain_today_fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='RainToday', ax=ax)

plt_percente(ax, data['RainToday'])

In [None]:
# Count of observations per RainTomorrow value (the prediction target), each
# bar labelled with its percentage of all rows.
rain_tomorrow_fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='RainTomorrow', ax=ax)

plt_percente(ax, data['RainTomorrow'])

In [None]:
# Parse the Date column once, then derive day / month / year features from it.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data['Day'] = parsed_dates.dt.day
data['Month'] = parsed_dates.dt.month
data['Year'] = parsed_dates.dt.year

In [None]:
# Label-encode the remaining object columns. 'Date' was converted to datetime
# in the previous cell, so it is no longer object dtype and is not encoded.
data = encoder(data)

In [None]:
# Correlation heatmap over the numeric columns.
# `data` still contains the datetime 'Date' column, and since pandas 2.0
# DataFrame.corr() no longer restricts itself to numeric columns by default,
# so the numeric subset is selected explicitly. select_dtypes is used rather
# than corr(numeric_only=True) because it also works on older pandas releases.
plt.figure(figsize=(18,15))
sns.heatmap(data.select_dtypes(include='number').corr(), cmap=plt.cm.Reds,
            cbar_kws={'shrink': .6}, square=True,
            annot=True, fmt='.2f', linewidths=.8)
plt.show()

In [None]:
# Histogram of every column at positions 2..16 of `data`, each with a fitted
# normal curve overlaid (fit=stats.norm).
col = data.columns[2:17]
lenght = len(col)

# The grid needs an integer row count. The previous `lenght/2` is a float in
# Python 3, which current matplotlib rejects; ceil-dividing by the number of
# grid columns gives exactly enough rows for all features.
n_grid_cols = 3
n_grid_rows = max(1, -(-lenght // n_grid_cols))

# Build the whole grid in one call instead of layering plt.subplot() on top of
# the single Axes that a bare plt.subplots() would create.
hist_fig, hist_axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20,30), squeeze=False)
hist_fig.subplots_adjust(wspace=.2, hspace=.5)  # same for every panel, so set once
hist_axes = hist_axes.ravel()

for i, j in zip(col, range(lenght)):
    # NOTE: distplot is deprecated in recent seaborn releases; it is kept here
    # because it provides the `fit=` overlay used by this figure.
    sns.distplot(data[i], kde=False, fit=stats.norm, ax=hist_axes[j])
    hist_axes[j].set_title(i)

# Hide grid cells that have no feature to show.
for spare_ax in hist_axes[lenght:]:
    spare_ax.set_visible(False)

In [None]:
# Work on the first 730 rows only; `df` is reused by the time-series plots
# in the following cells.
df = data.iloc[:730]
rain_fig, rain_ax = plt.subplots(figsize=(18, 6))
rain_ax.plot(df['Date'], df['Rainfall'], color='violet')
rain_ax.set_title('Rainfall by Date');

In [None]:
# Wind-gust speed over time for the `df` subset.
gust_ts_fig, gust_ts_ax = plt.subplots(figsize=(18, 6))
gust_ts_ax.plot(df['Date'], df['WindGustSpeed'], color='violet')
gust_ts_ax.set_title('WindGustSpeed by Date');

In [None]:
# Daily minimum and maximum temperature over time, with the range between
# the two curves shaded.
temp_fig, temp_ax = plt.subplots(figsize=(18, 6))
dates = df['Date']
low, high = df['MinTemp'], df['MaxTemp']
temp_ax.plot(dates, low, color='blue', linewidth=1, label='MinTemp')
temp_ax.plot(dates, high, color='red', linewidth=1, label='MaxTemp')
temp_ax.fill_between(dates, low, high, facecolor='#EBF78F')
temp_ax.set_title('MinTemp vs MaxTemp by Date')
temp_ax.legend(loc='lower left', frameon=False);

In [None]:
# 9am vs 3pm wind speed over time for the `df` subset.
wind_fig, wind_ax = plt.subplots(figsize=(18, 6))
for series_name, line_color in (('WindSpeed9am', 'blue'), ('WindSpeed3pm', 'green')):
    wind_ax.plot(df['Date'], df[series_name], color=line_color, linewidth=2, label=series_name)
wind_ax.legend(frameon=False)
wind_ax.set_title('WindSpeed9am vs WindSpeed3pm by Date');

In [None]:
# 9am vs 3pm atmospheric pressure over time for the `df` subset.
plt.subplots(figsize=(18,6))
# The legend labels now name the pressure series; they were previously
# copy-pasted from the wind-speed plot ('WindSpeed9am' / 'WindSpeed3pm').
plt.plot(df['Date'], df['Pressure9am'],color='blue', linewidth=2, label= 'Pressure9am')
plt.plot(df['Date'], df['Pressure3pm'],color='green', linewidth=2, label= 'Pressure3pm')
plt.legend(frameon=False)
plt.title('Pressure9am vs Pressure3pm by Date');

In [None]:
# Box plot of every column at positions 2..16 of `data`, to inspect outliers.
columns = data.columns[2:17]
length = len(columns)

# The grid needs an integer row count. The previous `length/2` is a float in
# Python 3, which current matplotlib rejects; ceil-dividing by the number of
# grid columns gives exactly enough rows for all features.
n_box_cols = 3
n_box_rows = max(1, -(-length // n_box_cols))

# Build the whole grid in one call instead of layering plt.subplot() on top of
# the single Axes that a bare plt.subplots() would create.
box_fig, box_axes = plt.subplots(n_box_rows, n_box_cols, figsize=(20,30), squeeze=False)
box_fig.subplots_adjust(wspace=.2, hspace=.5)  # same for every panel, so set once
box_axes = box_axes.ravel()

for i, j in zip(columns, range(length)):
    sns.boxplot(y=data[i], ax=box_axes[j])
    box_axes[j].set_title(i)

# Hide grid cells that have no feature to show.
for spare_box_ax in box_axes[length:]:
    spare_box_ax.set_visible(False)

<p style='font-family: "Lucida Console", "Courier New", monospace;font-size:125%; '>The outliers will not be removed, because in periods of rain we will have values much higher than in periods without rain.</p>

<a name="h2o"></a>

# <p style="background-color:#1c56c9; font-family:newtimeroman; font-size:150%; text-align:center; border-radius:  25px; color:#ffffff; padding-top:5px; padding-bottom:5px;">H2O</p>

In [None]:
# Start (or connect to) an H2O instance before any H2O frame/model is created.
# NOTE(review): nthreads=-1 is presumably "use all available cores" — confirm against the H2O docs.
h2o.init(nthreads= -1)

In [None]:
# Convert the prepared pandas frame into an H2OFrame.
# NOTE(review): this rebinds `data` from a pandas DataFrame to an H2OFrame, so
# the pandas cells earlier in the notebook cannot be re-run after this point
# without reloading the data.
data = h2o.H2OFrame(data)

In [None]:
# Convert the label-encoded categorical columns (including the target,
# RainTomorrow) to H2O factor columns.
for factor_name in ('Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm',
                    'RainToday', 'RainTomorrow'):
    data[factor_name] = data[factor_name].asfactor()

In [None]:
# Predictor columns passed to the models as `x`; the target 'RainTomorrow'
# and the raw 'Date' column are deliberately not listed.
features = [
    'Location',
    'MinTemp', 'MaxTemp',
    'Rainfall',
    'WindGustDir', 'WindGustSpeed',
    'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Temp9am', 'Temp3pm',
    'RainToday',
    'Day', 'Month', 'Year',
]

In [None]:
# Random 80/20 split into training and test frames.
# A fixed seed makes the split (and therefore every metric reported in the
# following cells) reproducible across kernel restarts.
split = data.split_frame(ratios = [0.8], seed = 42)

train = split[0]
test = split[1]

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Binomial GLM for the RainTomorrow target, with 5-fold stratified
# cross-validation and class balancing. The cross-validation predictions are
# kept on the model. A fixed seed makes the fold assignment and class
# balancing repeatable between runs.
glm_default = H2OGeneralizedLinearEstimator(family = 'binomial', model_id = 'glm_default',
                                            keep_cross_validation_predictions = True, nfolds =5,
                                            fold_assignment="stratified", balance_classes=True,
                                            seed = 42)

glm_default.train(x = features, y = 'RainTomorrow', training_frame = train)

In [None]:
# Plot the GLM's variable importances.
# NOTE(review): `variable` is not used again in the visible cells, and the
# plt.figure() call may leave an empty extra figure if varimp_plot() opens its
# own figure — confirm.
plt.figure(figsize=(5,5))
variable = glm_default.varimp_plot()
plt.show()

In [None]:
# GLM performance on the training data, followed by the performance plot.
performace = glm_default.model_performance(train=True)
performace.plot();

In [None]:
# GLM performance on the held-out `test` frame, followed by the performance plot.
performace = glm_default.model_performance(test)
performace.plot()

In [None]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Random forest for the RainTomorrow target, with the same cross-validation
# setup as the GLM: 5 stratified folds, class balancing, and the
# cross-validation predictions kept on the model. A fixed seed makes the
# forest's random sampling and the fold assignment repeatable between runs.
drf_default = H2ORandomForestEstimator(model_id='drf_default', keep_cross_validation_predictions=True,
                                       nfolds=5, fold_assignment="stratified", balance_classes=True,
                                       seed=42)

drf_default.train(x = features, y='RainTomorrow', training_frame=train)

In [None]:
# Plot the random forest's variable importances.
# NOTE(review): `variable` is not used again in the visible cells, and the
# plt.figure() call may leave an empty extra figure if varimp_plot() opens its
# own figure — confirm.
plt.figure(figsize=(5,5))
variable = drf_default.varimp_plot()
plt.show()

In [None]:
# Random-forest performance on the training data, followed by the performance plot.
performace = drf_default.model_performance(train=True)
performace.plot()

In [None]:
# Random-forest performance on the held-out `test` frame, followed by the performance plot.
performace = drf_default.model_performance(test)
performace.plot()