
# Rain in Australia



### Context
Predict whether or not it will rain tomorrow by training a binary
classification model on the target variable RainTomorrow.

### Content
This dataset contains daily weather observations from numerous Australian weather stations.

The target variable RainTomorrow means: Did it rain the next day? Yes or No.

Note: You should exclude the variable RISK_MM when training a binary
classification model. Not excluding it will leak the answers to your
model and undermine its real predictive value.

### Acknowledgements
Observations were drawn from numerous weather stations.
The daily observations are available from http://www.bom.gov.au/climate/data.
Copyright Commonwealth of Australia 2010, Bureau of Meteorology.


In [None]:

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sn
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline



## Read dataset


In [None]:

# Load the weather observations from the CSV input file.
dataset = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')


In [None]:

# Preview the first rows of the frame.
dataset.head()


In [None]:

# Column dtypes and non-null counts.
dataset.info()


In [None]:

# Summary statistics of the frame's columns.
dataset.describe()



## Data Exploration


In [None]:

# Remove the date stamp and RISK_MM (the notebook introduction warns that
# RISK_MM leaks the answer) before any further analysis.
columns_to_remove = ['Date', 'RISK_MM']
dataset = dataset.drop(columns=columns_to_remove)


In [None]:

# Name of the label column that the models predict.
target = 'RainTomorrow'


In [None]:

# Keep the column index around; the feature list is derived from it.
col = dataset.columns       # .columns holds the column names of the frame
print(col)


In [None]:

# Every column except the target is a feature. Dropping the target by name
# (rather than slicing off the last position) stays correct even if the
# column order of the input file changes.
features = col.drop(target)


In [None]:

# Number of rows per target class.
dataset[target].value_counts()


In [None]:

# Bar chart of the row count for each target class.
sns.countplot(x=target, data=dataset, palette="bwr")
plt.show()



In [None]:

# Share of rows labelled 'Yes' and 'No', as a percentage of all rows
# (rows with a missing label count towards the total but neither class).
n_rows = len(dataset[target])
countRain = int((dataset[target] == 'Yes').sum())
countNotRain = int((dataset[target] == 'No').sum())
rain_pct = countRain / n_rows * 100
not_rain_pct = countNotRain / n_rows * 100
print("Rain Tomorrow: {:.2f}%".format(rain_pct))
print("Not rain tomorrow: {:.2f}%".format(not_rain_pct))


In [None]:

# Encode the two Yes/No columns as 1/0.
# The result is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on a column selection: that chained form can
# operate on a temporary copy and leave `dataset` unchanged.
# `pd.to_numeric` makes the resulting dtype explicitly numeric (missing values
# stay NaN) and raises if a value other than Yes/No is still present.
yes_no_encoding = {'No': 0, 'Yes': 1}
for binary_column in ('RainToday', 'RainTomorrow'):
    dataset[binary_column] = pd.to_numeric(
        dataset[binary_column].replace(yes_no_encoding))


In [None]:

# One 20-bin histogram per feature column that pandas can plot.
dataset[features].hist(figsize=(12,12), bins=20)
plt.show()


In [None]:

# Pairwise relationships between a handful of weather measurements, coloured
# by the target class: histograms on the diagonal, scatter plots elsewhere.
# `height` replaces the old `size` keyword, which current seaborn rejects.
# `dropna=True` removes missing values before plotting (plt.hist cannot bin
# NaN); it is passed explicitly because the default differs across versions.
g = sns.PairGrid(data=dataset, vars=['MinTemp', 'MaxTemp', 'Rainfall',
                                     'Evaporation', 'Sunshine','WindGustSpeed'
                                     ], hue=target, height=3, palette='Set1',
                 dropna=True)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()



## Data Analysis


In [None]:

# Correlation heatmap with the target placed in the first row/column.
# The frame is built in one selection (no column assignment into a slice of
# `dataset`) and restricted to numeric columns, because `.corr()` cannot
# handle the text columns (location, wind directions).
data_map = dataset[[target, *features]].select_dtypes(include='number')
plt.figure(figsize=(15,15))
sns.heatmap(data_map.corr(), annot=True, square=True, cmap='coolwarm')
plt.show()


In [None]:

# List the strongly correlated column pairs (|r| > 0.9).
# Only numeric columns can be correlated.
numeric_corr = dataset.select_dtypes(include='number').corr()
# Keep the cells strictly above the diagonal so each pair appears exactly once
# and a column's correlation with itself (always 1.0) is excluded. This
# replaces de-duplicating on the correlation value, which would also discard
# distinct pairs that happen to share the same value.
above_diagonal = np.triu(np.ones(numeric_corr.shape, dtype=bool), k=1)
pair_corr = numeric_corr.where(above_diagonal).stack().dropna().sort_values()
list_cor = pd.DataFrame(pair_corr)
list_cor.columns = ['correlation_index']
list_cor[list_cor['correlation_index'].abs() > 0.9]



#### Missing data


In [None]:

# Missing values per feature: absolute count and percentage of all rows,
# both ordered from most to least affected.
null_counts = dataset[features].isnull().sum()
row_count = dataset[features].shape[0]
total = null_counts.sort_values(ascending=False)
percent = (null_counts / row_count * 100).sort_values(ascending=False)
missing = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing




## Clean Dataset


In [None]:

# Show rows that repeat an earlier row exactly (all columns equal).
duplicated_data = dataset.duplicated()
dataset[duplicated_data]


In [None]:

# Remove repeated rows but keep the first occurrence of each one.
# `keep=False` would delete every member of a duplicate group, discarding
# those observations entirely instead of de-duplicating them.
# NOTE(review): 'Date' was dropped earlier, so identical rows may be distinct
# days with the same readings - confirm that removing them is intended.
dataset = dataset.drop_duplicates(keep='first')


In [None]:

# Re-check after the clean-up: this selection should now be empty.
duplicated_data= dataset.duplicated()
dataset[duplicated_data]



## Outliers in numerical variables


In [None]:

def max_value(df3, variable, top):
    """Cap one column of a frame at an upper bound.

    Returns a NumPy array with the values of ``df3[variable]`` in which every
    entry greater than ``top`` is replaced by ``top``. Entries that do not
    compare greater (including NaN) are passed through unchanged. ``df3``
    itself is not modified.
    """
    values = df3[variable]
    above_cap = values > top
    return np.where(above_cap, top, values)


In [None]:

# find outliers for Rainfall variable

# Outer fences for Rainfall: quartiles -/+ 3 x the interquartile range.
rainfall_q1 = dataset['Rainfall'].quantile(0.25)
rainfall_q3 = dataset['Rainfall'].quantile(0.75)
IQR = rainfall_q3 - rainfall_q1
Lower_fence = rainfall_q1 - (IQR * 3)
Upper_fence = rainfall_q3 + (IQR * 3)
rainfall_message = 'Rainfall outliers are values < {lowerboundary} or > {upperboundary}'
print(rainfall_message.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))


In [None]:

# find outliers for Evaporation variable

# Outer fences for Evaporation: quartiles -/+ 3 x the interquartile range.
evaporation_q1 = dataset['Evaporation'].quantile(0.25)
evaporation_q3 = dataset['Evaporation'].quantile(0.75)
IQR = evaporation_q3 - evaporation_q1
Lower_fence = evaporation_q1 - (IQR * 3)
Upper_fence = evaporation_q3 + (IQR * 3)
evaporation_message = 'Evaporation outliers are values < {lowerboundary} or > {upperboundary}'
print(evaporation_message.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))


In [None]:

# find outliers for WindSpeed9am variable

# Outer fences for WindSpeed9am: quartiles -/+ 3 x the interquartile range.
wind9_q1 = dataset['WindSpeed9am'].quantile(0.25)
wind9_q3 = dataset['WindSpeed9am'].quantile(0.75)
IQR = wind9_q3 - wind9_q1
Lower_fence = wind9_q1 - (IQR * 3)
Upper_fence = wind9_q3 + (IQR * 3)
wind9_message = 'WindSpeed9am outliers are values < {lowerboundary} or > {upperboundary}'
print(wind9_message.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))


In [None]:

# find outliers for WindSpeed3pm variable

# Outer fences for WindSpeed3pm: quartiles -/+ 3 x the interquartile range.
wind3_q1 = dataset['WindSpeed3pm'].quantile(0.25)
wind3_q3 = dataset['WindSpeed3pm'].quantile(0.75)
IQR = wind3_q3 - wind3_q1
Lower_fence = wind3_q1 - (IQR * 3)
Upper_fence = wind3_q3 + (IQR * 3)
wind3_message = 'WindSpeed3pm outliers are values < {lowerboundary} or > {upperboundary}'
print(wind3_message.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))


In [None]:

# Cap each listed column at a fixed upper bound.
# NOTE(review): the caps are hard-coded; presumably they are the upper fences
# printed by the cells above - verify if the data changes.
upper_caps = {
    'Rainfall': 3.2,
    'Evaporation': 21.8,
    'WindSpeed9am': 55,
    'WindSpeed3pm': 57,
}
for capped_column, cap in upper_caps.items():
    dataset[capped_column] = max_value(dataset, capped_column, cap)



## Data Preprocessing



### Handling Class Imbalance


In [None]:
# Balance the classes: resample the rain rows (label 1) with replacement until
# there are as many of them as no-rain rows (label 0), then stack both groups.
# NOTE(review): this happens before the train/validation/test split further
# down, so copies of the same resampled row can end up in both the training
# and the held-out sets, which would inflate the reported scores. Consider
# oversampling only the training split.
dry_mask = dataset.RainTomorrow == 0
rain_mask = dataset.RainTomorrow == 1
no = dataset[dry_mask]
yes = dataset[rain_mask]
yes_oversampled = resample(yes, n_samples=len(no), replace=True, random_state=123)
oversampled = pd.concat([no, yes_oversampled])

# Bar chart of the class proportions after balancing.
fig = plt.figure(figsize=(8, 5))
class_share = oversampled.RainTomorrow.value_counts(normalize=True)
class_share.plot(kind='bar', color=['skyblue', 'navy'], alpha=0.9, rot=0)
plt.title('RainTomorrow Indicator No(0) and Yes(1) after Oversampling (Balanced Dataset)')
plt.show()


In [None]:

# Separate the label column from the predictors of the balanced frame.
X = oversampled.drop(columns=[target])
y = oversampled[target]


In [None]:

# Split the predictor names into text (object dtype) columns and the rest.
# Both lists follow the column order of X. Building the second list through a
# set difference would give an arbitrary order that can change between runs,
# and that order determines the column layout of the encoded frame.
categorical_columns = [name for name in X.columns if X[name].dtype == 'O']
numerical_columns = [name for name in X.columns if name not in categorical_columns]
categorical_columns


In [None]:

# One-hot encode the four text columns and append the indicator columns to the
# non-text ones; each indicator is named "<source column>_<category>".
dummy_sources = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
encoded_parts = [pd.get_dummies(X[name], prefix=name) for name in dummy_sources]
X = pd.concat([X[numerical_columns], *encoded_parts], axis=1)


X.head()

In [None]:

# Recompute the column lists for the encoded frame.
# `select_dtypes` is the public way to pick numeric columns; bool is included
# so one-hot indicator columns count as numeric whichever dtype get_dummies
# produced. Both lists keep the column order of X.
numerical_columns = list(X.select_dtypes(include=['number', 'bool']).columns)
numeric_names = set(numerical_columns)
categorical_columns = [name for name in X.columns if name not in numeric_names]
categorical_columns


In [None]:

# Preprocessing for numeric inputs: fill missing values with the column
# median, then rescale with MinMaxScaler (the step is named 'std_scaler', but
# the estimator is a min-max scaler).
numeric_steps = [
    ('data_filler', SimpleImputer(strategy="median")),
    ('std_scaler', MinMaxScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)


In [None]:

# Apply the numeric preprocessing to the columns listed in numerical_columns.
numeric_branch = ("numerical", numerical_pipeline, numerical_columns)
transformer = ColumnTransformer(transformers=[numeric_branch])


In [None]:
# Two-stage split: hold out 30% of the rows as the validation set, then split
# the remaining 70% again (70/30) into train and test.
# The intermediate 70% gets its own names instead of overwriting X and y, so
# re-running this cell does not split an already reduced X a second time.
X_dev, X_validation, y_dev, y_validation = train_test_split(X, y, test_size = 0.3, random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size = 0.3, random_state = 0)


In [None]:

# Class balance of the training labels.
y_train.value_counts()


In [None]:

# Class balance of the test labels.
y_test.value_counts()

In [None]:

# Class balance of the validation labels.
y_validation.value_counts()


## Models


In [None]:

def plot_matrix(y_test, y_pred):
    """Draw the confusion matrix of a set of predictions as a heatmap.

    Rows are the actual classes, columns the predicted classes, and each cell
    shows the number of samples as an integer.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Notes
    -----
    Opens a new matplotlib figure and changes seaborn's global font scale.
    """
    # Use every label seen in either input so the matrix and its axis labels
    # always have the same size, even if a predicted class never occurs among
    # the true labels.
    labels = np.unique(np.concatenate([np.asarray(y_test).ravel(),
                                       np.asarray(y_pred).ravel()]))
    data = confusion_matrix(y_test, y_pred, labels=labels)
    df_cm = pd.DataFrame(data, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize=(5, 3))
    sn.set(font_scale=1.4)  # label size
    # fmt='d' prints plain integer counts instead of scientific notation.
    sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='d',
               annot_kws={"size": 10})  # annotation font size


In [None]:

def train_ensemble_models(X, y):
    """Run the reducer comparison for each candidate classifier.

    Every candidate is handed to ``execute_pipeline`` together with its label,
    which is used as the plot title.

    Parameters
    ----------
    X : feature matrix passed through to ``execute_pipeline``.
    y : labels passed through to ``execute_pipeline``.

    Returns
    -------
    dict
        Maps each classifier label to whatever ``execute_pipeline`` returned
        for it, so the caller can keep the results.
    """
    # RandomForestClassifier is deliberately not part of this comparison.
    candidates = {
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=0),
        'XGBClassifier': xgb.XGBClassifier(),
    }

    results = {}
    for label, clf in candidates.items():
        results[label] = execute_pipeline(clf, X, y, label)
    return results


In [None]:

def execute_pipeline(clf, X, y, title):
    """Compare PCA and SelectKBest as reducers in front of ``clf`` and plot it.

    Builds a pipeline (module-level ``transformer`` -> reducer -> ``clf``),
    grid-searches both reducers over the feature counts in
    ``N_FEATURES_OPTIONS`` with 5-fold cross-validation scored on accuracy,
    and draws a bar chart of the mean train and test scores.

    Parameters
    ----------
    clf : estimator used as the final pipeline step.
    X, y : data the grid search is fitted on.
    title : str
        Title of the chart.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    pipe = Pipeline([
        ('transformer',transformer),
        ('reduce_dim', 'passthrough'),
        ('classify', clf)
    ])

    N_FEATURES_OPTIONS = [16, 64]

    param_grid = [
          {
            'reduce_dim': [PCA()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest()],
            'reduce_dim__k': N_FEATURES_OPTIONS
        },
    ]
    reducer_labels = ['PCA', 'KBest']

    grid = GridSearchCV(pipe,  param_grid=param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1, return_train_score=True)
    grid.fit(X, y)

    # cv_results_ is flat: one entry per candidate, in param_grid order.
    # Reshape BOTH score arrays to (reducer, feature count) so row i holds the
    # scores of reducer i. Indexing the flat train scores with i would pick a
    # single scalar belonging to the first reducer.
    score_shape = (len(reducer_labels), len(N_FEATURES_OPTIONS))
    mean_train_scores = np.array(grid.cv_results_['mean_train_score']).reshape(score_shape)
    mean_scores = np.array(grid.cv_results_['mean_test_score']).reshape(score_shape)
    bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) * (len(reducer_labels) + 1) + .5)

    plt.figure()
    COLORS = 'bgrcmyk'
    rows = zip(reducer_labels, mean_train_scores, mean_scores)
    for i, (label, train_scores, test_scores) in enumerate(rows):
        # The test bar is drawn over the train bar at the same position; the
        # part of the train bar rising above it shows the train/test gap.
        plt.bar(bar_offsets + i, train_scores, label='{} train'.format(label),alpha=.7)
        plt.bar(bar_offsets + i, test_scores, label='{} test'.format(label), color=COLORS[i])

    plt.title(title)
    plt.xlabel('Number of features')
    plt.xticks(bar_offsets + len(reducer_labels) / 2, N_FEATURES_OPTIONS)
    plt.ylabel('Classification accuracy')
    plt.ylim((0, 1))
    plt.legend(bbox_to_anchor=(0,1), loc="upper right", bbox_transform=plt.gcf().transFigure)
    plt.show()

    return grid


In [None]:

def train_best_model(transformer, clf, parameters, k_best, X_train, y_train, X_test, y_test, X_validation, y_validation):
    """Tune ``clf`` with a grid search and report on the validation set.

    A pipeline (``transformer`` -> ``SelectKBest(k=k_best)`` -> ``clf``) is
    tuned with 5-fold cross-validation on the training data, scored on
    accuracy. The best parameters are printed, then the best pipeline is
    evaluated on the validation data: a classification report is printed and
    the confusion matrix is plotted.

    Parameters
    ----------
    transformer : preprocessing step placed first in the pipeline.
    clf : estimator to tune; grid keys must use the ``clf__`` prefix.
    parameters : dict
        Parameter grid for ``GridSearchCV``.
    k_best : int
        Number of features kept by ``SelectKBest``.
    X_train, y_train : data used for the grid search and the final fit.
    X_test, y_test : accepted for interface compatibility; not used here.
    X_validation, y_validation : held-out data used for the report.

    Returns
    -------
    Pipeline
        The best pipeline found, fitted on ``X_train``.
    """
    pipeline = Pipeline([
        ('transformer', transformer),
        ('reduce_dim', SelectKBest(k=k_best)),
        ('clf', clf)])
    search = GridSearchCV(pipeline,
                          parameters,
                          scoring='accuracy',
                          cv=5)
    search.fit(X_train, y_train)
    print(search.best_params_)

    # With the default refit=True, best_estimator_ is the complete pipeline
    # already refitted on X_train with the best parameters. Predicting through
    # it applies the same preprocessing and feature selection, so fitting the
    # steps again by hand is unnecessary.
    best_pipeline = search.best_estimator_
    y_predict = best_pipeline.predict(X_validation)

    # classification_report expects (y_true, y_pred); the reverse order swaps
    # precision and recall in the printed table.
    print(classification_report(y_validation, y_predict))
    plot_matrix(y_validation, y_predict)

    return best_pipeline


In [None]:

# Run the reducer comparison on the training split and keep whatever
# train_ensemble_models returns.
grid_result = train_ensemble_models(X_train, y_train)


In [None]:

def plot_best_model(model, k_best, parameters):
    """Tune and evaluate ``model`` on the notebook's data splits.

    Thin wrapper around ``train_best_model`` that supplies the module-level
    ``transformer`` and the train / test / validation splits.
    """
    train_best_model(
        transformer=transformer,
        clf=model,
        parameters=parameters,
        k_best=k_best,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        X_validation=X_validation,
        y_validation=y_validation,
    )


In [None]:

# Candidate estimators and their grids. Every key carries the 'clf__' prefix
# because the estimator is the pipeline step named 'clf'.

# Decision tree
model_dt = DecisionTreeClassifier(random_state=42)
params_dt = dict(
    clf__max_depth=[5, 16],
    clf__max_features=['sqrt'],
)

# Random forest
model_rf = RandomForestClassifier(random_state=42)
params_rf = dict(
    clf__max_depth=[5, 16],
    clf__min_samples_leaf=[1, 5],
    clf__min_samples_split=[2, 5],
    clf__n_estimators=[10, 100],
)

# XGBoost
model_xgb = xgb.XGBClassifier()
params_xgb = dict(
    clf__n_estimators=[10, 100],
    clf__max_depth=[8, 16],
)



In [None]:

# Decision tree, keeping the 16 best features.
plot_best_model(model_dt, 16, params_dt)


In [None]:

# Random forest, keeping the 16 best features.
plot_best_model(model_rf, 16, params_rf)


In [None]:

# XGBoost, keeping the 16 best features.
plot_best_model(model_xgb, 16, params_xgb)


## Conclusion



**Decision Tree Classifier**: 81.00%

**Random Forest Classifier**: 87.00%

**XGB Classifier**: 90.00%




We noticed that the dataset has a lot of missing data and an imbalance
between classes, so feature engineering work is necessary.
In conclusion, of the 3 models analyzed, XGBClassifier, with 90% accuracy
and acceptable recall, had the best results, suggesting that it was not overfit.
For future work, I would use more evaluation techniques such as ROC analysis, or
other models from the literature such as logistic regression and neural networks.