# Rainfall Prediction with XGBoost
This notebook trains an XGBoost classifier on Australian weather observations to predict whether it will rain on the following day.

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from xgboost import XGBClassifier

from matplotlib import pyplot as plt

## Load the Data
Load the data and convert/engineer useful columns. In particular, we'll convert the wind direction strings into sine and cosine values. 

In [None]:
# Read the raw CSV, then derive the parsed date and an integer code per location.
# 'Date' is read as a string so it can be parsed with an explicit format, and
# 'Location' is read as a categorical so its category codes are available.
rain_df = (
    pd.read_csv(
        '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv',
        dtype={'Date': str, 'Location': 'category'},
    )
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'),
        'Location Index': lambda frame: frame['Location'].cat.codes,
    })
)



def dir_string_to_theta(dir_string):
    """Convert a compass direction string into a math-convention angle in degrees.

    The angle is measured counter-clockwise from East, so 'E' is 0, 'N' is 90,
    'W' is 180 and 'S' is -90. Compound directions such as 'NW' or 'SSW' are
    resolved from right to left: the last letter gives the starting angle and
    each preceding letter pulls the angle halfway towards itself along the
    shorter arc of the circle. Bisecting along the circle (rather than taking a
    plain arithmetic mean) keeps directions between South and West correct,
    e.g. 'SW' is -135 rather than 45.

    Parameters
    ----------
    dir_string : str or any
        Direction made of the letters N, E, S and W. Any non-string value
        (for example NaN for a missing reading) or an empty string is treated
        as missing.

    Returns
    -------
    float
        Angle in degrees in the interval (-180, 180], or NaN when the input
        is missing.

    Raises
    ------
    KeyError
        If the string contains a character other than N, E, S or W.
    """
    dir_theta = {'N': 90, 'E': 0, 'S': -90, 'W': 180}
    if not isinstance(dir_string, str) or not dir_string:
        return np.nan

    theta = float(dir_theta[dir_string[-1]])
    for letter in reversed(dir_string[:-1]):
        anchor = dir_theta[letter]
        # Signed shortest rotation from the anchor to the current angle, in [-180, 180).
        offset = (theta - anchor + 180) % 360 - 180
        midpoint = anchor + offset / 2
        # Wrap the bisected angle back into (-180, 180].
        theta = 180 - (180 - midpoint) % 360
    return theta

# Encode each wind-direction column as an angle plus its cosine and sine.
# dir_string_to_theta returns degrees, whereas np.cos / np.sin expect radians,
# so the angle is converted before taking the trigonometric values. The
# '<prefix>Theta' columns themselves stay in degrees.
for feature_prefix, direction_column in (
    ('WindGust', 'WindGustDir'),
    ('Wind9am', 'WindDir9am'),
    ('Wind3pm', 'WindDir3pm'),
):
    theta_degrees = rain_df[direction_column].apply(dir_string_to_theta)
    theta_radians = np.deg2rad(theta_degrees)
    rain_df[feature_prefix + 'Theta'] = theta_degrees
    rain_df[feature_prefix + 'Cosine'] = np.cos(theta_radians)
    rain_df[feature_prefix + 'Sine'] = np.sin(theta_radians)



def convert_yn_to_bool(yn_string):
    """Convert a Yes or No string to a True or False bool.

    Matching is case-insensitive and ignores surrounding whitespace. Only the
    exact tokens 'yes'/'y' and 'no'/'n' are recognised, so placeholder strings
    such as 'NA' or 'Unknown' are reported as missing instead of being
    mistaken for an answer because they happen to contain a 'y' or an 'n'.

    Parameters
    ----------
    yn_string : str or any
        The value to convert. Non-string values (for example NaN) are treated
        as missing.

    Returns
    -------
    bool or float
        True for yes, False for no, and NaN for anything else.
    """
    if not isinstance(yn_string, str):
        return np.nan

    token = yn_string.strip().lower()
    if token in ('y', 'yes'):
        return True
    if token in ('n', 'no'):
        return False
    return np.nan
    
# Replace the Yes/No strings in both rain indicator columns with booleans
# (missing values stay NaN), then display the resulting frame.
for yn_column in ('RainToday', 'RainTomorrow'):
    rain_df[yn_column] = rain_df[yn_column].map(convert_yn_to_bool)

rain_df

## Prepare the Data for Model Input

Select the input feature columns and specify the target column, in this case whether or not it rains in the area on the next day. We then split the data into train and test sets and impute missing values using statistics computed on the training set only.

In [None]:
# Input features, grouped by the kind of measurement they describe.
columns_to_use = [
    # temperature extremes, rainfall, evaporation and sunshine
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    # wind direction encoded as cosine / sine
    'WindGustCosine', 'WindGustSine',
    'Wind9amCosine', 'Wind9amSine',
    'Wind3pmCosine', 'Wind3pmSine',
    # wind speed
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    # humidity, pressure and cloud cover at 9am / 3pm
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    # temperature at 9am / 3pm
    'Temp9am', 'Temp3pm',
    # whether it rained today
    'RainToday',
]

# Target column: whether it rains on the following day.
column_to_predict = 'RainTomorrow'

# Rows with an unknown "RainTomorrow" value cannot be used, so keep only labelled rows.
df_to_use = rain_df[rain_df[column_to_predict].notna()].copy()
X = df_to_use[columns_to_use].astype(float)
y = df_to_use[column_to_predict].astype(float)

# 90/10 train/test split; the full frame is split alongside X and y so that
# non-feature columns of the test rows remain available after the split.
df_train, df_test, X_train, X_test, y_train, y_test = train_test_split(
    df_to_use,
    X,
    y,
    train_size=0.9,
    random_state=0,
)

# Fill missing feature values with column means learned from the training split only.
imputer = SimpleImputer().fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

## Fit the Model
We'll use XGBoost as our model. Fit on the train data, then generate class probabilities for the test set for use later. 

In [None]:
# Fit a default-configured XGBoost classifier on the imputed training features.
model = XGBClassifier().fit(X_train, y_train)

# Class-probability estimates for every row of the test set.
probs = model.predict_proba(X_test)

## Visualize the Performance of the Model
Plot the ROC curve and precision-recall curve for the model on the test set. 

In [None]:
# ROC and precision-recall curves for the whole test set, side by side.
# The Display.from_estimator constructors are used because the older
# metrics.plot_roc_curve / metrics.plot_precision_recall_curve helpers were
# deprecated in scikit-learn 1.0 and removed in 1.2.
fig, axes = plt.subplots(1, 2)
roc_ax, pr_ax = axes

metrics.RocCurveDisplay.from_estimator(model, X_test, y_test, ax=roc_ax)
roc_ax.set_title('ROC Curve')

metrics.PrecisionRecallDisplay.from_estimator(model, X_test, y_test, ax=pr_ax)
pr_ax.set_title('Precision-Recall Curve')

# Same padded unit-square view for both panels.
for ax in (roc_ax, pr_ax):
    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([-0.1, 1.1])
    ax.set_aspect('equal')

fig.set_size_inches(10, 5)
plt.show()

## Visualize Model Performance at Different Locations

Repeat the visualizations for the test set, but generate a different curve for each location in the dataset. 

In [None]:
# Per-location ROC and precision-recall curves (top row) and histograms of the
# per-location scores (bottom row).
# The Display.from_estimator constructors are used because the older
# metrics.plot_roc_curve / metrics.plot_precision_recall_curve helpers were
# deprecated in scikit-learn 1.0 and removed in 1.2.
fig, axes = plt.subplots(2, 2)
roc_ax, pr_ax = axes[0]
auroc_hist_ax, apprev_hist_ax = axes[1]

aurocs, aps, apprevs = [], [], []
for location in df_to_use.Location.value_counts().index:

    # Build the row mask once, as a plain boolean array, so it can index both
    # the feature matrix and the label Series positionally.
    loc_mask = df_test.Location.eq(location).to_numpy()
    X_loc = X_test[loc_mask]
    y_loc = y_test[loc_mask]

    # AUROC is undefined and AP/prevalence divides by zero unless the
    # location's test rows contain both classes, so skip such locations.
    if y_loc.nunique() < 2:
        continue

    roc_display = metrics.RocCurveDisplay.from_estimator(
        model,
        X_loc,
        y_loc,
        ax=roc_ax,
        alpha=0.5
    )
    aurocs.append(roc_display.roc_auc)

    pr_display = metrics.PrecisionRecallDisplay.from_estimator(
        model,
        X_loc,
        y_loc,
        ax=pr_ax,
        alpha=0.5
    )
    aps.append(pr_display.average_precision)
    # Average precision relative to the location's positive-class prevalence.
    apprevs.append(pr_display.average_precision / y_loc.mean())

for ax, title in ((roc_ax, 'ROC Curve'), (pr_ax, 'Precision-Recall Curve')):
    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([-0.1, 1.1])
    ax.set_aspect('equal')
    ax.set_title(title)
    # One legend entry per location is unreadable; drop the legend if one was created.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

auroc_hist_ax.hist(aurocs, bins=25)
auroc_hist_ax.set_title('Histogram of AUROC Scores')
apprev_hist_ax.hist(apprevs, bins=25)
apprev_hist_ax.set_title('Histogram of AP:Prevalence Scores')

fig.set_size_inches(10, 10)
plt.show()

# Mean +/- standard deviation of the per-location scores.
print(
    'AUROC = {0:.3f}+/-{1:.3f}'.format(
        np.mean(aurocs),
        np.std(aurocs)
    )
)
print('AP = {0:.3f}+/-{1:.3f}'.format(np.mean(aps), np.std(aps)))

In [None]:
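# Optional, currently disabled: Boruta feature selection with an XGBoost model.
# NOTE: if re-enabled as written, this rebinds `model` to a new, unfitted
# classifier, and the `boruta` package must be installed.
# from boruta import BorutaPy

# model = XGBClassifier()
# feat_selector = BorutaPy(model, 
#                          n_estimators='auto', 
#                          verbose=2, 
#                          random_state=1)
# feat_selector.fit(X_train, y_train)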