# Rain in Australia
**Task:** predict the categorical target "RainTomorrow" (will it rain the next day?)

**Model:** CatBoost Classifier

**Metrics:** Accuracy, F1, ROC AUC
<img src=https://news-images.weatherzone.com.au/twc/WA%20March%20heat%20records%2020190311.png width="500">

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, f1_score
from catboost import CatBoostClassifier
from tqdm import tqdm
from IPython.display import clear_output
from tensorflow import keras
from keras.layers import Dense, LSTM, Dropout
import pickle
import seaborn as sns
from matplotlib import pyplot as plt
# Lift pandas' column display limit so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
def validate(model, val_data):
    """Print accuracy, ROC AUC and F1 of ``model`` on a hold-out set.

    Args:
        model: Fitted classifier exposing ``predict``. When it also exposes
            ``predict_proba``, the second probability column is used as the
            ranking score for ROC AUC.
        val_data: Indexable pair ``(features, true_labels)``.
    """
    features, y_true = val_data[0], val_data[1]
    y_pred = model.predict(features)

    # ROC AUC measures how well continuous scores rank the positives, so use
    # probabilities when the model provides them; hard labels are the fallback.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(features)[:, 1]
    else:
        y_score = y_pred

    # sklearn metrics expect (y_true, y_pred/y_score) in that order.
    print('Accuracy =', accuracy_score(y_true, y_pred))
    print('ROC AUC =', roc_auc_score(y_true, y_score))
    print('F1 =', f1_score(y_true, y_pred))

# 1. Import Data

In [None]:
# Read the raw CSV once and keep it untouched; later cells work on a copy.
DATASET_CSV = '../input/weather-dataset-rattle-package/weatherAUS.csv'
orig_data = pd.read_csv(DATASET_CSV)
data = orig_data.copy()

In [None]:
# Summary statistics of the numeric columns (rendered as the cell output).
data.describe()

# 2. Data Preprocessing

In [None]:
# Split the feature columns (everything except 'Date' and the target) into
# numeric and categorical groups.
# Comparing a dtype with the abstract `np.number` relies on a deprecated NumPy
# coercion and is not a reliable numeric test, so ask pandas directly.
feature_cols = data.drop(columns=['Date', 'RainTomorrow']).columns
num = [col for col in feature_cols if pd.api.types.is_numeric_dtype(data[col])]
cat = [col for col in feature_cols if not pd.api.types.is_numeric_dtype(data[col])]

In [None]:
# Discard every row that has at least one missing value (modifies `data` in place).
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Derive day-of-month and month features from the date, then drop the raw column.
# The vectorised `.dt` accessors replace the former per-row loop:
# `Series.iteritems()` no longer exists in pandas 2.0 and growing arrays with
# `np.append` copies the whole array on every iteration.
data['Date'] = pd.to_datetime(data['Date'])
day = data['Date'].dt.day.to_numpy()
month = data['Date'].dt.month.to_numpy()
# Day goes in first and Month is then placed in front of it, so the frame
# starts with the columns Month, Day.
data.insert(0, 'Day', day)
data.insert(0, 'Month', month)
data.drop(columns='Date', inplace=True)
# Day and month are handled as categorical features from here on.
cat += ['Day', 'Month']

In [None]:
# One-hot encode every column listed in `cat`.
# The dummy dtype is pinned to uint8: pandas >= 2.0 produces bool dummies by
# default, which would no longer match the `dtype == np.uint8` check used to
# collect the categorical columns for CatBoost.
data = pd.get_dummies(data, columns=cat, dtype=np.uint8)

In [None]:
# Replace the target labels with their integer category codes.
target_as_category = data['RainTomorrow'].astype('category')
data['RainTomorrow'] = target_as_category.cat.codes

In [None]:
# Print the column dtypes and non-null counts of the preprocessed frame.
data.info()

# 3. Feature Engineering

In [None]:
# Add a feature describing the preceding days: for each row, the sum of the
# target over the 15 rows before it (rolling sum shifted down by one row).
# NOTE(review): the window runs over consecutive rows of the frame as-is;
# confirm that neighbouring rows really are consecutive days of one place.
WINDOW = 15
rain_history = data['RainTomorrow'].rolling(WINDOW).sum()
rain_history = rain_history.shift(1)
data.insert(0, 'Last_days', rain_history)
# The first WINDOW rows have no complete history (NaN), so leave them out.
data = data.iloc[WINDOW:]

# 4. Train Test Split

In [None]:
# Separate the target from the features and hold out a test split
# (sklearn's default split size, fixed seed for reproducibility).
y = data['RainTomorrow']
X = data.drop(columns='RainTomorrow')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=45)
val_data = (X_test, y_test)
# Categorical columns for catboost: every feature stored as uint8.
cat_f = [name for name, dtype in X.dtypes.items() if dtype == np.uint8]

# 5. Model Training

In [None]:
%%time
# Baseline: CatBoost with default hyperparameters, training log silenced.
model = CatBoostClassifier()
model.fit(X=X_train, y=y_train, cat_features=cat_f, verbose=0)

In [None]:
# Report accuracy, ROC AUC and F1 of the baseline model on the hold-out split.
validate(model, val_data)

# 6. Optimizing Hyperparameters

In [None]:
# Randomized search over learning rate, tree depth and L2 leaf regularisation.
model_tun = CatBoostClassifier()
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}
# Tune on the training split only: searching over the full X/y would let the
# tuned model see the hold-out rows it is scored on afterwards, inflating
# every reported metric.
model_tun.randomized_search(grid, X=X_train, y=y_train)
# Remove the search log from the cell output.
clear_output()

In [None]:
# Report accuracy, ROC AUC and F1 of the tuned model on the hold-out split.
validate(model_tun, val_data)

# 7. Draw ROC Curve

In [None]:
# Plot the ROC curve of the tuned model on the hold-out split.
sns.set(font_scale=1.5)
sns.set_color_codes("muted")
plt.figure(figsize=(10, 8))
# Probability of the positive class (label 1) for every test row.
test_scores = model_tun.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores, pos_label=1)
lw = 2
plt.plot(fpr, tpr, lw=lw, label='ROC curve ')
# Diagonal reference line: the curve of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', label='Random guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
# Without a legend the labels passed to plt.plot are never displayed.
plt.legend(loc='lower right')
plt.show()