In [None]:
!pip install ipython-autotime
%load_ext autotime

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.model_selection import KFold
import multiprocessing as mp
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show the CPU count reported by multiprocessing; later cells pass the same call as n_jobs / thread_count.
mp.cpu_count()

In [None]:
# Switch for the exploratory plots in the DRAW section: those cells only plot when this is True.
draw = False

# LOAD DATA

In [None]:
# Read the raw observations (Kaggle input path).
# When running outside Kaggle, point read_csv at a local copy instead, e.g. "weatherAUS.csv".
df = pd.read_csv("../input/weather-dataset-rattle-package/weatherAUS.csv")
# Preview the first rows only instead of rendering the whole frame.
df.head()

- Date: date of observation (*)
- Location: common name of the weather station
- MinTemp: minimum temperature
- MaxTemp: maximum temperature
- Rainfall: amount of rainfall recorded
- Evaporation: evaporation
- Sunshine: number of hours of bright sunshine
- WindGustDir: direction of strongest wind gust (*)
- WindGustSpeed: speed of strongest wind gust
- WindDir9am: direction of wind at 9am (*)
- WindDir3pm: direction of wind at 3pm (*)
- WindSpeed9am: wind speed averaged over the 10 minutes prior to 9am
- WindSpeed3pm: wind speed averaged over the 10 minutes prior to 3pm
- Humidity9am: humidity at 9am
- Humidity3pm: humidity at 3pm
- Pressure9am: pressure at 9am
- Pressure3pm: pressure at 3pm
- Cloud9am: fraction of the sky obscured by cloud at 9am
- Cloud3pm: fraction of the sky obscured by cloud at 3pm
- Temp9am: temperature at 9am
- Temp3pm: temperature at 3pm
- RainToday: binary
- RiskMM: amount of rain, in millimetres

In [None]:
# Number of missing values in every column of the raw frame.
df.isna().sum()

# ADDING NEW FEATURES
- Date -> Year, Month, Day
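- Wind direction -> Sin, Cos (the compass point as a point on a circle)
- Yesterday's data: every column also gets a `<column>1daybefore` copy holding the previous row's value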

In [None]:
# adding features
# split the "YYYY-MM-DD"-style Date string on "-" into Year / Month / Day (kept as strings)
for position, part_name in enumerate(["Year", "Month", "Day"]):
    # bind `position` as a default argument so each lambda keeps its own index
    df[part_name] = df["Date"].map(lambda stamp, position=position: stamp.split("-")[position])

# wind dir to point on circle
# The 16 compass points are listed counter-clockwise starting at East, so consecutive
# entries are 22.5 degrees apart (E = 0, N = 90, W = 180, S = 270 degrees).
# Deriving sin/cos from the angle keeps every value inside [-1, 1]; the previous hand-written
# tables used +-sqrt(2) for NE/NW/SW/SE and dropped the "/ 2" for SSW/SSE.
compass_points = [
    "E", "ENE", "NE", "NNE", "N", "NNW", "NW", "WNW",
    "W", "WSW", "SW", "SSW", "S", "SSE", "SE", "ESE",
]
compass_angles = {
    point: 2 * np.pi * position / len(compass_points)
    for position, point in enumerate(compass_points)
}
for col in ["WindGustDir", "WindDir9am", "WindDir3pm"]:
    angle = df[col].map(compass_angles)
    has_direction = df[col].notna()
    # A missing direction is placed at the origin (0, 0), as the NaN -> 0.0 entries did before.
    df[col + "Sin"] = np.sin(angle).where(has_direction, 0.0)
    df[col + "Cos"] = np.cos(angle).where(has_direction, 0.0)

# 1daybefore
# For every existing column add "<column>1daybefore", holding the value of the previous row
# of the same station.
# - Shifting inside each Location group keeps the first row of a station from inheriting the
#   last row of the station stored above it.
# - The first row of each station has no predecessor and falls back to its own values
#   (the same fallback the very first row of the frame used to get).
# - The new columns are built separately and attached in one concat, which avoids the chained
#   assignment `df[a][1:n] = ...` (a silent no-op under pandas Copy-on-Write).
# NOTE(review): assumes the rows of each station are in chronological order with one row per
# day, so that "previous row" really means "yesterday" - confirm against the CSV.
source_columns = list(df.columns)
shifted_columns = [col for col in source_columns if col != "Location"]
previous_row = df.groupby("Location", sort=False)[shifted_columns].shift(1)
previous_row["Location"] = df["Location"]
previous_row = previous_row[source_columns]

station_start = ~df["Location"].duplicated()
previous_row.loc[station_start] = df.loc[station_start, source_columns]

df = pd.concat([df, previous_row.add_suffix("1daybefore")], axis=1)


In [None]:
# List every column now present, including the engineered date, Sin/Cos and 1daybefore columns.
df.columns

# ALL FEATURES

In [None]:
# Same-day columns, split by how they are encoded later on.
categorical_features = [
    "Location", "Year", "Month", "Day", "RainToday",
    "WindGustDir", "WindDir9am", "WindDir3pm",
]
numerical_features = [
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
    "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos",
    "WindGustDirSin", "WindGustDirCos",
]
target_feature = "RainTomorrow"

# Append the "1daybefore" twin of every column; the station and the date parts get no twin.
without_lag = ("Location", "Year", "Month", "Day")
categorical_features = categorical_features + [
    name + "1daybefore" for name in categorical_features if name not in without_lag
]
numerical_features = numerical_features + [name + "1daybefore" for name in numerical_features]

In [None]:
# Show the final categorical list (same-day columns followed by their 1daybefore twins).
categorical_features

In [None]:
# Show the final numerical list (same-day columns followed by their 1daybefore twins).
numerical_features

# DROP NAN ON TARGET

In [None]:
# keep only the rows whose target_feature is present
has_target = df[target_feature].notna()
df = df[has_target]

# FILL NAN ON CATEGORICAL FEATURES

In [None]:
# Missing values per categorical column before imputation.
df[categorical_features].isna().sum()

In [None]:
# Impute every categorical column with its most frequent value.
# NOTE: the modes are taken over all rows, i.e. before any cross-validation split.
categorical_modes = {col: df[col].mode()[0] for col in categorical_features}

# Re-assign the filled frame instead of calling `df[col].fillna(..., inplace=True)`:
# the in-place call acts on a temporary Series and leaves `df` untouched under
# pandas Copy-on-Write.
df = df.fillna(value=categorical_modes)

df[categorical_features].isnull().sum()

# FILL NAN ON NUMERICAL FEATURES

In [None]:
# Missing values per numerical column before imputation.
df[numerical_features].isna().sum()

In [None]:
# Impute every numerical column with its median.
# NOTE: the medians are taken over all rows, i.e. before any cross-validation split.
numerical_medians = {col: df[col].median() for col in numerical_features}

# Re-assign the filled frame instead of calling `df[col].fillna(..., inplace=True)`:
# the in-place call acts on a temporary Series and leaves `df` untouched under
# pandas Copy-on-Write.
df = df.fillna(value=numerical_medians)

df[numerical_features].isnull().sum()

# DRAW

In [None]:
if draw:
    # One count plot per categorical feature, stacked vertically and split by the target class.
    n_panels = len(categorical_features)
    _, ax = plt.subplots(nrows=n_panels, ncols=1, figsize=(12.8, 5.4*n_panels))
    ax = ax.flatten()
    for panel, feature in zip(ax, categorical_features):
        sns.countplot(x=feature, hue=target_feature, data=df, ax=panel)
    plt.tight_layout()

In [None]:
if draw:
    # One histogram per numerical feature, stacked vertically and split by the target class.
    n_panels = len(numerical_features)
    _, ax = plt.subplots(nrows=n_panels, ncols=1, figsize=(12.8, 5.4*n_panels))
    ax = ax.flatten()
    for panel, feature in zip(ax, numerical_features):
        sns.histplot(x=feature, hue=target_feature, data=df, ax=panel)
    plt.tight_layout()

In [None]:
if draw:
    # Correlation heatmap of the numerical features plus a 0/1 copy of the target.
    # The 0/1 copy is stored on df as "<target_feature>_01".
    target_as_number = target_feature + "_01"
    df[target_as_number] = df[target_feature].map({"No": 0.0, "Yes": 1.0})
    _, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
    correlation = df[numerical_features + [target_as_number]].corr()
    sns.heatmap(correlation, annot=True, ax=ax)

In [None]:
# BUILD FEATURE MATRIX AND TARGET
def to_Xy(df, categorical_features, numerical_features, target_feature, onehot=False):
    """Turn `df` into a model-ready feature frame X and target series y.

    Works on a copy, so the caller's frame is not modified.

    - Each categorical column is label-encoded with its own LabelEncoder; with
      `onehot=True` the encoded column is additionally expanded with
      `pd.get_dummies` (column prefix = the feature name).
    - Each numerical column is cast to float64.
    - Columns appear in X in the order: categorical features, then numerical features.

    Returns:
        (X, y), where y is the label-encoded target column, or None when
        `target_feature` is not a column of `df`.
    """
    work = df.copy()

    # X
    pieces = []
    for name in categorical_features:
        work[name] = LabelEncoder().fit_transform(work[name])
        pieces.append(pd.get_dummies(work[name], prefix=name) if onehot else work[name])
    for name in numerical_features:
        work[name] = work[name].astype(np.float64)
        pieces.append(work[name])
    X = pd.concat(pieces, axis=1)

    # y
    if target_feature not in work.columns:
        return X, None
    work[target_feature] = LabelEncoder().fit_transform(work[target_feature])
    return X, work[target_feature]

# Build the notebook-level X / y that cross_validation() and the full-data fits read;
# categoricals stay label-encoded (onehot=False).
X, y = to_Xy(df, categorical_features, numerical_features, target_feature, onehot=False)

In [None]:
# Display the encoded feature frame.
X

In [None]:
# Display the label-encoded target.
y

# CROSS VALIDATION

In [None]:
def cross_validation(clf, features, k=10, random_state=0):
    """Return the mean test ROC AUC of `clf` over a shuffled k-fold split.

    Reads the notebook-level `X` (feature frame) and `y` (encoded target).
    In every fold the training part is balanced by oversampling, with
    replacement, each class that has fewer rows than the largest class; the
    test part is left as it is.

    Args:
        clf: estimator exposing `fit` and `predict_proba`; it is fitted again in every fold.
        features: column labels of `X` used for fitting and scoring.
        k: number of folds.
        random_state: seed for the fold shuffling and for the oversampling, so
            that repeated calls with the same arguments draw the same samples.

    Returns:
        The average of the per-fold test AUCs.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

    fold_aucs = []
    for train_idx, test_idx in kf.split(np.arange(len(y))):
        X_fold = X.iloc[train_idx]
        y_fold = y.iloc[train_idx]

        # resampling: split the training rows by class ...
        X_parts = []
        y_parts = []
        for label in y_fold.unique():
            in_class = y_fold == label
            X_parts.append(X_fold[in_class])
            y_parts.append(y_fold[in_class])

        # ... and grow every smaller class to the size of the largest one
        n_samples = max(len(part) for part in y_parts)
        for i in range(len(y_parts)):
            if len(y_parts[i]) < n_samples:
                X_parts[i], y_parts[i] = resample(
                    X_parts[i],
                    y_parts[i],
                    n_samples=n_samples,
                    replace=True,
                    random_state=random_state,  # seeded: unseeded draws made the score vary between runs
                )

        X_train = pd.concat(X_parts)
        y_train = pd.concat(y_parts)
        X_test = X.iloc[test_idx]
        y_test = y.iloc[test_idx]

        # train model and score it on the held-out fold only
        clf.fit(X_train[features], y_train)
        y_test_pred = clf.predict_proba(X_test[features])[:, 1]
        fold_aucs.append(roc_auc_score(y_test, y_test_pred))

    return sum(fold_aucs) / len(fold_aucs)

In [None]:
# logistic regression - baseline
# Every same-day column: the categoricals plus the numeric readings and the wind Sin/Cos pairs.
categorical_features = [
    "Location", "Year", "Month", "Day", "RainToday",
    "WindGustDir", "WindDir9am", "WindDir3pm",
]
numerical_features = [
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
    "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos",
    "WindGustDirSin", "WindGustDirCos",
]
features = [*categorical_features, *numerical_features]

# Standardise the inputs, then fit a logistic regression.
logistic_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100, random_state=0, n_jobs=mp.cpu_count()),
)
auc = cross_validation(clf=logistic_model, features=features)
auc

In [None]:
# logistic regression - without categorical dir
# Wind direction enters only through its Sin/Cos columns here.
categorical_features = ["Location", "Year", "Month", "Day", "RainToday"]
numerical_features = [
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
    "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos",
    "WindGustDirSin", "WindGustDirCos",
]
features = [*categorical_features, *numerical_features]

# Standardise the inputs, then fit a logistic regression.
logistic_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100, random_state=0, n_jobs=mp.cpu_count()),
)
auc = cross_validation(clf=logistic_model, features=features)
auc

In [None]:
# logistic regression - without sin cos dir
# Wind direction enters only through the label-encoded categorical columns here.
categorical_features = [
    "Location", "Year", "Month", "Day", "RainToday",
    "WindGustDir", "WindDir9am", "WindDir3pm",
]
numerical_features = [
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
]
features = [*categorical_features, *numerical_features]

# Standardise the inputs, then fit a logistic regression.
logistic_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100, random_state=0, n_jobs=mp.cpu_count()),
)
auc = cross_validation(clf=logistic_model, features=features)
auc

In [None]:
# logistic regression - without year month date
# The Year / Month / Day columns are left out of the categorical list.
categorical_features = ["Location", "RainToday", "WindGustDir", "WindDir9am", "WindDir3pm"]
numerical_features = [
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
    "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos",
    "WindGustDirSin", "WindGustDirCos",
]
features = [*categorical_features, *numerical_features]

# Standardise the inputs, then fit a logistic regression.
logistic_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100, random_state=0, n_jobs=mp.cpu_count()),
)
auc = cross_validation(clf=logistic_model, features=features)
auc

In [None]:
# logistic regression - 1daybefore
# Same-day columns plus their "1daybefore" twins (the station and the date parts get no twin).
categorical_features = [
    "Location", "Year", "Month", "Day", "RainToday",
    "WindGustDir", "WindDir9am", "WindDir3pm",
]
numerical_features = [
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
    "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos",
    "WindGustDirSin", "WindGustDirCos",
]
target_feature = "RainTomorrow"

without_lag = ("Location", "Year", "Month", "Day")
categorical_features = categorical_features + [
    name + "1daybefore" for name in categorical_features if name not in without_lag
]
numerical_features = numerical_features + [name + "1daybefore" for name in numerical_features]

features = [*categorical_features, *numerical_features]

# Standardise the inputs, then fit a logistic regression.
logistic_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100, random_state=0, n_jobs=mp.cpu_count()),
)
auc = cross_validation(clf=logistic_model, features=features)
auc

In [None]:
# xgboost - 1daybefore
# Same-day columns plus their "1daybefore" twins (the station and the date parts get no twin).
categorical_features = [
    "Location", "Year", "Month", "Day", "RainToday",
    "WindGustDir", "WindDir9am", "WindDir3pm",
]
numerical_features = [
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm",
    "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos",
    "WindGustDirSin", "WindGustDirCos",
]
target_feature = "RainTomorrow"

without_lag = ("Location", "Year", "Month", "Day")
categorical_features = categorical_features + [
    name + "1daybefore" for name in categorical_features if name not in without_lag
]
numerical_features = numerical_features + [name + "1daybefore" for name in numerical_features]

features = [*categorical_features, *numerical_features]

boosted_model = XGBClassifier(
    n_estimators=100,
    random_state=0,
    n_jobs=mp.cpu_count(),
    eval_metric="auc",
)
auc = cross_validation(clf=boosted_model, features=features)
auc

# FEATURE IMPORTANCE FROM XGBOOST

In [None]:
# xgboost - 1daybefore, fitted once on all rows to obtain feature importances
categorical_features = ["Location", "Year", "Month", "Day", "RainToday", "WindGustDir", "WindDir9am", "WindDir3pm"]
numerical_features = ["MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine", "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm", "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm", "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos", "WindGustDirSin", "WindGustDirCos",]
target_feature = "RainTomorrow"

# add the "1daybefore" twins; the station and the date parts get no twin
categorical_features.extend(list(map(lambda col: col + "1daybefore", categorical_features)))
for discard in ["Location1daybefore", "Year1daybefore", "Month1daybefore", "Day1daybefore"]:
    categorical_features.remove(discard)

numerical_features.extend(list(map(lambda col: col + "1daybefore", numerical_features)))

features = categorical_features + numerical_features

# train
clf = XGBClassifier(
    n_estimators=1000,
    random_state=0,
    n_jobs=mp.cpu_count(),
    eval_metric="auc",
)
# Select the columns explicitly: clf.feature_importances_ is later paired with `features`
# by position, so the training columns must be exactly `features`, in that order.
clf.fit(X[features], y)

# NOTE: this AUC is computed on the same rows the model was fitted on.
y_pred = clf.predict_proba(X[features])[:, 1]
auc = roc_auc_score(y, y_pred)
auc

In [None]:
# Importance values of the fitted clf, plotted in the order the model reports them.
feature_importance = clf.feature_importances_
positions = np.arange(len(feature_importance))
_, ax = plt.subplots()
ax.plot(positions, feature_importance)

In [None]:
# Same importance values, plotted in ascending order.
feature_importance = clf.feature_importances_
ascending_importance = np.sort(feature_importance)
_, ax = plt.subplots()
ax.plot(np.arange(len(feature_importance)), ascending_importance)

In [None]:
# Reorder `features` from most to least important; objective() below takes its top-k prefix.
# Re-running this cell without re-running the fit cell would permute the already sorted array again.
most_important_first = np.argsort(feature_importance)[::-1]
features = np.array(features)[most_important_first]
features

In [None]:
def objective(num_features: int, n_estimators: int, max_depth: int, reg_alpha: float, reg_lambda: float):
    """Cross-validated AUC of an XGBoost model on the first `num_features` entries of `features`.

    `num_features`, `n_estimators` and `max_depth` are rounded to the nearest
    integer before use; `reg_alpha` and `reg_lambda` are passed through as given.
    Reads the notebook-level `features` array.
    """
    top_k = int(round(num_features))
    model = XGBClassifier(
        n_estimators=int(round(n_estimators)),
        max_depth=int(round(max_depth)),
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=0,
        n_jobs=mp.cpu_count(),
        eval_metric="auc",
    )
    return cross_validation(clf=model, features=features[0:top_k])

In [None]:
# (lower, upper) search range of every argument of objective()
pbounds = dict(
    num_features=(1, 10),
    n_estimators=(50, 100),
    max_depth=(2, 5),
    reg_alpha=(0.0, 1.0),
    reg_lambda=(0.0, 1.0),
)

optimizer = BayesianOptimization(f=objective, pbounds=pbounds, random_state=0)

# maximise the cross-validated AUC returned by objective()
optimizer.maximize(n_iter=50)

In [None]:
# Best result recorded by the XGBoost search above.
optimizer.max

# FEATURE IMPORTANCE FROM CATBOOST

In [None]:
# catboost - 1daybefore, fitted once on all rows to obtain feature importances
categorical_features = ["Location", "Year", "Month", "Day", "RainToday", "WindGustDir", "WindDir9am", "WindDir3pm"]
numerical_features = ["MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine", "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm", "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm", "WindDir9amSin", "WindDir9amCos", "WindDir3pmSin", "WindDir3pmCos", "WindGustDirSin", "WindGustDirCos",]
target_feature = "RainTomorrow"

# add the "1daybefore" twins; the station and the date parts get no twin
categorical_features.extend(list(map(lambda col: col + "1daybefore", categorical_features)))
for discard in ["Location1daybefore", "Year1daybefore", "Month1daybefore", "Day1daybefore"]:
    categorical_features.remove(discard)

numerical_features.extend(list(map(lambda col: col + "1daybefore", numerical_features)))

features = categorical_features + numerical_features

# train
clf = CatBoostClassifier(
    iterations=5000,
    random_state=0,
    thread_count=mp.cpu_count(),
    eval_metric="AUC",
    verbose=False,
)
# Select the columns explicitly: clf.feature_importances_ is later paired with `features`
# by position, so the training columns must be exactly `features`, in that order.
clf.fit(X[features], y)

# NOTE: this AUC is computed on the same rows the model was fitted on.
y_pred = clf.predict_proba(X[features])[:, 1]
auc = roc_auc_score(y, y_pred)
auc

In [None]:
# Importance values of the fitted clf, plotted in the order the model reports them.
feature_importance = clf.feature_importances_
positions = np.arange(len(feature_importance))
_, ax = plt.subplots()
ax.plot(positions, feature_importance)

In [None]:
# Same importance values, plotted in ascending order.
feature_importance = clf.feature_importances_
ascending_importance = np.sort(feature_importance)
_, ax = plt.subplots()
ax.plot(np.arange(len(feature_importance)), ascending_importance)

In [None]:
# Reorder `features` from most to least important; objective() below takes its top-k prefix.
# Re-running this cell without re-running the fit cell would permute the already sorted array again.
most_important_first = np.argsort(feature_importance)[::-1]
features = np.array(features)[most_important_first]
features

In [None]:
def objective(num_features: int, iterations: int, depth: int, l2_leaf_reg: float, model_size_reg: float):
    """Cross-validated AUC of a CatBoost model on the first `num_features` entries of `features`.

    `num_features`, `iterations` and `depth` are rounded to the nearest integer
    before use; `l2_leaf_reg` and `model_size_reg` are passed through as given.
    Reads the notebook-level `features` array.
    """
    top_k = int(round(num_features))
    model = CatBoostClassifier(
        iterations=int(round(iterations)),
        depth=int(round(depth)),
        l2_leaf_reg=l2_leaf_reg,
        model_size_reg=model_size_reg,
        random_state=0,
        thread_count=mp.cpu_count(),
        eval_metric="AUC",
        verbose=False,
    )
    return cross_validation(clf=model, features=features[0:top_k])

In [None]:
# (lower, upper) search range of every argument of objective()
pbounds = dict(
    num_features=(1, 10),
    iterations=(250, 500),
    depth=(2, 5),
    l2_leaf_reg=(0.0, 1.0),
    model_size_reg=(0.0, 1.0),
)

optimizer = BayesianOptimization(f=objective, pbounds=pbounds, random_state=0)

# maximise the cross-validated AUC returned by objective()
optimizer.maximize(n_iter=50)

In [None]:
# Best result recorded by the CatBoost search above.
optimizer.max