In [None]:
# loading a simple library I created that scales, imputes and one-hot encodes the data.

!pip install git+https://github.com/sd274/pipeline_tools.git


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pipeline_tools as pt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
import xgboost as xgb
import optuna

In [None]:
# Load weatherAUS.csv from the Kaggle "weather-dataset-rattle-package" input directory.
df = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first rows to see which columns exist and what their values look like.
df.head()

In [None]:
# Summarise only the object-dtype (text) columns.
df.describe(include='object')

In [None]:
# Summarise every column that is not object-dtype.
df.describe(exclude='object')

Several columns contain missing values. For now we leave the missing categorical values as they are and impute only the numerical ones.

In [None]:
# List the column names available for the feature lists defined below.
df.columns

# Build a Basic Pipe

In [None]:
# Categorical predictors: the station plus wind directions and today's rain flag.
cat_features = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

# Numeric predictors, grouped by the kind of measurement.
num_features = [
    'MinTemp', 'MaxTemp',
    'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
]

# Column the model is trained to predict.
target = 'RainTomorrow'

In [None]:
# Keep only rows whose label is known. One-hot encoding the target turns a
# missing RainTomorrow into an all-zero row, which would silently label those
# rows as "no rain".
labelled = df.dropna(subset=[target])

X = labelled[cat_features + num_features]
# Binary target kept as a one-column frame named 'Yes' (1 = rain tomorrow, 0 = no rain).
y = labelled[target].eq('Yes').astype(int).to_frame('Yes')

# Fixed seed so the hold-out set (and every metric computed on it) is reproducible;
# stratify so train and test keep the same share of rainy days.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Preprocessing stage built by pipeline_tools from the two column lists.
pre_pipe = pt.standard_preprocessing_pipe(
    num_features=num_features,
    cat_features=cat_features,
)

# Full model: preprocessing followed by an XGBoost classifier. The step name
# 'learn' is the prefix used when setting classifier parameters on the pipeline.
pipe = Pipeline(steps=[
    ('pre_pipe', pre_pipe),
    (
        'learn',
        xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42),
    ),
])

In [None]:
def objective(trial, data=X_train, target=y_train):
    """Optuna objective: validation accuracy of `pipe` for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the XGBoost hyper-parameters.
        data: Features to tune on. The default is bound to the `X_train` that
            exists when this cell runs, so re-run the cell after re-splitting.
        target: Labels aligned with `data` (same binding caveat as `data`).

    Returns:
        Accuracy of the fitted pipeline on a 15% validation split of `data`.

    Note:
        The module-level `pipe` is reconfigured and refitted in place on every call.
    """
    # Fixed seed: every trial is scored on the same validation rows.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )

    # Parameter names carry the `learn__` prefix so `pipe.set_params` routes them
    # to the 'learn' step. `suggest_float(..., log=True)` is the replacement for
    # the deprecated `suggest_loguniform`.
    params = {
        'learn__reg_lambda': trial.suggest_float('learn__reg_lambda', 1e-3, 10.0, log=True),
        'learn__reg_alpha': trial.suggest_float('learn__reg_alpha', 1e-3, 10.0, log=True),
        'learn__colsample_bytree': trial.suggest_categorical(
            'learn__colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        ),
        'learn__subsample': trial.suggest_categorical(
            'learn__subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
        ),
        'learn__learning_rate': trial.suggest_categorical(
            'learn__learning_rate', [0.001, 0.003, 0.03, 0.3, 0.6]
        ),
        'learn__n_estimators': trial.suggest_int('learn__n_estimators', 100, 1000),
        'learn__max_depth': trial.suggest_categorical(
            'learn__max_depth', [5, 7, 9, 11, 13, 15, 17, 20]
        ),
        # NOTE(review): the seed is searched like a hyper-parameter; kept so the
        # search space and the keys of the best parameters stay unchanged.
        'learn__random_state': trial.suggest_categorical('learn__random_state', [24, 48, 2020]),
    }

    pipe.set_params(**params)
    pipe.fit(train_x, train_y)

    # Hard class predictions, scored with plain accuracy.
    return metrics.accuracy_score(valid_y, pipe.predict(valid_x))

In [None]:
# Seed the sampler so the hyper-parameter search proposes the same trials on
# every Restart & Run All. TPE is also Optuna's default sampler, so only the
# seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Report the finished search: trial count, winning parameters and their score.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_params)
print('Best Score: ', study.best_value)

In [None]:
# Parameters of the best trial; the keys are the `learn__`-prefixed names
# used when they were sampled.
best_params = study.best_params

best_params

In [None]:
# Apply the best parameters and refit on the whole training split.
pipe.set_params(**best_params).fit(X_train, y_train)

In [None]:
# Evaluate the tuned pipeline on the held-out test split.
# `metrics` is already imported in the notebook's import cell.

# Second column of predict_proba: score for the class at index 1.
test_prediction = pipe.predict_proba(X_test)[:, 1]
# Hard class labels, used for accuracy.
predictions = pipe.predict(X_test)

score = metrics.roc_auc_score(y_test, test_prediction)
accuracy = metrics.accuracy_score(y_test, predictions)

print(f'Area under ROC of Model On Test Set - {score:,.2%}')
print(f'Accuracy - {accuracy:,.2%}')