In [None]:
!pip install lightautoml

In [None]:
!pip install torch

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import warnings

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and fit-failure warnings raised by
# the libraries used below.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV

from scipy.sparse import csr_matrix as csr
from scipy.sparse import hstack

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the weatherAUS CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics of the raw data.
df.describe()

# Dataset Preparation

In [None]:
def season_finder(x):
    """Return the season and the month number for a ``YYYY-MM-DD`` date string.

    Seasons are three-month blocks in which December-February is 'summer',
    March-May 'autumn', June-August 'winter' and September-November 'spring'.

    Parameters
    ----------
    x : str
        Date formatted as ``%Y-%m-%d``.

    Returns
    -------
    tuple
        ``(season, month)`` with ``season`` one of the four names above and
        ``month`` an int in 1..12.
    """
    seasons = ('summer', 'autumn', 'winter', 'spring')
    month = dt.strptime(x, "%Y-%m-%d").month
    # December wraps to 0, so Dec-Feb -> 0, Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3.
    return seasons[(month % 12) // 3], month


# season_finder returns a (season, month) pair per date; zip(*...) transposes the
# pairs into two sequences that become the 'Season' and 'month' columns.
df['Season'], df['month'] = zip(*df.Date.apply(season_finder))

In [None]:
# Impute missing numeric values with the mean of the same calendar month.
# ``numeric_only=True`` is required: the frame also holds text columns (e.g. the
# Date strings and the Season labels) that cannot be averaged, and recent pandas
# raises on them instead of silently skipping them. The per-month pieces are
# concatenated once rather than growing a DataFrame inside a loop.
# ``sort=False`` keeps the months in order of first appearance, i.e. the same
# order ``df.month.unique()`` yields.
monthly_parts = [
    month_rows.fillna(month_rows.mean(numeric_only=True))
    for _, month_rows in df.groupby('month', sort=False)
]
# pd.concat rejects an empty list, so fall back to a plain copy for an empty frame.
new_df = pd.concat(monthly_parts) if monthly_parts else df.copy()

Check for NaN values.

In [None]:
# Number of missing values per column.
new_df.isna().sum()

In [None]:
# Drop rows where RainTomorrow or RainToday is missing, then order the rows by
# date and renumber them.
new_df = (
    new_df
    .dropna(subset=['RainTomorrow', 'RainToday'])
    .sort_values(by='Date', ignore_index=True)
)

# Fill gaps in the wind-direction columns from the previous row in date order.
# The result is assigned back (instead of calling fillna(..., inplace=True) on a
# column selection) so that the frame itself is guaranteed to be updated; the
# trailing bfill covers a gap in the very first rows, which a forward fill
# cannot reach.
wind_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
new_df[wind_cols] = new_df[wind_cols].ffill().bfill()

new_df = new_df.set_index('Date')

Check that no NaN values are left.

In [None]:
# Number of missing values per column after cleaning.
new_df.isna().sum()

In [None]:
# Encode the two Yes/No rain indicators as integers (Yes -> 1, No -> 0).
# Only these columns are touched and the dtype is cast explicitly, rather than
# relying on a frame-wide replace() silently downcasting object columns.
# The 1/0 keys keep the cell safe to re-run on already-encoded data.
# NOTE(review): assumes RainToday and RainTomorrow are the only columns holding
# 'Yes'/'No' values — confirm against the dataset.
yes_no_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
for flag_col in ['RainToday', 'RainTomorrow']:
    new_df[flag_col] = new_df[flag_col].map(yes_no_codes).astype(int)

In [None]:
# Preview the cleaned data.
new_df.head()

In [None]:
# Columns that will be one-hot encoded; every other column is used as-is.
ohe_cols = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Season', 'month']

# Independent copies so later changes do not touch new_df.
ohe_df = new_df.loc[:, ohe_cols].copy()
prep_df = new_df.drop(columns=ohe_cols).copy()

In [None]:
# Learn the categories of every column in ohe_df, then encode that same frame.
ohe = OHE().fit(ohe_df)
ohe_prep = ohe.transform(ohe_df)

In [None]:
# Label column.
target = new_df["RainTomorrow"]

# Feature matrix: the non-encoded columns (without the label) as a sparse block,
# followed column-wise by the one-hot block.
numeric_part = csr(prep_df.drop(columns="RainTomorrow").values)
features = hstack([numeric_part, ohe_prep])

In [None]:
# Hold out part of the rows for evaluation; the fixed seed makes the split repeatable.
# No test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# Random Forest

In [None]:
# Random forest with 100 trees, trained on all available cores.
# The seed is fixed (same value as the train/test split) so that the forest,
# and therefore the metrics reported below, are reproducible across runs.
rfc = RFC(n_estimators=100, n_jobs=-1, random_state=42)

rfc.fit(X_train, y_train)

In [None]:
# Hard class predictions of the random forest on the held-out set.
y_pred = rfc.predict(X_test)

In [None]:
# Hold-out scores for the random forest.
# ROC AUC is a ranking metric, so it is computed from the predicted probability
# of the positive class; feeding it the hard 0/1 labels would reduce the curve
# to a single operating point.
rf_proba = rfc.predict_proba(X_test)[:, 1]

rf_metrics = pd.DataFrame(data={'metric': ['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
                             'score':[metrics.accuracy_score(y_test, y_pred),
                                      metrics.f1_score(y_test, y_pred),
                                      metrics.precision_score(y_test, y_pred),
                                      metrics.recall_score(y_test, y_pred),
                                      metrics.roc_auc_score(y_test, rf_proba)]})

Random forest results:

In [None]:
# Display the random forest score table.
rf_metrics

ROC AUC is decent, but recall is quite low, so we need to continue our search.

# Logistic Regression + GridSearch

In [None]:
# Each solver supports only some penalties, so the grid is split per solver.
# With a single grid on the default 'lbfgs' solver, the 'l1' and 'elasticnet'
# candidates cannot be fitted and only the 'l2' ones are actually evaluated.
c_values = [0.1, 1.0, 10.0, 100.0]
parameters = [
    # Default solver: unchanged 'l2' candidates.
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1'], 'C': c_values},
    # 'elasticnet' is only implemented by 'saga' and needs an explicit l1_ratio;
    # a higher max_iter gives that solver room to converge.
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5],
     'max_iter': [1000], 'C': c_values},
]

clf = GridSearchCV(LR(), parameters, n_jobs=-1, scoring='accuracy')
clf.fit(X_train, y_train)

clf.best_params_

In [None]:
# Fit a logistic regression with a hard-coded C=0.1 on the training split.
# NOTE(review): presumably C=0.1 mirrors clf.best_params_ from the grid search —
# confirm, or build the model from clf.best_params_ so the two cannot drift apart.
lr = LR(C=0.1, n_jobs=-1)
lr.fit(X_train, y_train)

In [None]:
# Score cut-off above which a sample is labelled 1.
PROBA_THRESHOLD = 0.35


def vfunc(x):
    """Convert scores to hard 0/1 labels.

    Parameters
    ----------
    x : array-like or scalar
        Scores to threshold.

    Returns
    -------
    numpy.ndarray
        Integer array of the same shape as ``x``: 1 where the score is strictly
        greater than ``PROBA_THRESHOLD``, 0 otherwise. Empty input yields an
        empty array.
    """
    # Vectorised comparison: no per-element Python call, and valid for size-0 input.
    return np.where(np.asarray(x) > PROBA_THRESHOLD, 1, 0)

# Column 1 of predict_proba (the positive class, assuming 0/1 labels) converted
# to hard labels by vfunc.
lr_y_pred = vfunc(lr.predict_proba(X_test)[:, 1])

In [None]:
# Hold-out scores for the logistic regression.
# The thresholded labels drive accuracy/F1/precision/recall, while ROC AUC is
# computed from the raw positive-class probabilities: it is a ranking metric and
# is independent of the chosen threshold.
lr_proba = lr.predict_proba(X_test)[:, 1]

lr_metrics = pd.DataFrame(data={'metric': ['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
                             'score':[metrics.accuracy_score(y_test, lr_y_pred),
                                      metrics.f1_score(y_test, lr_y_pred),
                                      metrics.precision_score(y_test, lr_y_pred),
                                      metrics.recall_score(y_test, lr_y_pred),
                                      metrics.roc_auc_score(y_test, lr_proba)]})

Logistic regression results:

In [None]:
# Display the logistic regression score table.
lr_metrics

Now recall is much better, but precision dropped from about 0.8 to 0.614.

# AutoML

In [None]:
import torch

from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.automl.blend import WeightedBlender

In [None]:
# Thread count handed to torch and to every LightGBM model below.
N_THREADS = 4
# Passed as `cv` to the LightAutoML reader.
N_FOLDS = 5
# Seed used for NumPy and for the LightAutoML reader.
RANDOM_STATE = 42

In [None]:
# Seed NumPy's global RNG and set the number of threads torch may use.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
# Binary task definition, and a pandas reader configured with N_FOLDS as `cv`
# and a fixed random state.
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

In [None]:
# LightGBM model whose only role here is to feed the importance-based selector.
selector_lgbm_params = dict(learning_rate=0.05, num_leaves=64, seed=42, num_threads=N_THREADS)
model0 = BoostLGBM(default_params=selector_lgbm_params)

pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()

# Selector built from the feature pipeline, the model and the importance
# estimator, with an importance cutoff of 0.
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

In [None]:
pipe = LGBSimpleFeatures()

# Only model1 is paired with a tuner; model2 is passed to the pipeline on its own.
params_tuner1 = OptunaTuner(n_trials=20, timeout=60)

lgbm_params_1 = dict(learning_rate=0.05, num_leaves=128, seed=1, num_threads=N_THREADS)
lgbm_params_2 = dict(learning_rate=0.025, num_leaves=64, seed=2, num_threads=N_THREADS)
model1 = BoostLGBM(default_params=lgbm_params_1)
model2 = BoostLGBM(default_params=lgbm_params_2)

# First level: two LightGBM models behind the importance-based pre-selection.
level1_algos = [(model1, params_tuner1), model2]
pipeline_lvl1 = MLPipeline(
    level1_algos,
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

In [None]:
pipe1 = LGBSimpleFeatures()

# Second-level LightGBM model; its defaults are frozen via freeze_defaults=True.
level2_lgbm_params = dict(
    learning_rate=0.05,
    num_leaves=64,
    max_bin=1024,
    seed=3,
    num_threads=N_THREADS,
)
model = BoostLGBM(default_params=level2_lgbm_params, freeze_defaults=True)

# Second level: a single model, no feature selection before or after.
pipeline_lvl2 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

In [None]:
# Two-level AutoML: pipeline_lvl1 forms the first level, pipeline_lvl2 the second.
# NOTE(review): with skip_conn=False the second level presumably receives only
# the first level's predictions — confirm against the LightAutoML documentation.
automl = AutoML(reader, [[pipeline_lvl1], [pipeline_lvl2]], skip_conn=False)

Turning our sparse matrix back into a DataFrame.

In [None]:
# Densify the sparse training features into a DataFrame for LightAutoML.
train_df = pd.DataFrame(data=X_train.toarray())

# String column names '0', '1', ... — the same naming the test frame uses.
train_df.columns = train_df.columns.astype(str)

# Attach the label under an explicit name instead of locating it through a
# hard-coded column index, which silently breaks whenever the number of
# features changes. Stored as float, matching the dtype of the feature columns.
train_df['target'] = np.asarray(y_train, dtype=float)

In [None]:
# Fit the AutoML model on the training frame; 'target' is declared as the label column.
# NOTE(review): fit_predict presumably returns out-of-fold predictions for the
# training rows — confirm against the LightAutoML documentation.
autopred = automl.fit_predict(train_df, roles={'target': 'target'})

Turning our sparse matrix back into a DataFrame (now for the test data).

In [None]:
# Dense test features with the integer column labels converted to strings.
test_df = pd.DataFrame(data=X_test.toarray()).rename(columns=str)

In [None]:
# Score the held-out test frame with the fitted AutoML model.
pred = automl.predict(test_df)

In [None]:
# Threshold the first prediction column with the same vfunc cut-off that was
# applied to the logistic regression probabilities.
automl_pred = vfunc(pred.data[:, 0])

In [None]:
# Hold-out scores for the AutoML model.
# The thresholded labels drive accuracy/F1/precision/recall, while ROC AUC is
# computed from the raw prediction scores: it is a ranking metric and is
# independent of the chosen threshold.
automl_scores = pred.data[:, 0]

automl_metrics = pd.DataFrame(data={'metric': ['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
                             'score':[metrics.accuracy_score(y_test, automl_pred),
                                      metrics.f1_score(y_test, automl_pred),
                                      metrics.precision_score(y_test, automl_pred),
                                      metrics.recall_score(y_test, automl_pred),
                                      metrics.roc_auc_score(y_test, automl_scores)]})

AutoML results:

In [None]:
# Display the AutoML score table.
automl_metrics

* Best accuracy of all models
* Best F1 of all models
* Best ROC AUC of all models
* Best trade-off between precision and recall