## A sample of using the classifier

### 1. Importing the modules and setting parameters

In [46]:
# Custom utilities for working with weather data
import utils_weather as we

In [47]:
import pandas as pd

In [48]:
import category_encoders as ce

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# List of classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics
import sklearn.metrics as metric
from sklearn.metrics import make_scorer, fbeta_score

rain_score = make_scorer(fbeta_score, beta=2, pos_label=1)

In [49]:
we.get_search()

dict_keys(['logreg-0', 'logreg-1', 'logreg-2', 'knn-0', 'knn-1', 'knn-2', 'knn-3', 'svc-0', 'tree-0', 'tree-1', 'tree-2', 'forest-0', 'forest-1', 'forest-2', 'forest-3', 'mpl-0', 'mpl-1'])

In [50]:
we.get_search("logreg-2")

{'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
 'logreg__penalty': ['l2'],
 'logreg__solver': ['liblinear', 'newton-cg', 'lbfgs']}

In [51]:
models = {
    'logreg': LogisticRegression(max_iter=10000),
    'mpl': MLPClassifier(max_iter=400),
    'knn': KNeighborsClassifier(),
    'svc': SVC(),
    'tree': DecisionTreeClassifier(),
    'forest': RandomForestClassifier()
}

SCALER = StandardScaler()
ENCODER = OneHotEncoder()

MODEL_TYPE = "logreg"
MODEL = models.get(MODEL_TYPE)

SEARCH_PARAM = we.get_search("logreg-0")
CV_PARAM = 3  # GridSearchCV(cv=)
VB_PARAM = 4  # GridSearchCV(verbose=)

In [92]:
MODEL

LogisticRegression(max_iter=10000)

In [52]:
print("Columns selection:", we.get_columns())

Columns selection: ['origin', 'small', 'big', 'var-1', 'var-2']


In [53]:
COLUMNS = 'big'
print(we.get_columns(COLUMNS))

['Date', 'Location', 'MinTemp', 'MaxTemp', 'Diff_Temp', 'Temp9am', 'Temp3pm', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir_Change', 'WindSpeed_Diff', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Humidity_Diff', 'Pressure9am', 'Pressure3pm', 'Pressure_Diff', 'Cloud9am', 'Cloud3pm', 'RainToday', 'RainTomorrow']


In [54]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# FutureWarning: is_categorical is deprecated and will be removed
# in a future version.  Use is_categorical_dtype instead

### 2. Loading raw data, cleaning and splitting it

In [55]:
raw_data = pd.read_csv('data/current_data.csv')

In [56]:
raw_data.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No


In [57]:
raw_data['RainTomorrow'].value_counts(normalize=True)

No     0.776878
Yes    0.223122
Name: RainTomorrow, dtype: float64

In [58]:
raw_data['RainTomorrow'].isna().sum()

2684

In [59]:
raw_data = raw_data.dropna(subset=['RainTomorrow'])
raw_data['RainTomorrow'].isna().sum()

0

In [60]:
X, y = we.get_data(raw_data, columns=COLUMNS, target='RainTomorrow')

In [61]:
print(X.shape)
print(y.shape)

(116219, 27)
(116219,)


In [62]:
X.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Diff_Temp,Temp9am,Temp3pm,Rainfall,Evaporation,Sunshine,...,WindSpeed3pm,Humidity9am,Humidity3pm,Humidity_Diff,Pressure9am,Pressure3pm,Pressure_Diff,Cloud9am,Cloud3pm,RainToday
0,December,Albury,13.4,22.9,9.5,16.9,21.8,0.6,,,...,24.0,71.0,22.0,49.0,1007.7,1007.1,0.6,8.0,,0.0
1,December,Albury,7.4,25.1,17.7,17.2,24.3,0.0,,,...,22.0,44.0,25.0,19.0,1010.6,1007.8,2.8,,,0.0
2,December,Albury,12.9,25.7,12.8,21.0,23.2,0.0,,,...,26.0,38.0,30.0,8.0,1007.6,1008.7,1.1,,2.0,0.0
3,December,Albury,9.2,28.0,18.8,18.1,26.5,0.0,,,...,9.0,45.0,16.0,29.0,1017.6,1012.8,4.8,,,0.0
4,December,Albury,17.5,32.3,14.8,17.8,29.7,1.0,,,...,20.0,82.0,33.0,49.0,1010.8,1006.0,4.8,7.0,8.0,0.0


In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [64]:
print(X_train.shape)
print(X_test.shape)

(87164, 27)
(29055, 27)


### 3. Features preprocessing

In [65]:
bin_features, num_features, cat_features = we.get_3group_features(X_train)
print(bin_features)
print(num_features)
print(cat_features)

['WindDir_Change', 'RainToday']
['MinTemp', 'MaxTemp', 'Diff_Temp', 'Temp9am', 'Temp3pm', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed_Diff', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Humidity_Diff', 'Pressure9am', 'Pressure3pm', 'Pressure_Diff', 'Cloud9am', 'Cloud3pm']
['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']


In [66]:
# num_features.remove('Rainfall')
# num_features.remove('Evaporation')
# num_features.remove('Sunshine')
# cat_features.remove('Location')

In [67]:
binary_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent'))
])

numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', SCALER)
])

categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', ENCODER)
])

In [68]:
preprocessor = ColumnTransformer(transformers=[
        ('bin', binary_transformer, bin_features),
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
])

In [69]:
model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        (MODEL_TYPE, MODEL)
])

### 4. Target preprocessing

In [70]:
print(y_train.shape)
print(y_test.shape)

(87164,)
(29055,)


In [71]:
y_train.unique()

array([1, 0])

In [72]:
target_imputer = SimpleImputer(strategy='most_frequent')

In [73]:
target_imputer.fit(y_train.values.reshape(-1, 1))

SimpleImputer(strategy='most_frequent')

In [74]:
y_train = target_imputer.transform(
                    y_train.values.reshape(-1, 1)
                ).ravel()
y_test = target_imputer.transform(
                    y_test.values.reshape(-1, 1)
                ).ravel()

### 5. Modelling and search optimal hyperparameters

In [75]:
search = GridSearchCV(estimator=model,
                      param_grid=SEARCH_PARAM,
                      cv=CV_PARAM, scoring='recall',
                      verbose=VB_PARAM)
search.fit(X_train, y_train);

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] logreg__C=0.001 .................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..................... logreg__C=0.001, score=0.463, total=   4.5s
[CV] logreg__C=0.001 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.6s remaining:    0.0s


[CV] ..................... logreg__C=0.001, score=0.475, total=   4.3s
[CV] logreg__C=0.001 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.9s remaining:    0.0s


[CV] ..................... logreg__C=0.001, score=0.470, total=   4.3s
[CV] logreg__C=0.01 ..................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.2s remaining:    0.0s


[CV] ...................... logreg__C=0.01, score=0.503, total=   4.9s
[CV] logreg__C=0.01 ..................................................
[CV] ...................... logreg__C=0.01, score=0.512, total=   5.4s
[CV] logreg__C=0.01 ..................................................
[CV] ...................... logreg__C=0.01, score=0.512, total=   5.1s
[CV] logreg__C=0.1 ...................................................
[CV] ....................... logreg__C=0.1, score=0.516, total=   6.2s
[CV] logreg__C=0.1 ...................................................
[CV] ....................... logreg__C=0.1, score=0.522, total=   6.4s
[CV] logreg__C=0.1 ...................................................
[CV] ....................... logreg__C=0.1, score=0.522, total=   6.8s
[CV] logreg__C=1 .....................................................
[CV] ......................... logreg__C=1, score=0.517, total=   7.2s
[CV] logreg__C=1 .....................................................
[CV] .

[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:  2.1min finished


In [76]:
print("Best parameter (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.5209):
{'logreg__C': 100}


In [77]:
print("Best parameter (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.5209):
{'logreg__C': 100}


In [78]:
predict = search.predict(X_test)

In [79]:
metric.recall_score(y_test, predict)

0.5149022042843837

In [80]:
metric.recall_score(y_test, predict, average=None)

array([0.9457834, 0.5149022])

In [81]:
metric.recall_score(y_test, predict, average='micro')

0.8502495267595939

In [82]:
metric.recall_score(y_test, predict, average='macro')

0.7303428016071014

In [83]:
metric.recall_score(y_test, predict, average='weighted')

0.8502495267595939

In [84]:
metric.roc_auc_score(y_test, predict)

0.7303428016071014

In [85]:
metric.accuracy_score(y_test, predict)

0.8502495267595939

In [86]:
metric.confusion_matrix(y_test, predict)

array([[21387,  1226],
       [ 3125,  3317]])

In [87]:
# Diagonal Recall values True
# Normalizes confusion matrix over the true (rows)
metric.confusion_matrix(y_test, predict, normalize='true')

array([[0.9457834, 0.0542166],
       [0.4850978, 0.5149022]])

In [88]:
# Diagonal Precision values True
# Normalizes confusion matrix over the predicted (columns) conditions
metric.confusion_matrix(y_test, predict, normalize='pred')

array([[0.87251142, 0.26986573],
       [0.12748858, 0.73013427]])

In [89]:
# Normalizes confusion matrix over the all the population
metric.confusion_matrix(y_test, predict, normalize='all')

array([[0.73608673, 0.04219584],
       [0.10755464, 0.11416279]])

In [90]:
print(metric.classification_report(y_test, predict, digits=3))

              precision    recall  f1-score   support

           0      0.873     0.946     0.908     22613
           1      0.730     0.515     0.604      6442

    accuracy                          0.850     29055
   macro avg      0.801     0.730     0.756     29055
weighted avg      0.841     0.850     0.840     29055



In [91]:
pd.DataFrame(search.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6
mean_fit_time,4.22264,4.95594,6.28939,7.48872,5.73439,5.64622,5.78246
std_fit_time,0.10964,0.172348,0.277972,0.39979,0.0416081,0.119843,0.228506
mean_score_time,0.170682,0.18507,0.191922,0.180141,0.170839,0.171218,0.173711
std_score_time,0.000838397,0.0113325,0.0202982,0.00706408,0.00123434,0.000802355,0.0046868
param_logreg__C,0.001,0.01,0.1,1,10,100,1000
params,{'logreg__C': 0.001},{'logreg__C': 0.01},{'logreg__C': 0.1},{'logreg__C': 1},{'logreg__C': 10},{'logreg__C': 100},{'logreg__C': 1000}
split0_test_score,0.462675,0.503001,0.51593,0.516546,0.516546,0.516546,0.516546
split1_test_score,0.475062,0.512315,0.522321,0.522475,0.522321,0.522783,0.522629
split2_test_score,0.46952,0.512007,0.521706,0.522629,0.523245,0.523245,0.523245
mean_test_score,0.469085,0.509108,0.519986,0.52055,0.520704,0.520858,0.520807
