## A sample of using the classifier

### 1. Importing the modules and setting parameters

In [130]:
# Custom utilities for working with weather data
import utils_weather as we

In [131]:
import pandas as pd

In [132]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import sklearn.metrics as metric

# List of classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [133]:
we.get_search()

dict_keys(['logistic-0', 'logistic-1', 'logistic-2'])

In [134]:
we.get_search("logistic-0")

{'logistic__C': [0.001, 0.1, 1, 10, 1000]}

In [135]:
SCALER = StandardScaler()
ENCODER = OneHotEncoder()
MODEL = LogisticRegression(max_iter=10000)
MODEL_TYPE = "logistic"

SEARCH_PARAM = we.get_search("logistic-0")
CV_PARAM = 3  # GridSearchCV(cv=)
VB_PARAM = 1  # GridSearchCV(verbose=)

In [136]:
print("Columns selection:", we.get_columns())

Columns selection: ['origin', 'small', 'big', 'var-1']


In [137]:
COLUMNS = 'origin'
print(we.get_columns(COLUMNS))

['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']


In [138]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# FutureWarning: is_categorical is deprecated and will be removed
# in a future version.  Use is_categorical_dtype instead

### 2. Loading raw data, cleaning and splitting it

In [139]:
raw_data = pd.read_csv('data/current_data.csv')

In [140]:
raw_data.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No


In [141]:
raw_data['RainTomorrow'].value_counts(normalize=True)

No     0.776878
Yes    0.223122
Name: RainTomorrow, dtype: float64

In [142]:
raw_data['RainTomorrow'].isna().sum()

2684

In [143]:
raw_data = raw_data.dropna(subset=['RainTomorrow'])
raw_data['RainTomorrow'].isna().sum()

0

In [144]:
X, y = we.get_data(raw_data, columns=COLUMNS, target='RainTomorrow')

In [145]:
print(X.shape)
print(y.shape)

(116219, 22)
(116219,)


In [146]:
X.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,December,Albury,13.4,22.9,0.6,,,W,44.0,W,...,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0
1,December,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,0.0
2,December,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0
3,December,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,0.0
4,December,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0


In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [148]:
print(X_train.shape)
print(X_test.shape)

(87164, 22)
(29055, 22)


### 3. Features preprocessing

In [149]:
bin_features, num_features, cat_features = we.get_3group_features(X_train)
print(bin_features)
print(num_features)
print(cat_features)

['RainToday']
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']


In [150]:
binary_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent'))
])

numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', SCALER)
])

categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', ENCODER)
])

In [151]:
preprocessor = ColumnTransformer(transformers=[
        ('bin', binary_transformer, bin_features),
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
])

In [152]:
model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        (MODEL_TYPE, MODEL)
])

### 4. Target preprocessing

In [153]:
print(y_train.shape)
print(y_test.shape)

(87164,)
(29055,)


In [154]:
y_train.unique()

array([0, 1])

In [155]:
target_imputer = SimpleImputer(strategy='most_frequent')

In [156]:
target_imputer.fit(y_train.values.reshape(-1, 1))

SimpleImputer(strategy='most_frequent')

In [157]:
y_train = target_imputer.transform(
                    y_train.values.reshape(-1, 1)
                ).ravel()
y_test = target_imputer.transform(
                    y_test.values.reshape(-1, 1)
                ).ravel()

### 5. Modelling and search optimal hyperparameters

In [158]:
search = GridSearchCV(estimator=model,
                      param_grid=SEARCH_PARAM,
                      cv=CV_PARAM,
                      verbose=VB_PARAM)
search.fit(X_train, y_train);

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.4min finished


In [159]:
print("Best parameter (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.8484):
{'logistic__C': 1}


In [160]:
search.score(X_test, y_test)

0.8524866632249183

In [161]:
predict = search.predict(X_test)

In [162]:
metric.roc_auc_score(y_test, predict)

0.7339311014491384

In [163]:
metric.accuracy_score(y_test, predict)

0.8524866632249183

In [164]:
metric.confusion_matrix(y_test, predict)

array([[21442,  1252],
       [ 3034,  3327]])

In [165]:
# Diagonal Recall values True
# Normalizes confusion matrix over the true (rows)
metric.confusion_matrix(y_test, predict, normalize='true')

array([[0.94483123, 0.05516877],
       [0.47696903, 0.52303097]])

In [166]:
# Diagonal Precision values True
# Normalizes confusion matrix over the predicted (columns) conditions
metric.confusion_matrix(y_test, predict, normalize='pred')

array([[0.87604184, 0.27342214],
       [0.12395816, 0.72657786]])

In [167]:
# Normalizes confusion matrix over the all the population
metric.confusion_matrix(y_test, predict, normalize='all')

array([[0.73797969, 0.04309069],
       [0.10442265, 0.11450697]])

In [168]:
print(metric.classification_report(y_test, predict, digits=3))

              precision    recall  f1-score   support

           0      0.876     0.945     0.909     22694
           1      0.727     0.523     0.608      6361

    accuracy                          0.852     29055
   macro avg      0.801     0.734     0.759     29055
weighted avg      0.843     0.852     0.843     29055



In [128]:
pd.DataFrame(search.cv_results_).T

Unnamed: 0,0,1,2,3,4
mean_fit_time,4.46785,6.08313,7.30541,6.10447,5.85484
std_fit_time,0.45768,0.110953,0.286353,0.113344,0.301119
mean_score_time,0.165739,0.166667,0.160303,0.171726,0.162896
std_score_time,0.00808359,0.0107676,0.00151264,0.0109783,0.00348058
param_logistic__C,0.001,0.1,1,10,1000
params,{'logistic__C': 0.001},{'logistic__C': 0.1},{'logistic__C': 1},{'logistic__C': 10},{'logistic__C': 1000}
split0_test_score,0.843848,0.850628,0.850387,0.850284,0.850353
split1_test_score,0.844502,0.849664,0.849596,0.849596,0.84963
split2_test_score,0.839609,0.847732,0.847904,0.848076,0.848007
mean_test_score,0.842653,0.849341,0.849296,0.849319,0.84933
