# Logistic Regression

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from logistic_regression import MultinomialLogisticRegression
from preprocess import Preprocess
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [4]:
# Paths of the raw data
# Plase changhe this in order to execute the code on your machine
raw_train_data = "../data/train.csv"
raw_test_data = "../data/test.csv"

In [70]:
# Preprocessing
preproc = Preprocess()
raw_data = preproc.load_data(raw_train_data)

# Selecting data from each label
raw_data_0 = raw_data[raw_data["Label"]== 0]
raw_data_1 = raw_data[raw_data["Label"]== 1]
raw_data_2 = raw_data[raw_data["Label"]== 2]

# Split train and test
train_df, train_data = preproc.preprocess_data(raw_data, drop_cols=["SNo", "time"])
np.random.shuffle(train_data)
X_train, y_train, X_valid, y_valid = preproc.train_valid_split(
    train_data, test_size=0.33#, random_state=42
)

# Check columns
print(train_df.columns)

# Normalize data
X_train = preproc.normalize_data(X_train)
X_valid = preproc.normalize_data(X_valid)

Index(['lat', 'lon', 'TMQ', 'U850', 'V850', 'UBOT', 'VBOT', 'QREFHT', 'PS',
       'PSL', 'T200', 'T500', 'PRECT', 'TS', 'TREFHT', 'Z1000', 'Z200', 'ZBOT',
       'month', 'Label'],
      dtype='object')


In [71]:
# Grid search
regularizers = np.array([0.001, 0.01, 0.05, 0.1])
learning_rates = np.array([0.01, 0.1])

# Create all combinations
params_combination = np.array(
    np.meshgrid(regularizers, learning_rates)
).T.reshape(-1, 2)

# Lists to store results
valid_errors = []
valid_errors_idx = []

# Grid search
for reg, l_rate in params_combination:
    # fit model
    model = MultinomialLogisticRegression(learning_rate=l_rate, num_iterations=500, regularizer=reg)
    model.fit(X_train, y_train, collist=raw_data.columns, valid_x=X_valid, valid_y=y_valid)
    
    # Save min error index
    min_error_idx = np.argmin(model.valid_errors)
    valid_errors_idx.append(min_error_idx)
    
    # Save min error
    min_error = np.min(model.valid_errors)
    valid_errors.append(min_error)

Epoch 0, cross entropy loss: 1.0986122886681096
Epoch 100, cross entropy loss: 0.993644483162475
Epoch 200, cross entropy loss: 0.9573295595468179
Epoch 300, cross entropy loss: 0.9290054333613618
Epoch 400, cross entropy loss: 0.9046738293644808
Epoch 0, cross entropy loss: 1.0986122886681096
Epoch 100, cross entropy loss: 0.8128941512959614
Epoch 200, cross entropy loss: 0.7594128543499958
Epoch 300, cross entropy loss: 0.7377492921865354
Epoch 400, cross entropy loss: 0.7246261212758425
Epoch 0, cross entropy loss: 1.0986122886681096
Epoch 100, cross entropy loss: 0.9939041136820228
Epoch 200, cross entropy loss: 0.9579163742845913
Epoch 300, cross entropy loss: 0.9299144087559139
Epoch 400, cross entropy loss: 0.9058872591352831
Epoch 0, cross entropy loss: 1.0986122886681096
Epoch 100, cross entropy loss: 0.8155826213540521
Epoch 200, cross entropy loss: 0.7637242609954311
Epoch 300, cross entropy loss: 0.7433273375947094
Epoch 400, cross entropy loss: 0.7313705234806887
Epoch 0, 

Validation errors and indices

In [75]:
(valid_errors)

[0.889627447746625,
 0.723274806487763,
 0.8911010637220999,
 0.7310484515582987,
 0.8976504680575431,
 0.7655979852051232,
 0.9058372234768473,
 0.8050143776937151]

In [33]:
valid_errors_idx

[49, 5, 49, 5, 42, 4, 33, 3]

Which were the best parameters?

In [34]:
opt_params = params_combination[np.argmin(valid_errors)]
opt_params

array([0.001, 0.1  ])

Which was the best iteration (it's divided by 10)

In [76]:
ite = valid_errors_idx[np.argmin(valid_errors)]
ite

49

In [77]:
# Fit the model
model = MultinomialLogisticRegression(learning_rate=0.001, num_iterations=1000, regularizer=opt_params[1])
model.fit(X_train, y_train, collist=raw_data.columns, valid_x=X_valid, valid_y=y_valid)

# Make predictions.
predictions = model.predict(X_valid)
print(train_df.columns)

model.get_metrics(y_valid, predictions, return_values=False)

Epoch 0, cross entropy loss: 1.0986122886681096
Epoch 100, cross entropy loss: 1.0725466486156243
Epoch 200, cross entropy loss: 1.0546504317767231
Epoch 300, cross entropy loss: 1.0416111896344893
Epoch 400, cross entropy loss: 1.0315783859446133
Epoch 500, cross entropy loss: 1.0234864746815464
Epoch 600, cross entropy loss: 1.0167005667694677
Epoch 700, cross entropy loss: 1.0108287643332818
Epoch 800, cross entropy loss: 1.0056204618415454
Epoch 900, cross entropy loss: 1.0009097622238567
Index(['lat', 'lon', 'TMQ', 'U850', 'V850', 'UBOT', 'VBOT', 'QREFHT', 'PS',
       'PSL', 'T200', 'T500', 'PRECT', 'TS', 'TREFHT', 'Z1000', 'Z200', 'ZBOT',
       'month', 'Label'],
      dtype='object')
Confusion Matrix:
[[6590 2864 2145]
 [ 137  473    8]
 [ 671  491 1392]] 

Accuracy:
0.5724 

Precision:
0.469 

Recall:
0.6262 

F1 Score:
0.5363


This is a bad accuracy. I will explain more in the report.