In [1]:
import load_data

import random
import joblib
import warnings

import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from sklearn.model_selection import ParameterGrid
from sklearn.exceptions import ConvergenceWarning

In [2]:
data = load_data.read_data_sets()

# get train data
train_x = data.train.data

# get train labels
train_labels = data.train.labels

# get test data
test_x = data.test.data

# get test labels
test_labels = data.test.labels

# get sample number
n_samples = data.train.num_examples

# Print the first 10 examples of training data and labels
print("First 10 examples of training data:")
print(train_x[:10])
print()

print("Corresponding labels for the training data:")
print(train_labels[:10])
print()

# Print the first 10 examples of test data and labels
print("First 10 examples of test data:")
print(test_x[:10])
print()

print("Corresponding labels for the test data:")
print(test_labels[:10])
print()

# Print the total number of samples in the dataset
print(f"Total number of samples in the dataset: {n_samples}")

First 10 examples of training data:
[[26.94933313 26.94942352 26.86116345 ... 16.22032493 17.53295088
  17.53075379]
 [26.95540644 26.95561349 26.86818211 ... 16.22077216 17.53405894
  17.53186146]
 [26.96394591 26.96354065 26.87468671 ... 16.22162589 17.53570343
  17.53347345]
 ...
 [26.95091461 26.95146416 26.85930528 ... 16.21242109 17.53291899
  17.53075099]
 [26.93989902 26.94184343 26.84918537 ... 16.21115102 17.5320002
  17.52995518]
 [26.92817907 26.9306008  26.83702679 ... 16.21079389 17.53139138
  17.52946743]]

Corresponding labels for the training data:
[2 2 2 2 2 2 2 2 2 2]

First 10 examples of test data:
[[26.88684078 26.36006069 26.2375236  ... 16.32757215 16.90493887
  16.7821357 ]
 [26.88612478 26.35698842 26.23444401 ... 16.33011158 16.90430262
  16.78066285]
 [26.88125864 26.3487917  26.22622124 ... 16.33296166 16.90424512
  16.77954978]
 ...
 [26.83513111 26.27179449 26.16101443 ... 16.31236921 16.87338631
  16.74132307]
 [26.82404048 26.25446281 26.14994268 ... 16

In [3]:
# Count the unique labels in the training set
unique_train, counts_train = np.unique(train_labels, return_counts=True)
label_counts_train = dict(zip(unique_train, counts_train))

# Count the unique labels in the test set
unique_test, counts_test = np.unique(test_labels, return_counts=True)
label_counts_test = dict(zip(unique_test, counts_test))

print("Training label counts:", label_counts_train)
print("Test label counts:", label_counts_test)

Training label counts: {0: 28602, 1: 26628, 2: 29190}
Test label counts: {0: 18438, 1: 19740, 2: 19950}


In [4]:
# Check if dataset has missing values
missing_rows_count = np.isnan(train_x).any(axis=1).sum()
print(f"Number of rows with missing values: {missing_rows_count}")

Number of rows with missing values: 0


In [5]:
# Balance the training data using SMOTE
smote = SMOTE(random_state=42)
balanced_train_x, balanced_train_labels = smote.fit_resample(train_x, train_labels)

unique_train1, counts_train1 = np.unique(balanced_train_labels, return_counts=True)
label_counts_train1 = dict(zip(unique_train1, counts_train1))

print("Training label counts after balancing:", label_counts_train1)

Training label counts after balancing: {0: 29190, 1: 29190, 2: 29190}


In [6]:
# Shuffle indices
indices = np.arange(balanced_train_x.shape[0])
np.random.shuffle(indices)

# Use shuffled indices to shuffle train_x and train_labels
balanced_train_x_shuffled = balanced_train_x[indices]
balanced_train_labels_shuffled = balanced_train_labels[indices]

# Print the first 10 examples of shuffled training labels
print("First 10 shuffled training labels")
print(balanced_train_labels_shuffled[:10])

First 10 shuffled training labels
[0 0 1 2 0 2 0 1 0 1]


In [7]:
balanced_train_labels_shuffled_encoded = to_categorical(balanced_train_labels_shuffled, num_classes=3)
test_labels_encoded = to_categorical(test_labels, num_classes=3)

print(balanced_train_labels_shuffled_encoded[:10])
print(test_labels_encoded[:10])

[[1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [8]:
# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(balanced_train_x_shuffled)
X_test = scaler.transform(test_x)

y_train = balanced_train_labels_shuffled_encoded
y_test = test_labels_encoded

y_train1 = balanced_train_labels_shuffled
y_test1 = test_labels

print("First 10 examples of training data:")
print(X_train[:10])
print()

print("Corresponding labels for the training data:")
print(y_train[:10])
print()

print("One-Hot encoded labels for the training data:")
print(y_train1[:10])
print()

print("First 10 examples of test data:")
print(X_test[:10])
print()

print("Corresponding labels for the test data:")
print(y_test[:10])
print()

print("One-Hot encoded labels for the test data:")
print(y_test1[:10])
print()

First 10 examples of training data:
[[-1.56955821 -1.44485175 -1.29187886 ... -0.61953812 -0.58817583
  -0.64833758]
 [ 1.18939037  1.72235096  1.34105149 ...  0.72793209  0.34534574
   0.17853108]
 [ 0.74646566  0.44659276  0.54034448 ...  0.60687866 -0.10396648
   0.06618927]
 ...
 [-0.88103756 -1.25887047 -1.35970375 ... -0.86265608 -0.81529286
  -1.00670976]
 [-1.08720664 -0.70336561 -1.21834653 ... -0.9841089  -0.93170511
   0.02157167]
 [ 0.24347842  0.04448774  0.32732339 ... -0.83747358 -1.07470651
   5.99647619]]

Corresponding labels for the training data:
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]

One-Hot encoded labels for the training data:
[0 0 1 2 0 2 0 1 0 1]

First 10 examples of test data:
[[ 0.44138796 -0.00202854 -0.1746563  ... -0.41053994 -0.00116352
  -0.22655811]
 [ 0.4407388  -0.00470152 -0.17757255 ... -0.40779565 -0.0018294
  -0.22787341]
 [ 0.43632692 -0.01183294 -0.18535919 ... -

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Define the parameter grid for logistic regression
param_grid_lr = {
    'penalty': ['l1', 'l2','none'],
    'C': [0.01, 0.1, 1, 10, 100],
    #'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 500, 1000]
}

# Initialize the logistic regression model
log_reg = LogisticRegression(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid_lr, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the accuracy of each model in the grid search
print("Grid search results:")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print(f"{mean:.4f} (+/-{std * 2:.4f}) for {params}")

# Print the best parameters found by the grid search
print("Best parameters found by grid search:")
print(grid_search.best_params_)

# Use the best estimator found by grid search to make predictions
best_log_reg = grid_search.best_estimator_
y_pred_log_reg = best_log_reg.predict(X_val)

# Print classification report and accuracy
print("Logistic Regression with Best Parameters:")
print(classification_report(y_val, y_pred_log_reg))
print(f"Accuracy: {accuracy_score(y_val, y_pred_log_reg):.4f}")

In [None]:
# Set random seeds for reproducibility
np.random.seed(43)
random.seed(43)

# Suppress convergence warnings
warnings.filterwarnings('ignore')

# Define grid search parameters
param_grid_lr = {
    'penalty': ['l1', 'l2', 'none'],
    'C': [0.01, 0.1, 1],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 500, 1000]
}

# Initialize best accuracy and hyperparameters
best_accuracy_lr = 0
best_params_lr = {}
best_model_path = 'Best_LR_model.pkl'

# Loop over parameter combinations
for params in ParameterGrid(param_grid_lr):
    try:
        print(f"Training with params: {params}")
        
        # Create the Logistic Regression model
        model_LR = LogisticRegression(solver=params['solver'], penalty=params['penalty'], C=params['C'], max_iter=params['max_iter'], random_state=43)
        
        # Train the model
        model_LR.fit(X_train, y_train1)
        
        # Evaluate the model
        y_pred = model_LR.predict(X_test)
        accuracy = accuracy_score(y_test1, y_pred)
        print(f"Test accuracy: {accuracy:.4%}")
        
        
        # Check if this model is the best so far
        if accuracy > best_accuracy_lr:
            best_accuracy_lr = accuracy
            best_params_lr = params
            joblib.dump(model_LR, best_model_path)
            print(f"Saved the best model with accuracy: {accuracy:.4%}")
        
    except ValueError as e:
        print(f"Skipping parameters {params} due to error: {e}")
    print()

print(f"Best accuracy of Logistic Regression: {best_accuracy_lr:.4%} with params: {best_params_lr}")

Training with params: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'newton-cg'}
Skipping parameters {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'newton-cg'} due to error: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Training with params: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'lbfgs'}
Skipping parameters {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'lbfgs'} due to error: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Training with params: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Test accuracy: 71.2995%
Saved the best model with accuracy: 71.2995%

Training with params: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'sag'}
Skipping parameters {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'sag'} due to error: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

Training with params: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1