<a href="https://colab.research.google.com/github/sergioGarcia91/ML_Carolina_Bays/blob/main/05_LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this Notebook, 30 logistic regression models will be trained. Since category 0 has a higher number of pixels compared to category 1, **downsampling** will be applied to category 0.  

The process will involve iteratively separating the data from both categories. In each iteration, the number of samples in category 1 will be counted, and the same number of samples from category 0 will be randomly selected to balance the dataset.  

To increase randomness in training, a new **train-test split** will be performed in each iteration, ensuring that the training data for category 1 varies in every cycle.  


# Start

In [None]:
# PyTables ("tables") is the backend pandas needs for pd.read_hdf, which this
# notebook uses to load the training dataset.
!pip install tables

In [None]:
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import h5py
import multiprocessing
import joblib

from IPython.display import clear_output
from sklearn.linear_model import LogisticRegression

In [None]:
# Connect to Google Drive.
# The dataset and model folders used by this notebook live under
# /content/drive/MyDrive, so the Drive must be mounted before they are read.
from google.colab import drive
drive.mount('/content/drive')

# Load data

In [None]:
# Folder on the mounted Drive that holds the exported dataset files.
path_saveCSV = '/content/drive/MyDrive/UIS/Doctorado_UIS2198589/1_semestre/TopicosAvanzadosGeofisica/FC_CarolinaBais/Dataset_CSV'

# Read the training table stored under the 'df' key of the HDF5 file.
hdf_train_file = os.path.join(path_saveCSV, 'TRAIN_CarolinaBays_AOI_01_03.h5')
df = pd.read_hdf(hdf_train_file, key='df')

# Preview the first rows (displayed as the cell output).
df.head()

In [None]:
# Author's note from the original run: total of data 103327744.
# df.info() prints a concise summary of the DataFrame (index range, columns,
# dtypes and memory usage).
df.info()

# Split and training

In [None]:
# Number of CPU cores reported for this runtime. In the cells shown here the
# value is only printed for information; the model itself is configured with
# n_jobs=-1 rather than with num_cores.
num_cores = multiprocessing.cpu_count()
print(f"Number of available cores: {num_cores}")


In [None]:
# Output folder for the trained models and the summary CSV.
# The trailing slash is kept on purpose: file names are appended to this
# string with `+` elsewhere in the notebook.
path_save_models = '/content/drive/MyDrive/UIS/Doctorado_UIS2198589/1_semestre/TopicosAvanzadosGeofisica/FC_CarolinaBais/ML_models/'


In [None]:
# ---------------------------------------------------------------------------
# Train logistic-regression models on balanced (downsampled) data.
#
# Every trial draws a fresh random split:
#   * category 1 is shuffled and divided 80% / 20% into train / test;
#   * category 0 is downsampled so that BOTH the train and the test partition
#     contain exactly as many category-0 rows as category-1 rows.
# A model is saved only when its train score is above `target_score`; trials
# are repeated until `total_models` is reached.
# NOTE: there is no upper bound on the number of trials, so the loop only
# ends once enough models beat `target_score`.
# ---------------------------------------------------------------------------

print_text = True           # Print split sizes and array shapes
print_text_Training = True  # Print scores and timing of accepted models
verbose_print = True        # Forwarded to LogisticRegression(verbose=...)

count_models = 31 # Indicate the model number that will be saved
# If the process was stopped and you want to continue from the previous amount,
# you should specify the number from which you want to start

total_models = count_models + 10  # This run stops after saving 10 models

target_score = 0.6 # In the tests, it never exceeded a score of 0.6

count_trial = 1

# Bookkeeping for the accepted models (one entry per saved model)
train_score_list = []
test_score_list = []
models_name_list = []
elapsed_time_list = []
trial_list = []

# The class filters and the split sizes do not change between trials, so they
# are computed once here instead of on every iteration. Boolean indexing
# already returns new DataFrames, so `df` itself is never modified.
class_1_rows = df[df['y'] == 1]
class_0_rows = df[df['y'] == 0]

# Calculate 80% for train and 20% for test (sizes are per category)
train_size = int(0.8 * len(class_1_rows))
test_size = len(class_1_rows) - train_size

# Only this many category-0 rows are needed per trial; drawing just those
# avoids shuffling the whole (much larger) category-0 table every time.
n_class_0_needed = min(len(class_0_rows), train_size + test_size)


while count_models < total_models:
  # Start the timer
  start_time = time.time() # Each iteration takes less than 10 minutes

  # Drop the output of the previous trial so the cell output stays short
  clear_output(wait=True)

  # Random order for category 1, random downsample for category 0
  category_data_1 = class_1_rows.sample(frac=1).reset_index(drop=True)
  category_data_0 = class_0_rows.sample(n=n_class_0_needed).reset_index(drop=True)

  # Split into train and test; the category-0 test slice is capped at
  # `test_size` rows so the test set is balanced like the train set
  category_train_1 = category_data_1[:train_size]
  category_train_0 = category_data_0[:train_size]
  category_test_1 = category_data_1[train_size:]
  category_test_0 = category_data_0[train_size:train_size + test_size]

  if print_text:
    print(f'Train size: {len(category_train_1)*2}')
    print(f'Test size: {len(category_test_1)*2}')
    print('---'*3)

  # Merge both categories and shuffle so the classes are interleaved
  df_train = pd.concat([category_train_1, category_train_0], ignore_index=True)
  df_train = df_train.sample(frac=1).reset_index(drop=True)
  df_test = pd.concat([category_test_1, category_test_0], ignore_index=True)
  df_test = df_test.sample(frac=1).reset_index(drop=True)

  if print_text:
    print(f'DF Train size: {df_train.shape[0]}')
    print(f'DF Test size: {df_test.shape[0]}')
    print('\n')

  # Data to Train and Test: every column except the label 'y' is a feature
  # (selected by name, so it does not depend on 'y' being the last column)
  X_train = df_train.drop(columns='y').to_numpy()
  y_train = df_train['y'].to_numpy()

  X_test = df_test.drop(columns='y').to_numpy()
  y_test = df_test['y'].to_numpy()

  if print_text:
    print('Shapes X_train, y_train, X_test, y_test')
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

  # Create the model
  model_LogReg = LogisticRegression(solver='saga',
                                    verbose=verbose_print,
                                    n_jobs=-1) # -1 means using all processors

  # Train the model
  print('---'*10)
  print(f'Trial: {count_trial}')
  model_LogReg.fit(X_train, y_train)

  # End the timer: the elapsed time covers data preparation and fitting,
  # but not the scoring below
  end_time = time.time()
  elapsed_time = end_time - start_time

  train_score = model_LogReg.score(X_train, y_train)
  test_score = model_LogReg.score(X_test, y_test)

  if train_score > target_score:
    if print_text_Training:
      print(f'Train score: {train_score:.4f}')
      print(f'Test score: {test_score:.4f}')
      print(f'Elapsed time: {elapsed_time:.2f} seconds')
      print('\n')

    # Save model; the number is zero-padded to three digits (e.g. 031)
    Name = f'model_RegLog_{count_models:03d}.pkl'

    joblib.dump(model_LogReg, os.path.join(path_save_models, Name))
    print(f'---> Model saved as {Name}')
    print('\n')

    train_score_list.append(train_score)
    test_score_list.append(test_score)
    models_name_list.append(Name)
    elapsed_time_list.append(round(elapsed_time, 2))
    trial_list.append(count_trial)

    count_models += 1

  else:
    # Model rejected: nothing is saved and the model counter is not advanced
    print(f'Train score: {train_score:.4f}')
    print(f'Elapsed time: {elapsed_time:.2f} seconds')
    print('No model was generated.')
    print('\n')

  count_trial += 1


# Df models

In [None]:
# Collect the per-model records gathered during training into one table.
# Column order: trial number, saved file name, train/test scores and the
# elapsed time recorded for that iteration.
summary_columns = ('Trial', 'Model', 'Train score', 'Test score', 'Elapsed time')
summary_values = (trial_list,
                  models_name_list,
                  train_score_list,
                  test_score_list,
                  elapsed_time_list) # Total time per iteration

dict_model = dict(zip(summary_columns, summary_values))

df_models = pd.DataFrame(dict_model)

df_models

In [None]:
# Summary statistics of the numeric columns of df_models, rounded to two
# decimals for display.
df_models.describe().round(2)

## Save Df

In [None]:
# Write the model summary next to the saved models.
# The CSV uses ';' as field separator and ',' as decimal mark, and the
# DataFrame index is not written.
# NOTE(review): the file name says 30 models, but df_models only contains the
# models recorded by the current run of the training cell — confirm that an
# earlier summary file is not being overwritten unintentionally.
df_models.to_csv(path_save_models + 'df_30models_LogReg.csv',
                 sep=';',
                 decimal=',',
                 index=False)

# End