# Logistic Regression

In [105]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE

### Data Preprocessing

In [126]:
# Load data
#train = pd.read_csv(f"dataset_original.csv")
#train = pd.read_csv(f"dataset_160k.csv")
train = pd.read_csv(f"dataset_30k.csv")

# shuffle the dataset
train = train.sample(frac=1).reset_index(drop=True)

In [127]:
# PreferedOrderCat
train.loc[train["PreferedOrderCat"] == "Laptop & Accessory", "PreferedOrderCat"] = "Laptop_Accessory"
train.loc[train["PreferedOrderCat"] == "Mobile Phone", "PreferedOrderCat"] = "Mobile"

#PreferredPaymentMode
train.loc[train["PreferredPaymentMode"] == "Debit Card", "PreferredPaymentMode"] = "DebitCard"
train.loc[train["PreferredPaymentMode"] == "Credit Card", "PreferredPaymentMode"] = "CreditCard"
train.loc[train["PreferredPaymentMode"] == "CC", "PreferredPaymentMode"] = "CreditCard"
train.loc[train["PreferredPaymentMode"] == "E wallet", "PreferredPaymentMode"] = "Ewallet"
train.loc[train["PreferredPaymentMode"] == "Cash on Delivery", "PreferredPaymentMode"] = "COD"

#PreferredLoginDevice
train.loc[train["PreferredLoginDevice"] == "Mobile Phone", "PreferredLoginDevice"] = "Mobile"
train.loc[train["PreferredLoginDevice"] == "Phone", "PreferredLoginDevice"] = "Mobile"

In [128]:
# Drop the 'CustomerID' column since it's not useful for prediction
X = train.drop('CustomerID', axis=1)

# Separate the target variable from the rest of the dataset
y = train['Churn']

X = X.drop(columns=['Churn'], axis=1)
#X = X.drop(columns=[], axis=1)

# Perform one-hot encoding on the categorical features
cat_cols = ['PreferredLoginDevice', 'PreferredPaymentMode', 'PreferedOrderCat','Gender','MaritalStatus']
X = pd.get_dummies(X, columns=cat_cols)

# Fill missing values with mean
#X = X.fillna(0)
#X.fillna(X.mode().iloc[0], inplace=True)
X.fillna(X.mean(), inplace=True)
# X.fillna(X.median(), inplace=True)
# X.fillna(method='ffill', inplace=True)
# X.fillna(method='bfill', inplace=True)
#X.interpolate(method='linear', inplace=True)

#y = y.fillna(0)
#y.fillna(y.mode().iloc[0], inplace=True)
y.fillna(y.mean(), inplace=True)
# y.fillna(y.median(), inplace=True)
# y.fillna(method='ffill', inplace=True)
# y.fillna(method='bfill', inplace=True)
#y.interpolate(method='linear', inplace=True)

In [129]:
num_cols = X.columns.tolist()
for col in cat_cols:
    if col in num_cols:
        num_cols.remove(col)

# Normalize the numerical features using min-max scaling
X[num_cols] = (X[num_cols] - X[num_cols].min()) / (X[num_cols].max() - X[num_cols].min())

# Convert the target variable to binary (0 or 1)
y = y.astype(int)

print(X.shape)
print(y.shape)


(30630, 30)
(30630,)


In [130]:
def apply_smote(X, y, random_state=None):
    """
    Applies SMOTE to the input features (X) and target variable (y) to balance the dataset.
    
    Parameters:
    X: numpy array or pandas DataFrame with the input features
    y: numpy array or pandas Series with the target variable
    random_state: int, default=None, controls the randomness of the SMOTE algorithm
    
    Returns:
    X_resampled: numpy array with the resampled input features
    y_resampled: numpy array with the resampled target variable
    """
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

In [131]:
X, y = apply_smote(X, y, random_state=42)
print(X.shape)
print(y.shape)

(50312, 30)
(50312,)


In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

y_test = np.array(y_test).reshape(-1, 1)

(40249, 30) (40249,) (10063, 30) (10063,)


### The model

In [141]:
# Define hyperparameters
learning_rate = 0.001
epochs = 10000

patience = 5
best_val_loss = float('inf')
patience_counter = 0


In [134]:
class LogisticRegression(nn.Module):
    def __init__(self, input_size):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, 1)


    def forward(self, x):
        out = self.linear(x)
        out = torch.sigmoid(out)
        return out
 
    def loss_function(self, y_pred, y_true):
        loss = F.binary_cross_entropy(y_pred, y_true)

        return loss


In [135]:
model = LogisticRegression(X_train.shape[1])

In [136]:
# Define the loss function
criterion = nn.BCELoss()

# Define the optimizer
# Set the learning rate and weight decay (L2 regularization) for the Adam optimizer
learning_rate = 0.001
weight_decay = 0.001

#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)



#### Train the model

In [137]:
y_train.shape
type(y_train)

pandas.core.series.Series

In [138]:
train_data = X_train.values
train_labels = y_train.values


In [142]:
# Train the model
for epoch in range(epochs):
    inputs = torch.from_numpy(train_data).float()
    labels = torch.from_numpy(train_labels).float().view(-1, 1)

    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # Backward pass and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print(f"Epoch {epoch}: Loss={loss.item():.4f}")

print(f"Epoch {epoch}: Loss={loss.item():.4f}")

Epoch 0: Loss=0.5813
Epoch 1000: Loss=0.5688
Epoch 2000: Loss=0.5588
Epoch 3000: Loss=0.5510
Epoch 4000: Loss=0.5449
Epoch 5000: Loss=0.5406
Epoch 6000: Loss=0.5378
Epoch 7000: Loss=0.5364
Epoch 8000: Loss=0.5360
Epoch 9000: Loss=0.5359
Epoch 9999: Loss=0.5359


### Evaluate

In [119]:
test_data = X_test.values

In [120]:
# Evaluate the model on the test set
inputs = torch.from_numpy(test_data).float()

In [121]:
labels = torch.from_numpy(y_test).float().view(-1, 1)

In [122]:
outputs = model(inputs)

In [123]:
predicted = outputs.round().detach().numpy()

In [124]:
accuracy = np.mean(predicted == y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8650


### F1 Score

In [125]:
from sklearn.metrics import confusion_matrix, f1_score

# Convert the predictions to binary labels (0 or 1)
y_pred_binary = (predicted > 0.5).astype(int)

# calculate confusion matrix
confusion = confusion_matrix(y_test, y_pred_binary)

# calculate precision and recall
precision = confusion[1, 1] / (confusion[1, 1] + confusion[0, 1])
recall = confusion[1, 1] / (confusion[1, 1] + confusion[1, 0])

# calculate F1 score
f1 = 2 * (precision * recall) / (precision + recall)

print(confusion[1,1])
print(confusion[0,1])
print(f'Precision: { precision}, Recall: {recall}')
print('F1 Score:', f1)

38
4
Precision: 0.9047619047619048, Recall: 0.34545454545454546
F1 Score: 0.5
