In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# Load the training CSV, index it by CENSEOID and discard the CLIENTID / CLIENT columns.
file_path = './data/dataset_train.csv'  # Update to the correct file path
df = (
    pd.read_csv(file_path)
    .set_index('CENSEOID')
    .drop(columns=['CLIENTID', 'CLIENT'])
)

# Target column; print its class counts and stop early if only one class is present.
y = df['V28HCCCODED']
print(y.value_counts())
if y.nunique() <= 1:
    raise ValueError("The dataset contains only one class. Add more diverse samples.")

# Everything except the target is treated as a feature.
X = df.drop(columns='V28HCCCODED')

0    460520
1    100613
Name: V28HCCCODED, dtype: int64


In [5]:
# Force the age-group column to strings so it is picked up as categorical below.
# NOTE(review): astype(str) also stringifies missing values (NaN becomes 'nan'),
# so they will look like a regular category rather than a gap - confirm intended.
age_group_col = 'MEMBERAGEGROUP'
X[age_group_col] = X[age_group_col].astype(str)

# Bucket columns by dtype: int64/float64 -> numerical, object -> categorical.
# Columns of any other dtype end up in neither list.
column_dtypes = X.dtypes
numerical_cols = [
    name for name, kind in column_dtypes.items() if kind in ('int64', 'float64')
]
categorical_cols = [
    name for name, kind in column_dtypes.items() if kind == 'object'
]

In [6]:
# Numerical columns: mean-impute missing values, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]

# Categorical columns: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories unseen at fit time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]

num_transformer = Pipeline(steps=numeric_steps)
cat_transformer = Pipeline(steps=categorical_steps)

In [7]:
# Send each column group through its matching pipeline; the outputs are
# concatenated in the order listed here.
column_routes = [
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [8]:
# Fit the preprocessor on the full feature table and transform it in one step.
# NOTE(review): this fit sees every row, including rows that are later held out
# for testing, so imputation/scaling statistics leak into the evaluation - confirm
# whether the preprocessor should be fitted on the training rows only.
X_processed = preprocessor.fit_transform(X)

# Number of columns after preprocessing; read by build_keras_model as the input width.
input_shape = X_processed.shape[-1]

In [9]:
# Hold out 20% of the preprocessed rows, stratified on the target so both
# splits keep the same class ratio; the seed is fixed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
def build_keras_model():
    """Build and compile the Keras binary classifier.

    Architecture: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid). The input width is read from the
    notebook-level ``input_shape`` variable, which must be set before this
    function is called.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    network = Sequential([
        Dense(64, input_dim=input_shape, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [11]:
print("\nTraining Keras Model...")

# Wrap the Keras builder so scikit-learn utilities can treat it as an estimator.
# NOTE(review): tensorflow.keras.wrappers.scikit_learn is deprecated and absent
# from recent TensorFlow releases - confirm the pinned environment still ships it.
keras_model = KerasClassifier(
    build_fn=build_keras_model,
    epochs=10,
    batch_size=32,
    verbose=0,
)

# 5-fold cross-validated accuracy on the training split; error_score='raise'
# surfaces a failing fold instead of silently scoring it as NaN.
# NOTE(review): the classes are imbalanced, so accuracy alone says little
# about the minority class - consider reporting recall/F1 as well.
scores = cross_val_score(
    keras_model,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
    error_score='raise',
)
accuracy_keras = scores.mean()
print(f'\nKeras Model Accuracy (Cross-Validation): {accuracy_keras:.4f}')


Training Keras Model...


  keras_model = KerasClassifier(build_fn=build_keras_model, epochs=10, batch_size=32, verbose=0)



Keras Model Accuracy (Cross-Validation): 0.8315


In [12]:
class PyTorchNN(nn.Module):
    """Feed-forward binary classifier: input -> 64 -> 32 -> 1.

    Both hidden layers use ReLU followed by dropout (p=0.2); the single
    output unit goes through a sigmoid, so ``forward`` returns probabilities
    in [0, 1] with shape ``(batch, 1)``.
    """

    def __init__(self, input_size):
        """Create the layers for inputs with ``input_size`` features."""
        super().__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        # One dropout module is shared by both hidden layers.
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` of shape (batch, input_size)."""
        hidden = x
        for dense in (self.layer1, self.layer2):
            # Linear -> ReLU -> dropout for each hidden layer, in order.
            hidden = self.dropout(torch.relu(dense(hidden)))
        return torch.sigmoid(self.output(hidden))

In [13]:
# Train PyTorch model
def train_pytorch_model(X_train, y_train, X_test, y_test,
                        epochs=10, batch_size=32, lr=0.001):
    """Train a ``PyTorchNN`` on the training split and print hold-out metrics.

    The model is optimised with Adam against binary cross-entropy, then
    evaluated on the test split; accuracy and a classification report are
    printed. Nothing is returned.

    Args:
        X_train: 2-D training features (DataFrame, ndarray, scipy sparse
            matrix, nested list or tensor).
        y_train: Training labels with values 0/1 (Series, ndarray, list or tensor).
        X_test: 2-D test features, same column layout as ``X_train``.
        y_test: Test labels with values 0/1.
        epochs: Number of passes over the training data (default 10).
        batch_size: Mini-batch size used by the training loader (default 32).
        lr: Adam learning rate (default 0.001).
    """
    def _as_float_tensor(data):
        """Convert pandas / NumPy / sparse / list / tensor input to a float32 tensor."""
        if isinstance(data, torch.Tensor):
            return data.detach().to(dtype=torch.float32)
        if hasattr(data, 'to_numpy'):      # pandas DataFrame / Series
            data = data.to_numpy()
        elif hasattr(data, 'toarray'):     # scipy sparse matrix
            data = data.toarray()
        return torch.tensor(data, dtype=torch.float32)

    input_size = X_train.shape[1]
    model = PyTorchNN(input_size)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Convert data to PyTorch tensors; BCELoss needs targets shaped like the
    # (batch, 1) model output, hence the reshape on the training labels.
    X_train_tensor = _as_float_tensor(X_train)
    y_train_tensor = _as_float_tensor(y_train).reshape(-1, 1)
    X_test_tensor = _as_float_tensor(X_test)
    y_test_tensor = _as_float_tensor(y_test).reshape(-1, 1)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train the model
    model.train()  # enables dropout
    for epoch in range(epochs):
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()  # disables dropout
    with torch.no_grad():
        # Rounding the sigmoid output applies a 0.5 decision threshold.
        y_pred = model(X_test_tensor).round()

    # scikit-learn metrics work on flat array-likes, so hand them 1-D NumPy
    # arrays rather than (n, 1) torch tensors.
    y_true_flat = y_test_tensor.reshape(-1).numpy()
    y_pred_flat = y_pred.reshape(-1).numpy()
    accuracy = accuracy_score(y_true_flat, y_pred_flat)
    print(f'PyTorch Model Accuracy: {accuracy:.4f}')
    print(classification_report(y_true_flat, y_pred_flat))

In [14]:
# Split the raw (unprocessed) features so the preprocessor can be fitted on the
# training rows only. train_test_split returns DataFrames for DataFrame input,
# so the splits need no re-wrapping.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which earlier
# cells bound to the split of the already-preprocessed matrix; re-running those
# cells after this one will see different objects.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the preprocessor on the training split, then apply the fitted transform
# to the test split so no test-set statistics are used.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Convert processed data back to DataFrames
feature_names = preprocessor.get_feature_names_out()
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names)

# Seed PyTorch so weight initialisation, batch shuffling and dropout are
# repeatable from one run of this cell to the next.
torch.manual_seed(42)

# Train PyTorch model
print("\nTraining PyTorch Model...")
train_pytorch_model(X_train_df, y_train, X_test_df, y_test)



Training PyTorch Model...
PyTorch Model Accuracy: 0.8311
              precision    recall  f1-score   support

         0.0       0.84      0.99      0.91     92104
         1.0       0.67      0.12      0.20     20123

    accuracy                           0.83    112227
   macro avg       0.75      0.55      0.55    112227
weighted avg       0.81      0.83      0.78    112227



ModuleNotFoundError: No module named 'tensorflow.keras'