In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, StandardScaler


In [None]:
# Load the dataset, then separate the 'Value' target column from the features.
data = pd.read_csv('./data/data.csv')
y = data['Value']  # Target variable
X = data.drop(columns='Value')  # Features: every column except the target

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every remaining column as numerical.
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill gaps with the column mean, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group through its branch and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_cols),
        ('num', numerical_pipeline, numerical_cols),
    ]
)

In [None]:
# Each pipeline gets its own clone of the preprocessor. Pipeline does not copy
# its steps, so sharing one ColumnTransformer instance would mean fitting one
# pipeline silently refits the preprocessing step used by the other.

# Create a pipeline for Logistic Regression.
# max_iter is raised above scikit-learn's default because the default
# iteration budget was exhausted before the solver converged on this data.
pipeline_logistic = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Create a pipeline for Random Forest.
# The fixed seed makes the forest (and any search built on top of this
# pipeline) reproducible across runs.
pipeline_forest = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [None]:
# Fit both pipelines on the training split only, so the imputers, encoder and
# scaler never see the held-out rows.
pipeline_logistic.fit(X=X_train, y=y_train)
pipeline_forest.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Hyperparameter tuning for Random Forest using Grid Search.
# The 'classifier__' prefix routes each parameter to the pipeline's
# 'classifier' step.
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation on the training split.
grid_search = GridSearchCV(pipeline_forest, param_grid, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
best_params = grid_search.best_params_
print("Best parameters for Random Forest:", best_params)




Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}


In [None]:
# Evaluation metrics for Logistic Regression
y_pred_logistic = pipeline_logistic.predict(X_test)
# Predict probabilities once; column i corresponds to pipeline_logistic.classes_[i].
proba_logistic = pipeline_logistic.predict_proba(X_test)

# Weighted averaging accounts for class imbalance; zero_division=0 scores
# classes that were never predicted as 0 instead of emitting a warning.
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
precision_logistic = precision_score(y_test, y_pred_logistic, average='weighted', zero_division=0)
recall_logistic = recall_score(y_test, y_pred_logistic, average='weighted', zero_division=0)
f1_logistic = f1_score(y_test, y_pred_logistic, average='weighted', zero_division=0)

# Binarized test labels, kept available for any other cell that reads
# `lb` / `y_test_binarized`.
lb = LabelBinarizer()
y_test_binarized = lb.fit_transform(y_test)

# ROC AUC, macro-averaged one-vs-rest.
# The probability columns follow the classes learned from y_train, which need
# not match the set of classes that happen to appear in y_test. Comparing the
# two class counts therefore skipped the metric whenever a class was missing
# from the test split. Instead, score each learned class against its own
# probability column and average over the classes for which AUC is defined
# (the class must have both positive and negative rows in y_test).
class_aucs_logistic = []
for col_idx, class_label in enumerate(pipeline_logistic.classes_):
    is_class = (y_test == class_label).to_numpy()
    if is_class.any() and not is_class.all():
        class_aucs_logistic.append(
            roc_auc_score(is_class.astype(int), proba_logistic[:, col_idx])
        )

if class_aucs_logistic:
    roc_auc_logistic = sum(class_aucs_logistic) / len(class_aucs_logistic)
else:
    print("Warning: ROC AUC is undefined because no class has both positive and negative samples in y_test.")
    roc_auc_logistic = None

print("Logistic Regression - Accuracy:", accuracy_logistic)
print("Logistic Regression - Precision:", precision_logistic)
print("Logistic Regression - Recall:", recall_logistic)
print("Logistic Regression - F1 Score:", f1_logistic)
print("Logistic Regression - ROC AUC:", roc_auc_logistic)

Logistic Regression - Accuracy: 0.4108131119625372
Logistic Regression - Precision: 0.31505340089050643
Logistic Regression - Recall: 0.4108131119625372
Logistic Regression - F1 Score: 0.3175312354782198
Logistic Regression - ROC AUC: None


In [None]:
# Evaluation metrics for Random Forest
# Evaluate the tuned model: GridSearchCV refits the pipeline with the best
# parameters on the full training split and exposes it as best_estimator_.
# `pipeline_forest` itself still holds the untuned default forest, because the
# search works on clones of it.
best_forest = grid_search.best_estimator_

y_pred_forest = best_forest.predict(X_test)
# Predict probabilities once; column i corresponds to best_forest.classes_[i].
proba_forest = best_forest.predict_proba(X_test)

# Weighted averaging accounts for class imbalance; zero_division=0 scores
# classes that were never predicted as 0 instead of emitting a warning.
accuracy_forest = accuracy_score(y_test, y_pred_forest)
precision_forest = precision_score(y_test, y_pred_forest, average='weighted', zero_division=0)
recall_forest = recall_score(y_test, y_pred_forest, average='weighted', zero_division=0)
f1_forest = f1_score(y_test, y_pred_forest, average='weighted', zero_division=0)

# ROC AUC, macro-averaged one-vs-rest.
# The probability columns follow the classes learned from y_train, which need
# not match the set of classes present in y_test, so score each learned class
# against its own probability column and average over the classes for which
# AUC is defined (both positive and negative rows present in y_test).
class_aucs_forest = []
for col_idx, class_label in enumerate(best_forest.classes_):
    is_class = (y_test == class_label).to_numpy()
    if is_class.any() and not is_class.all():
        class_aucs_forest.append(
            roc_auc_score(is_class.astype(int), proba_forest[:, col_idx])
        )

if class_aucs_forest:
    roc_auc_forest = sum(class_aucs_forest) / len(class_aucs_forest)
else:
    print("Warning: ROC AUC is undefined because no class has both positive and negative samples in y_test.")
    roc_auc_forest = None

print("Random Forest - Accuracy:", accuracy_forest)
print("Random Forest - Precision:", precision_forest)
print("Random Forest - Recall:", recall_forest)
print("Random Forest - F1 Score:", f1_forest)
print("Random Forest - ROC AUC:", roc_auc_forest)

Random Forest - Accuracy: 0.6926351638995317
Random Forest - Precision: 0.6402643862049653
Random Forest - Recall: 0.6926351638995317
Random Forest - F1 Score: 0.6346633688279795
Random Forest - ROC AUC: None
