**Creating another model for the dataset**

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
    return df

def calculate_accuracy(TP, TN, FP, FN):
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    return accuracy

def calculate_precision(TP, FP):
    precision = TP / (TP + FP)
    return precision

def calculate_recall(TP, FN):
    recall = TP / (TP + FN)
    return recall

def calculate_f1(precision, recall):
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

def calculate_metrics(y_true, y_pred, y_pred_proba):
    TP = sum((y_true == 1) & (y_pred == 1))
    TN = sum((y_true == 0) & (y_pred == 0))
    FP = sum((y_true == 0) & (y_pred == 1))
    FN = sum((y_true == 1) & (y_pred == 0))
    
    accuracy = calculate_accuracy(TP, TN, FP, FN)
    precision = calculate_precision(TP, FP)
    recall = calculate_recall(TP, FN)
    f1 = calculate_f1(precision, recall)
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'ROC-AUC: {roc_auc:.2f}')
    
    return accuracy, precision, recall, f1, roc_auc

# Load the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Remove outliers for numeric features
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
for feature in numeric_features:
    train_df = remove_outliers_iqr(train_df, feature)

# Splitting dataset into features and target variable
X = train_df.drop(columns=['y'])
y = train_df['y'].map({'yes': 1, 'no': 0})  # Convert 'yes'/'no' to 1/0 for training

# Define preprocessing for numeric and categorical features
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Build preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Add feature engineering steps like Interaction Terms and Principal Component Analysis
feature_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=20))
])

# Encode the datasets
X_encoded = feature_pipeline.fit_transform(X)
X_test_encoded = feature_pipeline.transform(test_df)

# Handling Imbalanced Data with SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_encoded, y)

# Train-Test Split after resampling
X_train, X_valid, y_train, y_valid = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Define the Logistic Regression model
logistic_model = LogisticRegression(random_state=42, max_iter=500)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'penalty': ['l1'], # Regularization terms: Lasso, Ridge, ElasticNet, or none. #'l1', 'l2', , 'none'
    'C': [0.1], # Inverse regularization strength.
    'solver': ['saga'], # Optimization algorithms.
    'max_iter': [100, 200, 300, 500, 1000], # Maximum number of iterations.
    # 'l1_ratio': [0.0, 0.2, 0.5, 0.8, 1.0] # ElasticNet mixing ratio (for 'elasticnet' penalty).
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(logistic_model, param_grid, cv=100, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters found: ", grid_search.best_params_)

# Get the best model from grid search
best_logistic_model = grid_search.best_estimator_

# Predict on validation data
y_pred = best_logistic_model.predict(X_valid)
y_pred_proba = best_logistic_model.predict_proba(X_valid)[:, 1]

# Evaluate the model using custom metric functions
_, _, _, _, _ = calculate_metrics(y_valid, y_pred, y_pred_proba)

# Make predictions on the test data
test_pred_proba = best_logistic_model.predict_proba(X_test_encoded)[:, 1]
test_df['y'] = np.where(test_pred_proba > 0.5, 'yes', 'no')

# Save the updated test dataframe to a new CSV file
test_df.to_csv('5014.csv', index=False)

print("Optimized prediction results have been saved to 5014.csv")


Fitting 100 folds for each of 5 candidates, totalling 500 fits
Best parameters found:  {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 0.83
Precision: 0.81
Recall: 0.86
F1 Score: 0.83
ROC-AUC: 0.89
Optimized prediction results have been saved to 5014.csv
