In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report,confusion_matrix
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from prettytable import PrettyTable 
from sklearn.preprocessing import StandardScaler

In [99]:
# Candidate classifiers, keyed by the display name used in the evaluation
# report further down.
#
# Every estimator whose fitting draws random numbers gets a fixed
# random_state so that a Restart & Run All reproduces the same metrics and
# the same submission file. K-nearest neighbours and Gaussian naive Bayes
# expose no random_state, and logistic regression is left at its defaults.
MODEL_SEED = 466  # same value as the seed passed to train_test_split
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=MODEL_SEED),
    'XGBoost': xgb.XGBClassifier(random_state=MODEL_SEED),
    'Decision Tree': DecisionTreeClassifier(random_state=MODEL_SEED),
}

In [100]:
# Load the labelled training data; the path is relative to the working directory.
df = pd.read_csv('train.csv')

In [101]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [102]:
# Class balance of the target: the counts below show the positive class
# (Exited == 1) is the minority, roughly one row in five.
df['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [103]:
# Distinct Geography labels and their frequencies (used to design the encoding below).
df['Geography'].value_counts()

Geography
France     94215
Spain      36213
Germany    34606
Name: count, dtype: int64

In [104]:
def country_encoding(data):
    """Map a Geography label to its integer code.

    "France" -> 1 and "Spain" -> 2. Every other value falls through to 3,
    which is the code used for Germany in this dataset.
    """
    for label, code in (("France", 1), ("Spain", 2)):
        if data == label:
            return code
    return 3

In [105]:
# Replace the Geography labels with their integer codes.
# NOTE: this overwrites the column in place, so the cell is not idempotent:
# running it a second time passes integers to country_encoding, which then
# returns 3 for every row.
df['Geography'] = df['Geography'].apply(country_encoding)

In [106]:
# Distinct Gender labels and their frequencies (used to design the encoding below).
df['Gender'].value_counts()

Gender
Male      93150
Female    71884
Name: count, dtype: int64

In [107]:
def gender(data):
    """Map a Gender label to its integer code.

    "Male" -> 1 and "Female" -> 2. Any other value yields None.
    """
    codes = (("Male", 1), ("Female", 2))
    return next((code for label, code in codes if data == label), None)

In [108]:
# Replace the Gender labels with their integer codes.
# NOTE: this overwrites the column in place, so the cell is not idempotent:
# running it a second time passes integers to gender(), which returns None
# for them and leaves the column empty.
df['Gender'] = df['Gender'].apply(gender)

In [109]:
# Confirm that Geography and Gender now hold integer codes.
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,1,1,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,1,1,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,1,1,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,1,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,2,1,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [110]:
# Columns removed to build the feature matrix: the free-text Surname and the
# target itself.
# NOTE(review): `id` and `CustomerId` are not listed here, so they stay in the
# feature matrix. If they are dropped, the test-set features must be changed
# the same way, because the scaler and models are fitted on these columns.
dropped_cols = ['Surname','Exited']

In [111]:
# Feature matrix (everything except dropped_cols) and the binary target.
X = df.drop(columns=dropped_cols)
y = df['Exited']

In [112]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
# NOTE(review): the split is not stratified even though the target is
# imbalanced — consider passing stratify=y.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=466)

In [113]:
# Fit the scaler on the training split only, so that no statistics from the
# held-out rows leak into preprocessing.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(x_train)

In [114]:
# Apply the already-fitted scaler to the held-out split (transform only, no refit).
X_test_scaled = ss.transform(x_test)

In [115]:
# Train each candidate on the scaled training split and print its hold-out
# performance: confusion matrix, per-class report, ranking metrics and the
# headline accuracy / precision / recall.
rule = "-" * 40
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Hard labels, plus the second predict_proba column as the score for class 1.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Metrics computed from the hard labels.
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, output_dict=True)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Metrics computed from the class-1 scores.
    roc_auc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)

    # Tabulate the per-class rows of the report.
    report_table = PrettyTable()
    report_table.field_names = ['Class', 'Precision', 'Recall', 'F1-Score', 'Support']
    for class_name, metrics in cr.items():
        # Keep only entries keyed by a digit label; this drops the summary
        # entries the report dict also carries.
        if not class_name.isdigit():
            continue
        report_table.add_row(
            [class_name]
            + [metrics[key] for key in ('precision', 'recall', 'f1-score', 'support')]
        )

    print(
        f"Model: {model_name}",
        f"Confusion matrix:\n{cm}",
        rule,
        "Classification report:",
        report_table,
        rule,
        f"ROC AUC: {roc_auc:.2f}",
        f"PR AUC: {pr_auc:.2f}",
        rule,
        f"Accuracy :{accuracy}",
        f"Precision :{precision}",
        f"recall :{recall}",
        rule,
        sep="\n",
    )

Model: Logistic Regression
Confusion matrix:
[[24858  1147]
 [ 4317  2685]]
----------------------------------------
Classification report:
+-------+--------------------+--------------------+--------------------+---------+
| Class |     Precision      |       Recall       |      F1-Score      | Support |
+-------+--------------------+--------------------+--------------------+---------+
|   0   | 0.8520308483290489 | 0.9558930974812536 | 0.9009786154403769 |  26005  |
|   1   | 0.7006784968684759 | 0.3834618680377035 | 0.4956618054273584 |   7002  |
+-------+--------------------+--------------------+--------------------+---------+
----------------------------------------
ROC AUC: 0.81
PR AUC: 0.59
----------------------------------------
Accuracy :0.8344593571060684
Precision :0.7006784968684759
recall :0.3834618680377035
----------------------------------------
Model: Random Forest
Confusion matrix:
[[24690  1315]
 [ 3234  3768]]
----------------------------------------
Classification 

In [125]:
# Pick the gradient-boosting model for the submission.
# NOTE: this is the instance fitted inside the evaluation loop, so that loop
# must have run first; it was trained on the training split only, not refit
# on the full labelled data.
gb = models['Gradient Boosting']

In [116]:
# Load the unlabelled rows to score; the path is relative to the working directory.
df_test = pd.read_csv('test.csv')

In [117]:
# Preview the raw test frame (same columns as the training data, minus Exited).
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [118]:
# Encode Geography with the same mapping used for the training data.
# NOTE: in-place overwrite — re-running this cell maps every row to 3.
df_test['Geography'] = df_test['Geography'].apply(country_encoding)

In [119]:
# Encode Gender with the same mapping used for the training data.
# NOTE: in-place overwrite — re-running this cell maps every row to None.
df_test['Gender'] = df_test['Gender'].apply(gender)

In [120]:
# Confirm that Geography and Gender in the test frame now hold integer codes.
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,1,2,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,1,2,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,1,2,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,1,1,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,3,1,38.0,10,121263.62,1,1.0,0.0,139431.0


In [122]:
# Feature matrix for the test rows: drop Surname, as was done for training
# (the test frame has no Exited column to drop).
# NOTE: this rebinds `X`, which previously held the training features.
X = df_test.drop(columns=['Surname'])

In [123]:
# Scale the test features with the scaler fitted on the training split.
X_scaled = ss.transform(X)

In [127]:
# Score for class 1 (second predict_proba column) on every test row.
# NOTE: this rebinds `y_prob`, which the evaluation loop also assigns.
y_prob = gb.predict_proba(X_scaled)[:,1]

In [128]:
# Sanity check: the number of predictions should equal the number of test rows.
len(y_prob)

110023

In [129]:
# Row count of the test frame, for comparison with len(y_prob) above.
df_test.shape

(110023, 13)

In [131]:
# Assemble the submission: each test id paired with its predicted class-1 score.
data = {'id': df_test['id'],
        'Exited': y_prob}  # scores from gb.predict_proba, not hard 0/1 labels

# NOTE: this rebinds `df`, which previously held the training frame.
df = pd.DataFrame(data)

# Write without the index column. The target file is named
# 'sample_submission.csv', so an existing file of that name in the working
# directory is overwritten.
df.to_csv('sample_submission.csv', index=False)