Data Preprocessing 

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Load the dataset
bm = pd.read_csv("/Users/raghav/Downloads/bank-marketing.csv", sep=";")

# Exclude demographic factors
demographic_factors = ['age', 'job', 'marital', 'education']
bm = bm.drop(columns=demographic_factors)
bm_non_demo = bm
# Convert categorical variables to dummy variables
bm_non_demo = pd.get_dummies(bm_non_demo, drop_first=True)

Model Selection:

In [27]:
# Define the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    results[model_name] = {'Accuracy': accuracy, 'ROC AUC Score': roc_auc}

# Display the results
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Accuracy,ROC AUC Score
Logistic Regression,0.911953,0.935932
Gradient Boosting,0.919722,0.948858
Random Forest,0.908068,0.934904


Model Training:

In [28]:
# Split the data into features and target
X = bm_non_demo.drop(columns=['y_yes'])
y = bm_non_demo['y_yes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Gradient Boosting model
model = GradientBoostingClassifier()
model.fit(X_train_scaled, y_train)

Predicting probability of buying a term deposit:

In [29]:
# Predict probabilities for the entire dataset
bm_non_demo_scaled = scaler.transform(bm_non_demo.drop(columns=['y_yes']))
bm['probability_yes'] = model.predict_proba(bm_non_demo_scaled)[:, 1]

# Display the first few rows of the updated dataset
bm.head()

Unnamed: 0,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,probability_yes
0,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0.007029
1,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0.004793
2,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0.006366
3,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0.004831
4,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0.011873


In [30]:
bm.describe

<bound method NDFrame.describe of        default housing loan    contact month day_of_week  duration  campaign  \
0           no      no   no  telephone   may         mon       261         1   
1      unknown      no   no  telephone   may         mon       149         1   
2           no     yes   no  telephone   may         mon       226         1   
3           no      no   no  telephone   may         mon       151         1   
4           no      no  yes  telephone   may         mon       307         1   
...        ...     ...  ...        ...   ...         ...       ...       ...   
41183       no     yes   no   cellular   nov         fri       334         1   
41184       no      no   no   cellular   nov         fri       383         1   
41185       no     yes   no   cellular   nov         fri       189         2   
41186       no      no   no   cellular   nov         fri       442         1   
41187       no     yes   no   cellular   nov         fri       239         3   

     

In [31]:
bm.to_csv("/Users/raghav/Downloads/bank-marketing-final.csv")