In [2]:
# Importing libraries
import os

import numpy as np
import pandas as pd

# Location of bank-additional-full.csv. The original machine-specific path is
# kept as the default so existing runs behave the same; set the BANK_DATA_PATH
# environment variable to read the file from somewhere else.
DATA_PATH = os.environ.get(
    "BANK_DATA_PATH",
    r"C:\Users\Ulvi Karimli\Desktop\Reproducible research project\bank-additional-full.csv",
)

# Load the CSV file with a semicolon delimiter
df = pd.read_csv(DATA_PATH, delimiter=";")

# Print the number of samples and basic dataframe information
print('Number of samples:', len(df))
df.info()

# A notebook cell only displays its *last* expression automatically, so the
# intermediate results below are printed explicitly; as bare expressions they
# were evaluated and then discarded.

# Print the first few rows of the dataframe
print(df.head())

# Count the number of rows for each output type
print(df.groupby('y').size())

# Create a label for output: 1 where y == 'yes', 0 otherwise
df['OUTPUT_LABEL'] = (df.y == 'yes').astype('int')

# Function to calculate the prevalence of the positive class
def calc_prevalence(y_actual):
    """Return the fraction of positive labels in ``y_actual``.

    Parameters
    ----------
    y_actual : array-like
        Labels encoded as 0/1 (or booleans), e.g. a list or a NumPy array.

    Returns
    -------
    float
        Mean of the labels, i.e. number of positives divided by the number
        of labels.

    Raises
    ------
    ValueError
        If ``y_actual`` contains no labels, since the prevalence is then
        undefined.
    """
    labels = np.asarray(y_actual)
    if labels.size == 0:
        raise ValueError('calc_prevalence requires at least one label')
    # Vectorised mean instead of the builtin sum(), which walks a NumPy array
    # one element at a time in Python.
    return float(labels.mean())

# Report the share of rows labelled positive, formatted to three decimals
print('Prevalence of the positive class: %.3f' % calc_prevalence(df.OUTPUT_LABEL.values))

# Missing values are not handled in this cell: the df.info() output reports the
# full row count as non-null for the columns it lists.
# To double-check every column, run: df.isnull().sum()

# Summary statistics of the numeric columns (shown as the cell's result)
df.describe()



Number of samples: 41188
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.c

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,OUTPUT_LABEL
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911,0.112654
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528,0.316173
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,0.0
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,0.0
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0,0.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1,0.0
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1,1.0


In [3]:
# Importing libraries
import pandas as pd

from sklearn.preprocessing import StandardScaler

# Feature engineering on `df`, which must already exist from the data-loading
# cell (Ramin's section).
# NOTE: this cell rebinds `df` to its transformed version. Re-running it without
# first re-running the loading cell fails, because the categorical columns
# listed in cols_cat have already been replaced by their dummy columns.

# Example of filling missing values with the column mean. df.info() reports no
# missing ages, so on this data the line leaves the column unchanged.
df['age'] = df['age'].fillna(df['age'].mean())

# Handling categorical data using one-hot encoding. drop_first=True drops one
# level per variable. dtype is pinned because the default changed from uint8 to
# bool in pandas 2.0; pinning keeps the 0/1 columns identical across versions.
cols_cat = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
df = pd.get_dummies(df, columns=cols_cat, drop_first=True, dtype='uint8')

# Feature scaling: standardise the listed numeric columns in place.
# 'duration' is not in cols_num, so it keeps its original scale.
# NOTE(review): the scaler is fit on every row, before the train/test split
# performed in the following cell, so test rows contribute to the scaling
# statistics. Confirm whether that is acceptable for this analysis.
scaler = StandardScaler()
cols_num = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'nr.employed', 'euribor3m']
df[cols_num] = scaler.fit_transform(df[cols_num])

# Output the first few rows to verify transformations
df.head()



Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,1.533034,261,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
1,1.628993,149,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
2,-0.290186,226,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
3,-0.002309,151,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
4,1.533034,307,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0


In [4]:
# Importing libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score

# Target is the 0/1 label; predictors are every remaining column once both
# label columns ('y' and 'OUTPUT_LABEL') are removed.
y = df['OUTPUT_LABEL']
X = df.drop(columns=['y', 'OUTPUT_LABEL'])

# Hold out 30% of the rows for testing, then carve 10% of the remaining
# training rows off as a validation set. Both splits use a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Fit a 100-tree random forest (fit() returns the fitted estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score each split with the second predict_proba column, which for 0/1 labels
# is the predicted probability of class 1, and report the ROC AUC.
y_pred_train = model.predict_proba(X_train)[:, 1]
y_pred_valid = model.predict_proba(X_valid)[:, 1]
print('Training AUC: %.3f' % roc_auc_score(y_train, y_pred_train))
print('Validation AUC: %.3f' % roc_auc_score(y_valid, y_pred_valid))



Training AUC: 1.000
Validation AUC: 0.937


In [5]:
# Importing libraries
import pickle
from sklearn.model_selection import GridSearchCV

# Advanced model tuning. Continues from the model-training cell (Salim's
# section), which provides model, X_train, y_train, X_test and y_test.
# The grid holds 3 * 4 * 3 = 36 parameter combinations, each scored by ROC AUC
# with 3-fold cross-validation on the training data.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
cv = GridSearchCV(model, param_grid, scoring='roc_auc', cv=3)
cv.fit(X_train, y_train)

# Selecting the best model and evaluating it on the held-out test set
best_model = cv.best_estimator_
y_pred_test = best_model.predict_proba(X_test)[:, 1]
print('Test AUC: %.3f' % roc_auc_score(y_test, y_pred_test))

# Saving the model for deployment. The context manager guarantees the file is
# flushed and closed even if pickling raises.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Load the model from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# score() is the classifier's default metric (mean accuracy per the
# scikit-learn docs), so this number is not comparable to the AUC above.
result = loaded_model.score(X_test, y_test)
print('Loaded model score:', result)



Test AUC: 0.945
Loaded model score: 0.9155943999352594
