In [17]:
import pickle

import numpy as np
import pandas as pd
from fastapi import FastAPI
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the raw CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='D://data_1D.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
0,0,83,retired,divorced,basic.4y,no,no,no,cellular,nov,tue,242.0,1,3,3,success,yes
1,1,32,services,married,high.school,no,no,no,telephone,may,tue,190.0,3,999,0,nonexistent,no
2,2,31,admin.,single,university.degree,no,yes,no,telephone,may,wed,325.0,2,999,0,nonexistent,no
3,3,33,admin.,single,university.degree,no,yes,no,telephone,aug,thu,158.0,1,999,0,nonexistent,no
4,4,39,services,married,high.school,no,yes,no,telephone,jul,mon,158.0,1,999,0,nonexistent,no


In [4]:
# Create the FastAPI application object. No routes or handlers are attached
# to it in the cells visible here.
app = FastAPI()

In [5]:
# Print the per-column summary (dtype and non-null count) to spot missing
# values, then show the (rows, columns) shape as the cell's result.
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8238 entries, 0 to 8237
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   8238 non-null   int64  
 1   age          8238 non-null   int64  
 2   job          8230 non-null   object 
 3   marital      8238 non-null   object 
 4   education    8238 non-null   object 
 5   default      8238 non-null   object 
 6   housing      8238 non-null   object 
 7   loan         8238 non-null   object 
 8   contact      8238 non-null   object 
 9   month        8238 non-null   object 
 10  day_of_week  8238 non-null   object 
 11  duration     8226 non-null   float64
 12  campaign     8238 non-null   int64  
 13  pdays        8238 non-null   int64  
 14  previous     8238 non-null   int64  
 15  poutcome     8238 non-null   object 
 16  y            8238 non-null   object 
dtypes: float64(1), int64(5), object(11)
memory usage: 1.1+ MB


(8238, 17)

In [6]:
# Count the missing entries in every column.
data.isna().sum()

Unnamed: 0      0
age             0
job             8
marital         0
education       0
default         0
housing         0
loan            0
contact         0
month           0
day_of_week     0
duration       12
campaign        0
pdays           0
previous        0
poutcome        0
y               0
dtype: int64

In [8]:
# Impute the few missing values: the mean for the numeric `duration` column
# and the most frequent value for the categorical `job` column.
# NOTE(review): both statistics are computed on the full dataset, i.e. before
# the train/test split, so held-out rows influence the fill values.
filler = data['duration'].mean()
mode_value = data['job'].mode()[0]

# Assign the result back instead of calling
# `data[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, which recent pandas versions warn about and which leaves `data`
# unchanged when Copy-on-Write is enabled.
data = data.fillna({'job': mode_value, 'duration': filler})

# Confirm that no missing values remain.
data.isnull().sum()

Unnamed: 0     0
age            0
job            0
marital        0
education      0
default        0
housing        0
loan           0
contact        0
month          0
day_of_week    0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
dtype: int64

In [9]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

In [10]:
# Drop every index column left behind by saving/reloading the CSV. pandas
# names these 'Unnamed: 0', 'Unnamed: 0.1', ...; any that survive would be
# picked up as numeric features even though they only encode the row position.
index_artifacts = [col for col in data.columns if str(col).startswith('Unnamed:')]
if index_artifacts:
    data = data.drop(columns=index_artifacts)

#Data splitting
# Features are every remaining column except the target `y`.
X = data.drop(['y'], axis=1)
y = data['y']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (6590, 15) (6590,)
Test set: (1648, 15) (1648,)


In [11]:
# Partition the feature columns by dtype so each group can be preprocessed separately.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include=object).columns)

In [12]:
#Data Scaling
# Numeric columns: replace missing values with the column median, then
# standardise with StandardScaler.
# SimpleImputer lives in sklearn.impute and has to be imported before this
# cell runs; without that import the cell raises NameError on a fresh kernel.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [18]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

#Data Encoding
# Categorical columns: one-hot encode. handle_unknown='ignore' lets transform()
# accept categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [19]:
#Preprocessing
# Route each column group through its own transformer: the numeric pipeline
# for numeric_features, the categorical pipeline for categorical_features.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [20]:
#Training model random forest and logistic regression
# Each pipeline chains the same `preprocessor` object with one classifier.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)),
])

# Fit both models on the training split.
rf_pipeline.fit(X_train, y_train)
# Kept as the cell's last expression so the fitted pipeline is also displayed.
lr_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'duration',
                                                   'campaign', 'pdays',
                                                   'previous']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['job', 'marital',
                                             

In [21]:
#Model evaluation
# Predict on the held-out split with both fitted pipelines.
rf_pred, lr_pred = (model.predict(X_test) for model in (rf_pipeline, lr_pipeline))

# Build one text report per model from the true labels and its predictions.
rf_report, lr_report = (
    classification_report(y_test, predictions) for predictions in (rf_pred, lr_pred)
)

for heading, report in (
    ("Random Forest Classification Report:\n", rf_report),
    ("Logistic Regression Classification Report:\n", lr_report),
):
    print(heading, report)

Random Forest Classification Report:
               precision    recall  f1-score   support

          no       0.91      0.98      0.94      1435
         yes       0.72      0.32      0.45       213

    accuracy                           0.90      1648
   macro avg       0.81      0.65      0.69      1648
weighted avg       0.88      0.90      0.88      1648

Logistic Regression Classification Report:
               precision    recall  f1-score   support

          no       0.91      0.98      0.95      1435
         yes       0.75      0.35      0.47       213

    accuracy                           0.90      1648
   macro avg       0.83      0.66      0.71      1648
weighted avg       0.89      0.90      0.88      1648



In [22]:
#Best model
# classification_report() returns a formatted string by default, so comparing
# two reports with ">" is a lexicographic text comparison that says nothing
# about model quality. Compare a numeric metric instead: the macro-averaged
# F1 score, which weights the minority 'yes' class equally with 'no'.
rf_macro_f1 = classification_report(y_test, rf_pred, output_dict=True)['macro avg']['f1-score']
lr_macro_f1 = classification_report(y_test, lr_pred, output_dict=True)['macro avg']['f1-score']

# The random forest is kept only when it is strictly better; ties go to
# logistic regression, matching the original tie-breaking direction.
best_model = rf_pipeline if rf_macro_f1 > lr_macro_f1 else lr_pipeline

#Save pickle
# `preprocessor` is the same object used as the first step of both pipelines,
# so it has already been fitted by the pipeline fits above.
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

# The selected pipeline bundles preprocessing and classifier in one object.
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)