<a href="https://colab.research.google.com/github/srinijalanda93/Predictive_Analystics/blob/main/2448526_lab3_PA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Bank Marketing" dataset (repository id 222)
bank_marketing = fetch_ucirepo(id=222)

# Features and target are exposed as separate pandas DataFrames
dataset_frames = bank_marketing.data
X, y = dataset_frames.features, dataset_frames.targets

# Print the dataset metadata first, then the variable information
for section in (bank_marketing.metadata, bank_marketing.variables):
    print(section)


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [4]:
# Show the names of the feature columns
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day_of_week', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome'],
      dtype='object')

In [5]:
# Show the name of the target column
y.columns

Index(['y'], dtype='object')

In [None]:
# Display the target DataFrame (the notebook renders only the first and last rows)
y

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no
...,...
45206,yes
45207,yes
45208,yes
45209,no


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

In [7]:
# Quick sanity check of what was loaded: shapes, a preview and the class balance
print("\n Dataset loaded successfully!")
for label, frame in (("Feature shape:", X), ("Target shape:", y)):
    print(label, frame.shape)

preview_rows = X.head()
print("\nFirst few rows of X:\n", preview_rows)

target_counts = y.value_counts()
print("\nUnique target values:\n", target_counts)


 Dataset loaded successfully!
Feature shape: (45211, 16)
Target shape: (45211, 1)

First few rows of X:
    age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married        NaN      no     1506     yes   no   
4   33           NaN   single        NaN      no        1      no   no   

  contact  day_of_week month  duration  campaign  pdays  previous poutcome  
0     NaN            5   may       261         1     -1         0      NaN  
1     NaN            5   may       151         1     -1         0      NaN  
2     NaN            5   may        76         1     -1         0      NaN  
3     NaN            5   may        92         1     -1         0      NaN  
4     NaN            5   may       198         1     -1         

In [8]:
# Inspect the dtype of every feature; `object` columns are the ones encoded in Step 1
X.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
default,object
balance,int64
housing,object
loan,object
contact,object
day_of_week,int64


# Step 1: Handle Categorical Variables

Some features are categorical, so they need to be encoded as numbers before modelling.

In [9]:

# Work on a copy so the raw feature frame X stays untouched.
X_encoded = X.copy()

# Keep one fitted encoder per categorical column. Re-fitting a single shared
# encoder would throw away each column's label -> code mapping, making the
# integer codes impossible to translate back (feature_encoders[col].inverse_transform).
feature_encoders = {}

for col in X_encoded.select_dtypes(include=['object']).columns:
    # astype(str) turns missing values into the literal string 'nan', so a
    # missing entry receives its own integer code.
    encoder = LabelEncoder()
    X_encoded[col] = encoder.fit_transform(X_encoded[col].astype(str))
    feature_encoders[col] = encoder

# Encode target variable (yes/no → 1/0) with a dedicated encoder.
# `le` remains bound to the target encoder, exactly as it was once the
# original version of this cell had finished running.
le = LabelEncoder()
y_encoded = le.fit_transform(y.values.ravel())

# The binary metrics computed later use label 1 as the positive class by
# default; fail loudly if the alphabetical encoding stops mapping 'yes' to 1.
assert list(le.classes_) == ['no', 'yes'], f"unexpected target classes: {list(le.classes_)}"

print("\nCategorical encoding completed!")
print("Encoded target sample:", y_encoded[:10])


Categorical encoding completed!
Encoded target sample: [0 0 0 0 0 0 0 0 0 0]


# Step 2: Train-Test Split

In [10]:

# Hold out 20% of the rows for testing. Stratifying on the encoded target keeps
# the class ratio the same in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print("\nData split completed:")
for label, part in (("Training samples:", X_train), ("Testing samples:", X_test)):
    print(label, part.shape[0])



Data split completed:
Training samples: 36168
Testing samples: 9043


# Step 3: Feature Scaling

In [11]:
# Learn the scaling statistics from the training partition only, then apply the
# same fitted transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Model 1 - Logistic Regression

In [12]:

# Fit the logistic-regression model on the training partition
lr = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)

# Hard class predictions, plus the probability of class 1 (second column)
y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:, 1]

# Step 5: Model 2 - Support Vector Machine (SVM)

In [13]:

# Fit an RBF-kernel SVM; probability=True is what makes predict_proba available
svm = SVC(kernel='rbf', probability=True, random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the probability of class 1 (second column)
y_pred_svm = svm.predict(X_test)
y_prob_svm = svm.predict_proba(X_test)[:, 1]


# Step 6: Evaluation Function

In [14]:

def evaluate_model(y_true, y_pred, y_prob):
    """Compute the comparison metrics for one classifier.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted class labels, used for accuracy, precision, recall and F1.
    y_prob : scores for the positive class, used only for ROC-AUC.

    Returns
    -------
    dict mapping "Accuracy", "Precision", "Recall", "F1-Score" and "ROC-AUC"
    (in that order) to the corresponding score.
    """
    # Metrics that compare hard predictions against the true labels
    label_metrics = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1-Score": f1_score,
    }
    scores = {name: metric(y_true, y_pred) for name, metric in label_metrics.items()}

    # ROC-AUC is the only metric computed from scores rather than labels
    scores["ROC-AUC"] = roc_auc_score(y_true, y_prob)
    return scores

In [16]:
# ---------------------------------------------------------
# Step 7: Evaluate Models
# ---------------------------------------------------------
# Score both classifiers against the same held-out test labels
lr_results = evaluate_model(y_true=y_test, y_pred=y_pred_lr, y_prob=y_prob_lr)
svm_results = evaluate_model(y_true=y_test, y_pred=y_pred_svm, y_prob=y_prob_svm)

# One row per model, one column per metric
row_labels = ["Logistic Regression", "SVM"]
results = pd.DataFrame(data=[lr_results, svm_results], index=row_labels)

# sep="\n" reproduces the blank line between the heading and the table
print("\n Model Comparison:\n", results, sep="\n")



 Model Comparison:

                     Accuracy  Precision    Recall  F1-Score   ROC-AUC
Logistic Regression  0.897932   0.643923  0.285444  0.395547  0.873396
SVM                  0.897490   0.647191  0.272212  0.383234  0.850110


In [18]:
# ---------------------------------------------------------
# Step 8: Detailed Reports
# ---------------------------------------------------------
# Each heading is paired with the predictions it describes
report_jobs = (
    ("\n Logistic Regression Report:\n", y_pred_lr),
    ("\nSVM Report:\n", y_pred_svm),
)
for heading, predictions in report_jobs:
    print(heading, classification_report(y_test, predictions))

matrix_jobs = (
    ("\n Confusion Matrix - Logistic Regression:\n", y_pred_lr),
    ("\n Confusion Matrix - SVM:\n", y_pred_svm),
)
for heading, predictions in matrix_jobs:
    print(heading, confusion_matrix(y_test, predictions))

# ---------------------------------------------------------
# Step 9: Identify the Better Model
# ---------------------------------------------------------
if lr_results["F1-Score"] > svm_results["F1-Score"]:
    better_model = "Logistic Regression"
else:
    # Also taken when the two F1 scores are equal
    better_model = "SVM"
print(f"\n The better performing model based on F1-Score is: {better_model}")


 Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      7985
           1       0.64      0.29      0.40      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.63      0.67      9043
weighted avg       0.88      0.90      0.88      9043


SVM Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      7985
           1       0.65      0.27      0.38      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.63      0.66      9043
weighted avg       0.88      0.90      0.88      9043


 Confusion Matrix - Logistic Regression:
 [[7818  167]
 [ 756  302]]

 Confusion Matrix - SVM:
 [[7828  157]
 [ 770  288]]

 The better performing model based on F1-Score is: Logistic Regression
