In [29]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [14]:
bank_data = pd.read_csv("/workspaces/dev/SVM/bank_data.txt",sep=';', quotechar='"')
bank_data.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [15]:
bank_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

DATA CLEANING AND PRE PROCESSING

In [16]:
# Removing dots inbetween words
bank_data["job"] = bank_data["job"].str.replace(r'\.', '', regex=True)
bank_data["education"] = bank_data["education"].str.replace(r'\.', ' ', regex=True)
bank_data["pdays"] = bank_data["pdays"].replace(999,0)

In [17]:
bank_data['y'] = bank_data['y'].map({'yes': 1,'no': 0})

In [19]:
# Encoding Categorical Data Columns - 10 Columns
categorical_columns = ['job','marital','education', 'default', 'housing', 'loan','contact', 'month', 'day_of_week','poutcome']
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
# Fit and transform categorical columns
encoded_categories = encoder.fit_transform(bank_data[categorical_columns])
# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(categorical_columns))

In [20]:
# Create Dataframe for numerical columns
numerical_columns = ['age','duration', 'campaign', 'pdays','previous', 'emp.var.rate', 'cons.price.idx',
                     'cons.conf.idx', 'euribor3m', 'nr.employed']
numerical_df = bank_data[numerical_columns].reset_index(drop=True)

In [7]:
# Combine the Dataframes
combined_df = pd.concat([numerical_df, encoded_df], axis=1)

In [21]:
# Adding the Target column to the combined df
combined_df['y'] = bank_data['y']

TRAINING THE MODEL

In [23]:
x = combined_df.iloc[:,:-1]
y = combined_df.iloc[:,-1]

In [26]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)

In [27]:
# Preprocess the data by scaling features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [30]:
# Hyper Parameter Tuning
# Define the parameter grid
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],    # You can add more kernels
    'C': [0.1, 1, 10],                      # Regularization parameter
    'gamma': ['scale', 'auto']              # For RBF and polynomial kernels
}

# Initialize the SVC
svm = SVC()

# Use GridSearchCV to find the best kernel and parameters
grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Best parameters and kernel
print("Best parameters:", grid_search.best_params_)

Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [31]:
# Training the model
svc_classifier = SVC(kernel='linear', C=0.1, gamma='scale', random_state=42)
svc_classifier.fit(x_train, y_train)

In [32]:
# Make predictions on the test set
y_pred = svc_classifier.predict(x_test)

In [33]:
# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [34]:
# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10935
           1       1.00      1.00      1.00      1422

    accuracy                           1.00     12357
   macro avg       1.00      1.00      1.00     12357
weighted avg       1.00      1.00      1.00     12357

