In [None]:
#Importing required libraries
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
# Load the semicolon-separated training and testing CSV files into dataframes
train_df = pd.read_csv(
    "/kaggle/input/banking-dataset-marketing-targets/train.csv", sep=";"
)
test_df = pd.read_csv(
    "/kaggle/input/banking-dataset-marketing-targets/test.csv", sep=";"
)
# Show the first rows of the training data as the cell output
train_df.head()

# Data Profiling

In [None]:
# Dimensions of the training dataset as a (rows, columns) tuple
train_df.shape

In [None]:
# Column names, non-null counts and data types of the training features
train_df.info()

In [None]:
# Statistical summary of the training dataset; include='all' summarises the
# non-numeric columns as well as the numeric ones
train_df.describe(include ='all')

In [None]:
# Count the missing values in every column of the training data
train_df.isna().sum()

# Analysis

In [None]:
# Number of clients per education level, split by the target column y
ax = sns.countplot(data=train_df, x="education", hue="y")
ax.set_title("Education type vs Count")

In [None]:
# Number of clients per marital status, split by the target column y.
# hue="y" is needed for the bars to show who subscribed; without it the plot
# only shows how many clients fall in each marital status. This also matches
# the other categorical plots in this section.
sns.countplot(x="marital", data=train_df, hue="y")
plt.title("Marital Status vs Count")

* It seems that married people account for the largest number of term-deposit investors

In [None]:
# Number of clients per job type, split by the target column y
ax = sns.countplot(data=train_df, x="job", hue="y")
ax.set_title("Job vs Count")
# Draw the job labels vertically on the x axis
plt.xticks(rotation=90)

* It seems that people working in management jobs account for the largest number of term-deposit subscriptions

In [None]:
# Number of clients with and without a personal loan, split by the target y
ax = sns.countplot(data=train_df, x="loan", hue="y")
ax.set_title("personal loan vs Count")

In [None]:
# Number of clients with and without a housing loan, split by the target y
ax = sns.countplot(data=train_df, x="housing", hue="y")
ax.set_title("housing loan vs Count")

In [None]:
# Number of clients per contact type, split by the target column y
ax = sns.countplot(data=train_df, x="contact", hue="y")
ax.set_title("Contact vs Count")

In [None]:
# Pairwise correlation between the numeric features only.
# train_df also holds string columns (job, marital, education, ...). Recent
# pandas versions raise when DataFrame.corr() meets non-numeric columns
# instead of silently dropping them, so the numeric columns are selected
# explicitly before computing the correlation.
numeric_features = train_df.select_dtypes(include="number")
correlation_matrix = numeric_features.corr()
sns.heatmap(correlation_matrix, annot=True)

* The heatmap shows no strong correlation between the numeric independent variables, so there is no sign of multicollinearity among them

# Feature Encoding

In [None]:
# Stack the training and testing rows into a single frame so that both are
# encoded together in the next step.
# NOTE(review): the combined frame is re-split at random further down, so the
# original train/test partition of the two files is not preserved.
df = pd.concat(objs=[train_df, test_df], ignore_index=True)
df.shape

In [None]:
# Categorical features to one-hot encode; drop_first=True drops the first
# level of each feature from the generated dummy columns
categorical_columns = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'month',
    'loan',
    'contact',
    'poutcome',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head()

In [None]:
# Encode the target column: 'yes' -> 1, 'no' -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['y']: that form mutates an intermediate
# Series, which pandas warns about and which stops updating df once
# Copy-on-Write is enabled.
# astype(int) pins an integer dtype and fails loudly if any label other than
# 'yes'/'no' (or an already encoded 1/0) is present.
df['y'] = df['y'].replace({'yes': 1, 'no': 0}).astype(int)
df.head()

In [None]:
# Separate the target column from the feature columns
target = df['y']
df = df.drop('y', axis=1)
columns = df.columns

# Rescale the features with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later, so the test rows influence the scaling - consider fitting
# on the training split only.
scaler = MinMaxScaler()
df = scaler.fit_transform(df)

# fit_transform returns a plain array, so the column labels are restored here.
# The Index is passed as-is: wrapping it in a list ([columns]) makes pandas
# build MultiIndex (tuple) column labels instead of plain string names.
df = pd.DataFrame(df, columns=columns)
df.head()

In [None]:
# Feature matrix and target vector for modelling
X = df
y = np.array(target)

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=20
)

Predicting whether or not the client will subscribe to a term deposit

In [None]:
# Fit a logistic regression classifier on the training split, then predict
# the labels of the test split
lr_model = LogisticRegression(max_iter=125).fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [None]:
# Confusion matrix of the logistic regression predictions on the test split
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
# Classification report of the logistic regression predictions on the test split
print(classification_report(y_true=y_test, y_pred=y_pred))

The accuracy of the logistic regression model on the test split is 90%.

Let's try a Support Vector Machine.

In [None]:
# Fit a linear-kernel support vector classifier on the training split, then
# predict the labels of the test split
clf = SVC(kernel="linear").fit(X_train, y_train)
y_pred_svc = clf.predict(X_test)

In [None]:
# Classification report of the SVC predictions on the test split
print(classification_report(y_true=y_test, y_pred=y_pred_svc))

In [None]:
# Checking for imbalances in the classes: one [label, count] row for every
# class present in the training split.
# Dedicated names are used here so that the target array `y` built for the
# train/test split is not overwritten; re-running the split cell after this
# one would otherwise split on the class counts instead of the labels.
train_class_counts = np.bincount(y_train)
train_labels = np.nonzero(train_class_counts)[0]
np.vstack((train_labels, train_class_counts[train_labels])).T

The accuracy of the SVC model is 89%. However, for both logistic regression and SVC, the recall for class 1 is very low. This is due to the imbalance between the classes. To overcome this issue, let's apply the SMOTE technique to the training dataset.

# SMOTE for oversampling minority class

In [None]:
# Applying SMOTE on the training data; only the training split is resampled,
# the test split is left unchanged
sm = SMOTE(random_state=12)
# fit_resample is the current imbalanced-learn API; the older fit_sample
# name has been removed from the library
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

In [None]:
# Checking for imbalances in the resampled training data: one [label, count]
# row for every class present after SMOTE.
# Dedicated names are used here so that the target array `y` built for the
# train/test split is not overwritten by the class counts.
smote_class_counts = np.bincount(y_train_smote)
smote_labels = np.nonzero(smote_class_counts)[0]
np.vstack((smote_labels, smote_class_counts[smote_labels])).T

After applying SMOTE, the class distribution is balanced.

In [None]:
# Refit logistic regression on the SMOTE-oversampled training data and
# predict the labels of the untouched test split.
# Note that this rebinds lr_model, replacing the model fitted earlier.
lr_model = LogisticRegression(max_iter=200).fit(X_train_smote, y_train_smote)
y_pred_smote = lr_model.predict(X_test)

In [None]:
# Classification report of the SMOTE-trained logistic regression on the test split
print(classification_report(y_true=y_test, y_pred=y_pred_smote))

The accuracy of the logistic regression model is 84%. Also, the recall value is higher for both classes after applying the SMOTE technique.