## Prodigy Infotech

### Author : Rudra Patel


### Data Science
### Task-03

#### Task: Build a decision tree classifier to predict whether a customer will purchase a product or service based on their demographic and behavioral data. Use a dataset such as the Bank Marketing dataset from the UCI Machine Learning Repository.

### Dataset link: https://archive.ics.uci.edu/dataset/222/bank+marketing

In [1]:
#importing the necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import warnings
# Notebook-wide settings: silence library warnings and let pandas show up to
# 100 columns when a frame is displayed.
warnings.filterwarnings(action="ignore")
pd.set_option("display.max_columns", 100)

In [None]:
# Load the labelled training split and preview the first few rows.
# NOTE: this must be train.csv -- the test split has no 'subscribed' target,
# which every later cell relies on.
train = pd.read_csv("bank_data/train.csv")
train.head()

In [None]:
# Read the test split from disk and preview its leading rows.
test = pd.read_csv(filepath_or_buffer="bank_data/test.csv")
test.head(n=5)

In [None]:
# Report how many rows and columns the training dataset has.
print(
    "The total rows in the training dataset is:",
    len(train.index),
    "\nThe total columns in the training dataset is:",
    len(train.columns),
)

In [None]:
# Report how many rows and columns the test dataset has.
print(
    "The total rows in the test dataset is:",
    len(test.index),
    "\nThe total columns in the test dataset is:",
    len(test.columns),
)

In [None]:
#Checking the missing column in the test dataset.
def check_column_similarity(data1, data2):
    """Print whether two DataFrames have the same columns, and which ones differ.

    The comparison is done on column *names*, not only on the number of
    columns, so two frames with equally many but differently named columns
    are no longer reported as identical.

    Args:
        data1: First DataFrame (e.g. the training data).
        data2: Second DataFrame (e.g. the test data).

    Prints:
        A confirmation when the columns match; otherwise a line stating
        whether the column count or only the names differ, followed by the
        set of column names present in exactly one of the two frames.
    """
    cols1, cols2 = set(data1.columns), set(data2.columns)
    same_length = len(data1.columns) == len(data2.columns)

    if same_length and cols1 == cols2:
        print('Both train and test has same columns')
        return

    if same_length:
        print("Column names are different.")
    else:
        print("Column length is different.")
    # Symmetric difference: columns found in only one frame. When one frame is
    # a subset of the other this equals "longer minus shorter".
    print(cols1 ^ cols2)


In [None]:
# Compare the column sets of the training and test frames.
check_column_similarity(data1=train, data2=test)

In [None]:
# Summarise each column's dtype and non-null count.
train.info(verbose=None)

In [None]:
# Count how many columns share each dtype.
train.dtypes.value_counts(sort=True)

In [None]:
# Number of missing entries per column.
train.isna().sum(axis=0)

In [None]:
# Summary statistics for every column, numeric and categorical alike.
train.describe(include="all")

In [None]:
# Bar chart of how often each 'subscribed' class occurs.
sns.countplot(x="subscribed", data=train)

In [None]:
# Normalized frequency table of the 'subscribed' target: the share of each
# class rather than the raw counts, which exposes the class imbalance directly.
train['subscribed'].value_counts(normalize=True)

In [None]:
# Keep only the object-dtype (non-numeric) columns for the categorical analysis.
print("The non-numerical columns are: ")
data_non_numerical = train.select_dtypes(include=object)
data_non_numerical

In [None]:
# Create a function to calculate Cramer's V statistic
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two categorical series.

    Args:
        x: First categorical variable (anything accepted by ``pd.crosstab``).
        y: Second categorical variable, aligned with ``x``.

    Returns:
        A float in [0, 1]; 0 means no association, 1 a perfect one. Returns
        0.0 for degenerate inputs (a variable with a single category, or a
        single observation), where the statistic is undefined and the
        uncorrected formula would divide by zero and yield NaN.
    """
    # Named ``contingency`` so the sklearn ``confusion_matrix`` import is not shadowed.
    contingency = pd.crosstab(x, y)
    chi2, _, _, _ = chi2_contingency(contingency)
    n = contingency.sum().sum()
    r, k = contingency.shape

    # The bias correction divides by (n - 1); a single observation carries no
    # information about association.
    if n <= 1:
        return 0.0

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # A constant variable gives a 1-row or 1-column table and a zero denominator.
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

# Fill a square table with Cramer's V for every pair of categorical columns.
columns = data_non_numerical.columns
cramer_matrix = pd.DataFrame(index=columns, columns=columns)
for col1 in columns:
    for col2 in columns:
        association = cramers_v(data_non_numerical[col1], data_non_numerical[col2])
        cramer_matrix.loc[col1, col2] = association

# Draw the table as an annotated heatmap (cast to float: the cells are objects).
plt.figure(figsize=(12, 8))
sns.heatmap(
    cramer_matrix.astype(float),
    annot=True,
    cmap="PiYG",
    linewidth=0.5,
)

plt.title("Cramer's V Heatmap for Non-Numeric Variables")
plt.show()

In [None]:
# One count plot per categorical column, split by the 'subscribed' outcome.
for feature in data_non_numerical.columns:
    plt.figure(figsize=(5, 5))
    sns.countplot(
        data=data_non_numerical,
        x=feature,
        hue="subscribed",
        palette='pink',
        edgecolor="black",
    )
    plt.title(f'Bar Plot of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Count')
    # Category labels can be long; rotate them so they do not overlap.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on ``train['subscribed']``: under pandas
# Copy-on-Write that chained in-place call modifies a temporary and leaves
# ``train`` unchanged. ``astype(int)`` guarantees a numeric dtype, so the column
# is picked up by the numerical analysis below, and fails loudly if an
# unexpected label is present.
train['subscribed'] = train['subscribed'].replace({'no': 0, 'yes': 1}).astype(int)
train['subscribed']

In [None]:
# Keep only the numeric columns for the correlation analysis.
print("The Numerical columns are: ")
data_numerical = train.select_dtypes(include=np.number)
data_numerical

In [None]:
# Pairwise Pearson correlation between the numeric columns.
correlation_matrix = data_numerical.corr(method="pearson")
correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap=sns.cubehelix_palette(as_cmap=True),
    linewidth=0.5,
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by the target;
# only the lower triangle is drawn, with histograms on the diagonal.
sns.pairplot(
    data=data_numerical,
    hue="subscribed",
    diag_kind="hist",
    corner=True,
)

In [None]:
# Histogram of every numerical feature to inspect its distribution.
# The histograms are drawn from the data itself: the correlation matrix only
# holds pairwise coefficients, so its histogram says nothing about the features.
data_numerical.hist(figsize=(10, 10), color='blue', grid=True)
plt.show()

In [None]:
# Separate the label from the features and discard the row identifier.
target = train['subscribed']
train = train.drop(columns=['subscribed', "ID"])

In [None]:
# One-hot encode every object-dtype column in a single pass; the dummy
# columns are appended after the untouched numeric columns, in column order.
categorical_columns = train.select_dtypes(include='object').columns.tolist()
train = pd.get_dummies(train, columns=categorical_columns, dtype='int')
train.head()

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    random_state=12,
    test_size=0.2,
)

In [None]:
# Baseline model: logistic regression.
# The features are unscaled, so the default 100 lbfgs iterations are unlikely
# to converge, and the ConvergenceWarning is hidden by the global warnings
# filter. A larger iteration budget lets the solver finish.
lreg = LogisticRegression(max_iter=1000)
# Fit the model on the training split.
lreg.fit(X_train, y_train)

In [None]:
# Predict on the validation split with the logistic regression model ...
pred = lreg.predict(X_val)
# ... and measure the fraction of correct predictions.
accuracy_score(y_true=y_val, y_pred=pred)

In [None]:
# Shallow decision tree (at most 4 levels), seeded for repeatable results.
clf = DecisionTreeClassifier(random_state=42, max_depth=4)
# Train it on the training split.
clf.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes of the decision tree for the validation split.
predict = clf.predict(X=X_val)
predict

In [None]:
# Validation accuracy of the decision tree.
accuracy_score(y_true=y_val, y_pred=predict)

In [None]:
# Unconstrained decision tree used for the detailed evaluation below.
# It is seeded like the depth-limited tree above, so the report, confusion
# matrix and feature importances are the same on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_val)

In [None]:
# Per-class precision, recall and F1 on the validation split.
print(classification_report(y_true=y_val, y_pred=y_predict))

In [None]:
# Raw confusion matrix, rows and columns ordered like the classifier's classes.
print(confusion_matrix(y_true=y_val, y_pred=y_predict, labels=clf.classes_))

In [None]:
# Compute the confusion matrix and draw it with labelled axes.
cm = confusion_matrix(y_true=y_val, y_pred=y_predict, labels=clf.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_).plot()

In [None]:
# Rank the features by the tree's importance scores and plot the top 20.
fi = pd.DataFrame(
    {
        "feature": list(X_train.columns),
        "importance": clf.feature_importances_,
    }
).sort_values(by="importance", ascending=False)
sns.barplot(data=fi.head(20), x="importance", y="feature")
plt.title("Top 20 Features")