In [1]:
# Core libraries
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning modules
import statsmodels.api as sm
from sklearn import linear_model
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the raw marketing-campaign workbook into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
original_df = pd.read_excel('marketing_campaign.xlsx')

In [None]:
# Columns excluded from the working frame: the five per-campaign acceptance
# flags and the two Z_* columns.
columns_to_bedropped = [
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'AcceptedCmp1', 'AcceptedCmp2',
    'Z_CostContact', 'Z_Revenue',
]

# drop() returns a new frame, so original_df keeps the raw data intact.
df = original_df.drop(columns=columns_to_bedropped)

In [None]:
# Preview the first rows of the working frame after the column drop.
df.head()

# Data Cleaning 

In [None]:
# First rows of the frame, shown again as the starting point for the cleaning checks.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that repeat an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Distribution of Income before imputation.
# Missing values are dropped explicitly so the plot does not depend on how the
# plotting backend treats NaN; labels and a title make the figure self-explanatory.
plt.hist(df['Income'].dropna())
plt.xlabel('Income')
plt.ylabel('Count')
plt.title('Distribution of Income')
plt.show()

In [None]:
# Box plot of Income to eyeball its spread and any extreme values.
df['Income'].plot.box()

In [None]:
# Display the Income column itself.
df['Income']

In [None]:
# Pairwise correlation heatmap.
# Only numeric columns are correlated: at this point df still holds text
# columns (Education, Marital_Status), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(20,20))
sns.heatmap(numeric_df.corr(), annot=True)
plt.title('Heatmap displaying the relationship between\nthe features of the data',
         fontsize=13)
plt.show()

In [None]:
# Fill missing incomes with the median of the observed incomes.
income_median = df['Income'].median()
df['Income'] = df['Income'].fillna(income_median)

# Data Preprocessing

In [None]:
# Frequency of each Marital_Status category before regrouping
df['Marital_Status'].value_counts()  

In [None]:
# Collapse Marital_Status into two groups; values not listed here are left as they are.
marital_status_groups = {
    'Married': 'Committed',
    'Together': 'Committed',
    'Divorced': 'Single',
    'Widow': 'Single',
    'Alone': 'Single',
    'YOLO': 'Single',
    'Absurd': 'Single',
}
df['Marital_Status'] = df['Marital_Status'].replace(marital_status_groups)

In [None]:
# Frequency of each Marital_Status category after regrouping
df['Marital_Status'].value_counts()  

In [None]:
# Frequency of each Education category before regrouping
df['Education'].value_counts()

In [None]:
# Reduce the number of Education categories; values not listed here are left as they are.
education_groups = {
    'Master': 'Post Graduate',
    'PhD': 'Post Graduate',
    'Graduation': 'Under Graduate',
    '2n Cycle': 'Under Graduate',
}
df['Education'] = df['Education'].replace(education_groups)

In [None]:
# Frequency of each Education category after regrouping
df['Education'].value_counts()

In [None]:
# Row-wise totals built from groups of related columns.
child_cols = ['Kidhome', 'Teenhome']
spend_cols = ['MntWines', 'MntFruits', 'MntMeatProducts',
              'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
purchase_cols = ['NumWebPurchases', 'NumCatalogPurchases',
                 'NumStorePurchases', 'NumDealsPurchases']

# skipna=False keeps the NaN-propagating behaviour of adding the columns with `+`.
df['Kids'] = df[child_cols].sum(axis=1, skipna=False)
df['Expenses'] = df[spend_cols].sum(axis=1, skipna=False)
df['NumTotalPurchases'] = df[purchase_cols].sum(axis=1, skipna=False)

In [None]:
# Add an "Age" column: years between the birth year and a fixed reference year.
REFERENCE_YEAR = 2023
df['Age'] = REFERENCE_YEAR - df["Year_Birth"]

In [None]:
# Days_engaged: whole days between a customer's enrolment date and a fixed reference date.

# Parse the enrolment date into timestamps.
# NOTE(review): if Dt_Customer is stored as day-first text, the default parsing
# may misread it — confirm the format in the source file.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

# The reference date is materialised as a column; a later cell drops 'first_day' by name.
reference_date = '01-01-2023'
df['first_day'] = reference_date
df['first_day'] = pd.to_datetime(df['first_day'])

engagement_span = df['first_day'] - df['Dt_Customer']
df['Days_engaged'] = engagement_span.dt.days

In [None]:
# Drop columns to reduce the dimensionality and complexity of the model.
col_del = [
    # identifier plus the raw fields behind Age / Days_engaged
    "ID", "Year_Birth", "Dt_Customer", "first_day",
    # web visits and the per-channel purchase counts
    "NumWebVisitsMonth", "NumWebPurchases", "NumCatalogPurchases",
    "NumStorePurchases", "NumDealsPurchases",
    # components of Kids
    "Kidhome", "Teenhome",
    # components of Expenses
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds",
]
df = df.drop(columns=col_del)
df.head()

# EDA

In [None]:
# Share of customers who did / did not respond.

# Row counts per outcome (Response is compared against 1 and 0).
positive_responses = (df['Response'] == 1).sum()
negative_responses = (df['Response'] == 0).sum()

# Slice sizes with their matching labels and colours
responses = [positive_responses, negative_responses]
labels = ['Positive Response', 'Negative Response']
colors = ['green', 'red']

# Draw the pie chart
plt.figure(figsize=(6, 6))
plt.pie(
    responses,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Distribution of Responses')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()


In [None]:
# Education distribution of the customers who responded and of those who did not.

# Split the frame by outcome; later cells reuse these two frames.
positive_responses = df[df['Response'] == 1]
negative_responses = df[df['Response'] == 0]

# Education levels present in the data, used as the shared x-axis
education_levels = df['Education'].unique()

# Counts per level for each group. reindex(..., fill_value=0) aligns both
# series to the same level order and yields 0 for a level that is absent from
# a group, where label indexing would raise KeyError.
education_distribution_positive = (
    positive_responses['Education'].value_counts().reindex(education_levels, fill_value=0)
)
education_distribution_negative = (
    negative_responses['Education'].value_counts().reindex(education_levels, fill_value=0)
)

# Side-by-side bar charts
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Bars carry a label so legend() has something to show instead of emitting
# a "no artists with labels" warning.
ax1.bar(education_levels, education_distribution_positive, color='green', label='Positive Response')
ax1.set_xlabel('Education Level')
ax1.set_ylabel('Count')
ax1.set_title('Education Distribution for Positive Responses')
ax1.legend()

ax2.bar(education_levels, education_distribution_negative, color='red', label='Negative Response')
ax2.set_xlabel('Education Level')
ax2.set_ylabel('Count')
ax2.set_title('Education Distribution for Negative Responses')
ax2.legend()

plt.tight_layout()
plt.show()

In [None]:
# Marital status distribution of the customers who responded and of those who did not.

# Counts per marital status within each group
marital_status_distribution_positive = positive_responses['Marital_Status'].value_counts()
marital_status_distribution_negative = negative_responses['Marital_Status'].value_counts()

# Categories present in the data, used as the shared x-axis
marital_status_categories = df['Marital_Status'].unique()

# One panel per group, drawn side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
marital_panels = [
    (ax1, marital_status_distribution_positive, 'green', 'Positive Response',
     'Marital Status Distribution for Positive Responses'),
    (ax2, marital_status_distribution_negative, 'red', 'Negative Response',
     'Marital Status Distribution for Negative Responses'),
]
for axis, counts, colour, legend_label, panel_title in marital_panels:
    axis.bar(marital_status_categories, counts[marital_status_categories],
             color=colour, label=legend_label)
    axis.set_xlabel('Marital Status')
    axis.set_ylabel('Count')
    axis.set_title(panel_title)
    axis.legend()

plt.tight_layout()
plt.show()

In [None]:
# Number of kids per household, responders vs. non-responders, side by side.
plt.figure(figsize=(14, 6))

kids_panels = [
    (positive_responses, 'green', 'Positive Response',
     'Distribution of Number of Kids (Positive Responses)'),
    (negative_responses, 'red', 'Negative Response',
     'Distribution of Number of Kids (Negative Responses)'),
]
for position, (group, colour, legend_label, panel_title) in enumerate(kids_panels, start=1):
    plt.subplot(1, 2, position)
    # Integer bin edges 0..6, one bin per whole number of kids
    plt.hist(group['Kids'], bins=range(7), alpha=0.5, color=colour, label=legend_label)
    plt.xlabel('Number of Kids')
    plt.ylabel('Count')
    plt.title(panel_title)
    plt.xticks(range(6))
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Total expenses, responders vs. non-responders, side by side.
plt.figure(figsize=(14, 6))

expense_panels = [
    (positive_responses, 'green', 'Positive Response',
     'Distribution of Total Expenses (Positive Response)'),
    (negative_responses, 'red', 'Negative Response',
     'Distribution of Total Expenses (Negative Response)'),
]
for position, (group, colour, legend_label, panel_title) in enumerate(expense_panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(group['Expenses'], bins=20, color=colour, alpha=0.7, label=legend_label)
    plt.xlabel('Total Expenses')
    plt.ylabel('Count')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Total number of purchases, responders vs. non-responders, side by side.
plt.figure(figsize=(14, 6))

purchase_panels = [
    (positive_responses, 'green', 'Positive Response',
     'Distribution of Total Purchases (Positive Response)'),
    (negative_responses, 'red', 'Negative Response',
     'Distribution of Total Purchases (Negative Response)'),
]
for position, (group, colour, legend_label, panel_title) in enumerate(purchase_panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(group['NumTotalPurchases'], bins=20, color=colour, alpha=0.7, label=legend_label)
    plt.xlabel('Total Purchases')
    plt.ylabel('Count')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Age, responders vs. non-responders, side by side.
plt.figure(figsize=(14, 6))

age_panels = [
    (positive_responses, 'green', 'Positive Response',
     'Distribution of Age (Positive Response)'),
    (negative_responses, 'red', 'Negative Response',
     'Distribution of Age (Negative Response)'),
]
for position, (group, colour, legend_label, panel_title) in enumerate(age_panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(group['Age'], bins=20, color=colour, alpha=0.7, label=legend_label)
    plt.xlabel('Age')
    plt.ylabel('Count')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Days engaged with the company, responders vs. non-responders, side by side.
plt.figure(figsize=(14, 6))

engagement_panels = [
    (positive_responses, 'green', 'Positive Response',
     'Distribution of Days Engaged (Positive Response)'),
    (negative_responses, 'red', 'Negative Response',
     'Distribution of Days Engaged (Negative Response)'),
]
for position, (group, colour, legend_label, panel_title) in enumerate(engagement_panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(group['Days_engaged'], bins=20, color=colour, alpha=0.7, label=legend_label)
    plt.xlabel('Number of Days Engaged')
    plt.ylabel('Count')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()


# Label Encoding and Standardization

In [None]:
# Names of the columns whose dtype is "object"
obj = [column for column in df.columns if df[column].dtypes == "object"]

In [None]:
# Label-encode every object-typed column in place.
# Each column gets its own fitted encoder, kept in label_encoders, so the
# integer codes can still be mapped back to the original categories later
# (a single shared encoder would only remember the last column it was fit on).
label_encoders = {}
for i in obj:
    lbl_encode = LabelEncoder()
    df[i] = lbl_encode.fit_transform(df[i])
    label_encoders[i] = lbl_encode

In [None]:
# Standardize every column of df (this includes the Response column, which is
# still part of df here) and wrap the result back into a labelled DataFrame.
# NOTE(review): the later cells visible in this notebook build x and y from df,
# not from scaled_features_df — confirm whether this frame is meant to be used.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.values)
scaled_features_df = pd.DataFrame(
    scaled_features,
    index=df.index,
    columns=df.columns,
)

# Train-Test Dataset Split

In [None]:
# Target vector. Selected as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect a 1-D y and warn about a column vector otherwise.
y = df['Response']

In [None]:
# Feature matrix: every column of df except the Response target.
x = df.drop(columns=['Response'])

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the Response class proportions equal in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)

# Logistic Regression

In [None]:
# Logistic regression on standardized features.
# The scaler lives inside the pipeline, so it is fit on the training split only
# and the same transformation is applied automatically in score()/predict().
# np.ravel hands the estimator a 1-D target whether y_train is a Series or a
# one-column DataFrame.
logisticRegression_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear', random_state=0),
).fit(x_train, np.ravel(y_train))

In [None]:
# Accuracy on the training split and on the held-out split.
# Training accuracy alone says nothing about how the model generalizes.
pd.Series(
    {
        'train': logisticRegression_model.score(x_train, y_train),
        'test': logisticRegression_model.score(x_test, y_test),
    },
    name='accuracy',
)

In [None]:
# Logistic regression using statsmodels, for the coefficient table.
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly; without it the model is forced through the origin.
x_train_const = sm.add_constant(x_train)
model = sm.Logit(y_train, x_train_const)
result = model.fit()

In [None]:
# Print the summary of the fitted statsmodels result.
print(result.summary())

In [None]:
# Confusion matrix for the logistic regression model on the held-out split.
logistic_regression_true_labels = y_test
logistic_regression_predicted_labels = logisticRegression_model.predict(x_test)

# Count matrix: true classes along the rows, predicted classes along the columns
logistic_regression_conf_matrix = confusion_matrix(logistic_regression_true_labels, logistic_regression_predicted_labels)

# Heatmap of the counts; same colormap as the SVM confusion matrix so the two
# figures can be compared directly.
plt.figure(figsize=(8, 6))
sns.heatmap(logistic_regression_conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

# SVM model

In [None]:
# Linear-kernel SVM on standardized features.
# SVMs are sensitive to feature scale, so the scaler is placed in a pipeline:
# it is fit on the training split only and reapplied in score()/predict().
# np.ravel hands the estimator a 1-D target whether y_train is a Series or a
# one-column DataFrame.
svm_model = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
svm_final = svm_model.fit(x_train, np.ravel(y_train))

In [None]:
# Accuracy on the training split and on the held-out split.
# Training accuracy alone says nothing about how the model generalizes.
pd.Series(
    {
        'train': svm_final.score(x_train, y_train),
        'test': svm_final.score(x_test, y_test),
    },
    name='accuracy',
)

In [None]:
# Confusion matrix for the SVM model on the held-out split.
svm_true_labels = y_test
svm_predicted_labels = svm_final.predict(x_test)

# Count matrix: true classes along the rows, predicted classes along the columns
svm_conf_matrix = confusion_matrix(svm_true_labels, svm_predicted_labels)

# Render the counts as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    svm_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Negative', 'Predicted Positive'],
    yticklabels=['Actual Negative', 'Actual Positive'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix for SVM')
plt.show()