Kaggle Dataset: Customer Segmentation

https://www.kaggle.com/datasets/vishakhdapat/customer-segmentation-clustering/data

Use the "Response" column as the label and train a Logistic Regression model to predict the outcome of the marketing campaign.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [4]:
# Read the pre-processed dataset from disk and report its (rows, columns) shape
processed_csv = "data/processed_data.csv"
data = pd.read_csv(processed_csv)

print(data.shape)

(2213, 34)


In [5]:
# Separate the model inputs (X) from the prediction target (y).
# Columns of `data` that are not listed here are left out of the feature matrix.
feature_columns = [
    'Age_Group', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
    'Customer_Tenure_Days', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'Complain',
]
X = data[feature_columns]

# "Response" is the label the classifier is trained to predict
y = data['Response']

In [6]:
# One-hot encode the categorical columns.
# drop_first=True omits the first level of every encoded column, so a feature
# with k levels becomes k-1 indicator columns.
categorical_features = [
    'Age_Group', 'Education', 'Marital_Status',
    'Kidhome', 'Teenhome',
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'Complain',
]
X = pd.get_dummies(
    X,
    columns=categorical_features,
    drop_first=True,
)

# Show the widened shape, then preview the first rows of the encoded frame
print(X.shape)
X.head()

(2213, 38)


Unnamed: 0,Income,Customer_Tenure_Days,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,...,Kidhome_1,Kidhome_2,Teenhome_1,Teenhome_2,AcceptedCmp1_1,AcceptedCmp2_1,AcceptedCmp3_1,AcceptedCmp4_1,AcceptedCmp5_1,Complain_1
0,58138.0,4632,58,635,88,546,172,88,88,3,...,False,False,False,False,False,False,False,False,False,False
1,46344.0,4082,38,11,1,6,2,1,6,2,...,True,False,True,False,False,False,False,False,False,False
2,71613.0,4281,26,426,49,127,111,21,42,1,...,False,False,False,False,False,False,False,False,False,False
3,26646.0,4108,26,11,4,20,10,3,5,2,...,True,False,False,False,False,False,False,False,False,False
4,58293.0,4130,94,173,43,118,46,27,15,5,...,True,False,False,False,False,False,False,False,False,False


In [7]:
# Hold out 10% of the rows for testing; the remaining 90% are used for training.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=77,
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Standardize the numerical features with StandardScaler.
# The one-hot indicator columns are not listed below and are left untouched.
scaler = StandardScaler()

numerical_features = [
    'Income', 'Customer_Tenure_Days', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
]

# X_train / X_test come out of train_test_split as row subsets of X. Take
# explicit copies before assigning columns so the writes target independent
# frames (this is the usual remedy for pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse the fitted statistics for
# the test rows, so nothing about the test set influences the transformation.
# The raw values are read from X by row label rather than from X_train/X_test
# themselves: X is never modified here, so re-running this cell yields the same
# result instead of scaling already-scaled values a second time.
X_train[numerical_features] = scaler.fit_transform(X.loc[X_train.index, numerical_features])
X_test[numerical_features] = scaler.transform(X.loc[X_test.index, numerical_features])

X_train.head(5)

Unnamed: 0,Income,Customer_Tenure_Days,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,...,Kidhome_1,Kidhome_2,Teenhome_1,Teenhome_2,AcceptedCmp1_1,AcceptedCmp2_1,AcceptedCmp3_1,AcceptedCmp4_1,AcceptedCmp5_1,Complain_1
1902,1.523898,1.302501,1.201733,3.028223,0.018367,-0.211308,0.615491,2.00159,-0.32065,-1.215036,...,False,False,False,False,True,True,False,True,True,False
622,0.430963,1.411784,0.132877,1.964116,1.071045,0.499794,0.304405,0.351856,0.493383,1.397423,...,False,False,True,False,False,False,False,False,False,False
341,0.638523,1.039228,-1.487646,0.766624,0.945726,2.748654,0.853381,2.494048,0.086366,-0.692545,...,False,False,False,False,False,False,False,False,False,False
1832,0.782168,1.257794,0.44319,0.058207,0.670025,3.255314,-0.061579,0.327233,0.183275,-0.692545,...,False,False,False,False,False,False,False,False,False,False
1792,1.07649,-1.757427,0.787983,0.799229,-0.457845,0.290908,-0.262871,0.868937,-0.514468,-0.692545,...,False,False,False,False,True,False,False,False,False,False


In [9]:
# Train a scikit-learn LogisticRegression classifier (default settings)
# on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the held-out rows and report the fraction predicted correctly.
y_pred = model.predict(X_test)
test_accuracy = np.mean(y_pred == y_test)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 90.99%
