In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
df = pd.read_csv("/kaggle/input/customers-dataset/Customers.csv")
df.head(10)

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6
5,6,Female,22,58000,76,Artist,0,2
6,7,Female,35,31000,6,Healthcare,1,3
7,8,Female,23,84000,94,Healthcare,1,3
8,9,Male,64,97000,3,Engineer,0,3
9,10,Female,30,98000,72,Artist,1,4


In [4]:
df.columns

Index(['CustomerID', 'Gender', 'Age', 'Annual Income ($)',
       'Spending Score (1-100)', 'Profession', 'Work Experience',
       'Family Size'],
      dtype='object')

In [5]:
df.describe

<bound method NDFrame.describe of       CustomerID  Gender  Age  Annual Income ($)  Spending Score (1-100)  \
0              1    Male   19              15000                      39   
1              2    Male   21              35000                      81   
2              3  Female   20              86000                       6   
3              4  Female   23              59000                      77   
4              5  Female   31              38000                      40   
...          ...     ...  ...                ...                     ...   
1995        1996  Female   71             184387                      40   
1996        1997  Female   91              73158                      32   
1997        1998    Male   87              90961                      14   
1998        1999    Male   77             182109                       4   
1999        2000    Male   90             110610                      52   

         Profession  Work Experience  Family Size  
0

In [7]:
df.isnull().sum()

CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                35
Work Experience            0
Family Size                0
dtype: int64

In [13]:
null_percentage = df.isnull().sum() / len(df) * 100

In [14]:
null_percentage

CustomerID                0.00
Gender                    0.00
Age                       0.00
Annual Income ($)         0.00
Spending Score (1-100)    0.00
Profession                1.75
Work Experience           0.00
Family Size               0.00
dtype: float64

In [8]:
df.duplicated().sum()

0

In [16]:
df.dropna(subset=['Profession'], inplace=True)


In [17]:
df.isnull().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income ($)         0
Spending Score (1-100)    0
Profession                0
Work Experience           0
Family Size               0
dtype: int64

In [25]:
from sklearn.preprocessing import LabelEncoder

df_encoded = pd.get_dummies(df, columns=['Gender'], drop_first=True)

label_encoder = LabelEncoder()
df_encoded['Profession'] = label_encoder.fit_transform(df_encoded['Profession'])

In [26]:
df_encoded.columns

Index(['CustomerID', 'Age', 'Annual Income ($)', 'Spending Score (1-100)',
       'Profession', 'Work Experience', 'Family Size', 'Gender_Male'],
      dtype='object')

In [42]:
df_encoded.head()

Unnamed: 0,CustomerID,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size,Gender_Male
0,1,19,15000,39,5,1,4,True
1,2,21,35000,81,2,3,3,True
2,3,20,86000,6,2,1,1,False
3,4,23,59000,77,7,0,2,False
4,5,31,38000,40,3,2,6,False


***EDA***

In [35]:
fig_age = px.scatter(df, x='Age', y='Spending Score (1-100)',
                     title='Spending Score vs. Age',
                     labels={'Age': 'Age', 'Spending Score (1-100)': 'Spending Score'},
                     trendline='ols')
fig_age.show()

fig_income = px.scatter(df_encoded, x='Annual Income ($)', y='Spending Score (1-100)',
                       title='Spending Score vs. Annual Income',
                       labels={'Annual Income ($)': 'Annual Income ($)', 'Spending Score (1-100)': 'Spending Score'},
                       trendline='ols')
fig_income.show()

df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})

fig = px.box(df_encoded, x='Gender_Male', y='Spending Score (1-100)',
             title='Spending Score by Gender',
             labels={'Gender': 'Gender', 'Spending Score (1-100)': 'Spending Score'},
             category_orders={'Gender': [0, 1]})

fig.update_xaxes(tickmode='array', tickvals=[0, 1], ticktext=['Female', 'Male'])

fig.show()

In [36]:
fig_familia = px.scatter(df_encoded, x='Family Size', y='Spending Score (1-100)',
                 title='Spending Score vs. Family Size',
                 labels={'Family Size': 'Family Size', 'Spending Score (1-100)': 'Spending Score'},
                 trendline='ols')


fig_familia.show()

In [37]:
fig_work = px.scatter(df_encoded, x='Work Experience', y='Spending Score (1-100)',
                 title='Spending Score vs. Work Experience',
                 labels={'Work Experience': 'Work Experience (years)', 'Spending Score (1-100)': 'Spending Score'},
                 trendline='ols')

fig_work.show()

In [53]:
fig_pro = px.scatter(df_encoded, x='Profession', y='Spending Score (1-100)',
                 title='Spending Score vs. Profession',
                 labels={'Profession': 'Profession', 'Spending Score (1-100)': 'Spending Score'},
                 color='Profession',
                 category_orders={'Profession': df['Profession'].unique()})

fig_pro.show()

In [44]:
df_encoded['High Spending Score'] = (
    (df_encoded['Annual Income ($)'] > 50000) & 
    (df_encoded['Family Size'] < 8) & 
    (df_encoded['Work Experience'] <= 10)
)

fig_final = px.scatter(df_encoded, x='Annual Income ($)', y='Spending Score (1-100)',
                 title='Spending Score vs. Annual Income',
                 labels={'Annual Income ($)': 'Annual Income ($)', 'Spending Score (1-100)': 'Spending Score'},
                 color='High Spending Score',
                 symbol='High Spending Score',
                 color_discrete_map={True: 'green', False: 'red'},
                 symbol_sequence=['circle', 'x'])

fig_final.update_traces(marker=dict(size=12, opacity=0.6, line=dict(width=2, color='DarkSlateGrey')))


fig_final.show()

***ML Modelling***

In [61]:
from sklearn.model_selection import train_test_split

features = df_encoded[['Annual Income ($)', 'Family Size', 'Work Experience']]
X_train, X_test = train_test_split(features, test_size=0.2, random_state=42)


**1. No feature creation.**

In [81]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

kmeans = KMeans(n_clusters=8, random_state=42)
kmeans.fit(X_train_scaled)

test_clusters = kmeans.predict(X_test_scaled)

train_clusters = kmeans.predict(X_train_scaled)
silhouette_avg = silhouette_score(X_train_scaled, train_clusters)
print(f'Silhouette Score: {silhouette_avg}')


Silhouette Score: 0.3147078974578222






**2. Feature creation using Family Size and Work Exp. Results not desirable.**

In [80]:
from sklearn.decomposition import PCA
df_encoded['Family_Work_Prod'] = df_encoded['Family Size'] * df_encoded['Work Experience']
df_encoded['Family_Work_Sum'] = df_encoded['Family Size'] + df_encoded['Work Experience']
df_encoded['Family_Work_Ratio'] = df_encoded['Family Size'] / (df_encoded['Work Experience'] + 1e-5)


features = df_encoded[['Annual Income ($)', 'Family Size', 'Work Experience', 'Family_Work_Prod', 'Family_Work_Sum', 'Family_Work_Ratio']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

kmeans = KMeans(n_clusters=7, random_state=42)
kmeans.fit(X_pca)
clusters = kmeans.predict(X_pca)

silhouette_avg = silhouette_score(X_pca, clusters)
print(f'Silhouette Score: {silhouette_avg}')





Silhouette Score: 0.3188861688479774


**3.Feature Creation using Family Size and Annual Income. Silhoutte score 92%.**

In [82]:
df_encoded['Family_Income_Product'] = df_encoded['Family Size'] * df_encoded['Annual Income ($)']
df_encoded['Family_Income_Ratio'] = df_encoded['Family Size'] / (df_encoded['Annual Income ($)'] + 1e-5) 

scaler = StandardScaler()
df_encoded[['Annual Income ($)', 'Family Size', 'Family_Income_Product', 'Family_Income_Ratio']] = scaler.fit_transform(
    df_encoded[['Annual Income ($)', 'Family Size', 'Family_Income_Product', 'Family_Income_Ratio']]
)

kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(df_encoded)
clusters = kmeans.predict(df_encoded)

silhouette_avg = silhouette_score(df_encoded, clusters)
print(f'Silhouette Score: {silhouette_avg}')





Silhouette Score: 0.9206900797149205


In [84]:
import pickle
with open('/kaggle/working/kmeans_model.pkl', 'wb') as file:
    pickle.dump(kmeans, file)

with open('/kaggle/working/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)