### IMPORT LIBRARIES


In [336]:
import pandas as pd
import numbers as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
import matplotlib.pyplot as plt

### REMOVE UNNECESSARY COLUMNS


In [337]:
# Load the consumer survey CSV and discard the two columns that are not used
# in this notebook ('Unnamed: 0' is presumably a saved row index -- confirm).
df = pd.read_csv("Consumer_Dataset.csv").drop(columns=['Unnamed: 0', 'Group'])
df

Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable
0,Male,22,No,4.0,Healthcare,No,1.0,Low,Hydro
1,Female,38,Yes,3.0,Engineer,Yes,,Average,Hydro
2,Female,67,Yes,1.0,Engineer,Yes,1.0,Low,Solar
3,Male,67,Yes,2.0,Lawyer,Yes,0.0,High,Solar
4,Female,40,Yes,6.0,Entertainment,Yes,,High,Solar
...,...,...,...,...,...,...,...,...,...
8063,Male,22,No,7.0,,No,0.0,Low,Tidal
8064,Male,35,No,4.0,Executive,No,3.0,Low,Hydro
8065,Female,33,No,1.0,Healthcare,Yes,1.0,Low,Solar
8066,Female,27,No,4.0,Healthcare,Yes,1.0,Low,Solar


In [338]:
# Show each column's dtype and non-null count to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Gender               8068 non-null   object 
 1   Age                  8068 non-null   int64  
 2   Ever_Married         7928 non-null   object 
 3   Family_Size          7733 non-null   float64
 4   Profession           7944 non-null   object 
 5   Graduated            7990 non-null   object 
 6   Work_Experience      7239 non-null   float64
 7   Energy_Consumption   8068 non-null   object 
 8   Preferred_Renewable  7992 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 567.4+ KB


### REMOVE ROWS WITH BLANK CELLS


In [339]:
# Columns that contain missing values; a row is kept only if all of them are
# populated (dropna's default how='any' drops a row on the first missing one).
required_columns = [
    'Ever_Married',
    'Family_Size',
    'Profession',
    'Graduated',
    'Work_Experience',
    'Preferred_Renewable',
]
df = df.dropna(subset=required_columns)

# Independent copy of the cleaned rows, taken before `df` is encoded below,
# so the original category labels remain available.
df_final = df.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6665 entries, 0 to 8067
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Gender               6665 non-null   object 
 1   Age                  6665 non-null   int64  
 2   Ever_Married         6665 non-null   object 
 3   Family_Size          6665 non-null   float64
 4   Profession           6665 non-null   object 
 5   Graduated            6665 non-null   object 
 6   Work_Experience      6665 non-null   float64
 7   Energy_Consumption   6665 non-null   object 
 8   Preferred_Renewable  6665 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 520.7+ KB


### LABEL ENCODING


In [340]:
# Replace the string values of these three columns with integer codes.
# The same encoder object is refit for every column, so once this cell has
# run it only holds the classes of the last column ('Graduated').
label_encoder = LabelEncoder()
for binary_col in ('Gender', 'Ever_Married', 'Graduated'):
    df[binary_col] = label_encoder.fit_transform(df[binary_col])

# Energy consumption has a natural order, so it gets an explicit rank.
mapping = {'high': 2, 'average': 1, 'low': 0}


def _rank_consumption(level):
    """Return the rank for ``level``; values not in ``mapping`` pass through unchanged.

    Matching ignores letter case and surrounding whitespace.
    """
    return mapping.get(level.lower().strip(), level)


df['Energy_Consumption'] = df['Energy_Consumption'].apply(_rank_consumption)
df

Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable
0,1,22,0,4.0,Healthcare,0,1.0,0,Hydro
2,0,67,1,1.0,Engineer,1,1.0,0,Solar
3,1,67,1,2.0,Lawyer,1,0.0,2,Solar
5,1,56,1,2.0,Artist,0,0.0,1,Solar
6,1,32,0,3.0,Healthcare,1,1.0,0,Solar
...,...,...,...,...,...,...,...,...,...
8062,1,41,1,5.0,Artist,1,0.0,2,Solar
8064,1,35,0,4.0,Executive,0,3.0,0,Hydro
8065,0,33,0,1.0,Healthcare,1,1.0,0,Solar
8066,0,27,0,4.0,Healthcare,1,1.0,0,Solar


### ONE HOT ENCODING


In [341]:
onehot_features = ['Preferred_Renewable', 'Profession']
# drop='first' removes one indicator column per feature to avoid the dummy
# variable trap. `sparse_output=False` returns a dense array; it replaces the
# `sparse` argument, which was deprecated in scikit-learn 1.2 and removed in
# 1.4 (where passing it raises a TypeError).
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit and transform the categorical columns
encoded_columns = encoder.fit_transform(df[onehot_features])
onehot_feature_names = encoder.get_feature_names_out(onehot_features)

# Built from a plain array, so encoded_df already carries a fresh 0..n-1 index.
encoded_df = pd.DataFrame(encoded_columns, columns=onehot_feature_names)

# `df` still has the gaps left by dropna in its index; reset it so the two
# frames line up row by row when concatenated side by side.
df_encoded = pd.concat(
    [df.drop(columns=onehot_features).reset_index(drop=True), encoded_df], axis=1)
df_encoded



Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,...,Preferred_Renewable_Tidal,Preferred_Renewable_Wind,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,1,22,0,4.0,0,1.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,67,1,1.0,1,1.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,67,1,2.0,1,0.0,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,56,1,2.0,0,0.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,32,0,3.0,1,1.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6660,1,41,1,5.0,1,0.0,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6661,1,35,0,4.0,0,3.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6662,0,33,0,1.0,1,1.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6663,0,27,0,4.0,1,1.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### SCALING


In [342]:
# Standardise every column of the encoded frame and wrap the resulting array
# back into a DataFrame with the same column names.
scaler = StandardScaler()
features = df_encoded.columns
scaled_values = scaler.fit_transform(df_encoded[features])
df_scaled = pd.DataFrame(scaled_values, columns=features)
df_scaled

Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,...,Preferred_Renewable_Tidal,Preferred_Renewable_Wind,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,0.901454,-1.303415,-1.203938,0.760113,-1.326157,-0.478430,-0.743351,-0.10596,2.617329,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329
1,-1.109319,1.420092,0.830607,-1.207580,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.125902,-0.324228,-0.312219,3.232938,-0.371684,-0.286322,-0.439015,-0.164209,-0.284786,-0.190329
2,0.901454,1.420092,0.830607,-0.551682,0.754059,-0.772106,1.956614,-0.10596,-0.382069,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,-0.439015,-0.164209,3.511410,-0.190329
3,0.901454,0.754346,0.830607,-0.551682,-1.326157,-0.772106,0.606631,-0.10596,-0.382069,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,-0.439015,-0.164209,-0.284786,-0.190329
4,0.901454,-0.698191,-1.203938,0.104215,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6660,0.901454,-0.153490,0.830607,1.416011,0.754059,-0.772106,1.956614,-0.10596,-0.382069,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,-0.439015,-0.164209,-0.284786,-0.190329
6661,0.901454,-0.516624,-1.203938,0.760113,-1.326157,0.108922,-0.743351,-0.10596,2.617329,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,3.492566,-0.439015,-0.164209,-0.284786,-0.190329
6662,-1.109319,-0.637669,-1.203938,-1.207580,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329
6663,-1.109319,-1.000803,-1.203938,0.760113,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.125902,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329


### CLUSTERING


In [343]:
# n_init is pinned to 10, the default this run used (see the warning emitted
# by the original call). Stating it explicitly silences that FutureWarning and
# keeps the clustering unchanged when the library default switches to 'auto'.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=4)

# Fit on the feature columns only. If this cell is re-run, df_scaled already
# holds the 'Cluster' column appended below, and it must not be fed back in
# as a feature; errors='ignore' makes the drop a no-op on the first run.
cluster_features = df_scaled.drop(columns='Cluster', errors='ignore')
cluster_labels = kmeans.fit_predict(cluster_features)
df_scaled['Cluster'] = cluster_labels
df_scaled

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable_Geothermal,Preferred_Renewable_Hydro,Preferred_Renewable_Not Interested,...,Preferred_Renewable_Wind,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Cluster
0,0.901454,-1.303415,-1.203938,0.760113,-1.326157,-0.478430,-0.743351,-0.10596,2.617329,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329,3
1,-1.109319,1.420092,0.830607,-1.207580,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.324228,-0.312219,3.232938,-0.371684,-0.286322,-0.439015,-0.164209,-0.284786,-0.190329,0
2,0.901454,1.420092,0.830607,-0.551682,0.754059,-0.772106,1.956614,-0.10596,-0.382069,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,-0.439015,-0.164209,3.511410,-0.190329,2
3,0.901454,0.754346,0.830607,-0.551682,-1.326157,-0.772106,0.606631,-0.10596,-0.382069,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,-0.439015,-0.164209,-0.284786,-0.190329,0
4,0.901454,-0.698191,-1.203938,0.104215,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6660,0.901454,-0.153490,0.830607,1.416011,0.754059,-0.772106,1.956614,-0.10596,-0.382069,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,-0.439015,-0.164209,-0.284786,-0.190329,0
6661,0.901454,-0.516624,-1.203938,0.760113,-1.326157,0.108922,-0.743351,-0.10596,2.617329,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,3.492566,-0.439015,-0.164209,-0.284786,-0.190329,1
6662,-1.109319,-0.637669,-1.203938,-1.207580,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329,3
6663,-1.109319,-1.000803,-1.203938,0.760113,0.754059,-0.478430,-0.743351,-0.10596,-0.382069,-0.15982,...,-0.324228,-0.312219,-0.309316,-0.371684,-0.286322,2.277825,-0.164209,-0.284786,-0.190329,3


### ADDING OUTPUT COLUMN


In [344]:
# Attach the cluster assignment to the cleaned, un-encoded rows as a letter.
# cluster_labels is positional, so it is paired with df_final's own index
# (which still has the gaps left by dropna) before being mapped and assigned.
mapping = {3: 'D', 2: 'B', 1: 'C', 0: 'A'}
cluster_letters = pd.Series(cluster_labels, index=df_final.index).map(mapping)
df_final['Cluster'] = cluster_letters
df_final

Unnamed: 0,Gender,Age,Ever_Married,Family_Size,Profession,Graduated,Work_Experience,Energy_Consumption,Preferred_Renewable,Cluster
0,Male,22,No,4.0,Healthcare,No,1.0,Low,Hydro,D
2,Female,67,Yes,1.0,Engineer,Yes,1.0,Low,Solar,A
3,Male,67,Yes,2.0,Lawyer,Yes,0.0,High,Solar,B
5,Male,56,Yes,2.0,Artist,No,0.0,Average,Solar,A
6,Male,32,No,3.0,Healthcare,Yes,1.0,Low,Solar,D
...,...,...,...,...,...,...,...,...,...,...
8062,Male,41,Yes,5.0,Artist,Yes,0.0,High,Solar,A
8064,Male,35,No,4.0,Executive,No,3.0,Low,Hydro,C
8065,Female,33,No,1.0,Healthcare,Yes,1.0,Low,Solar,D
8066,Female,27,No,4.0,Healthcare,Yes,1.0,Low,Solar,D
