In [10]:
# Data
import pandas as pd
# ML
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

## Load

In [12]:
# Read the processed customer profile table into a dataframe
pofile_df = pd.read_csv(
    filepath_or_buffer='../../data/processed/profile.csv',
)

## Add features: customer segments (K-means)

In [13]:

# Segment customers with K-means on age, income and one-hot encoded gender,
# then attach the resulting segment id to the profiles as `customer_type`.

# Clustering configuration
n_clusters = 5
random_state = 42

# Numeric columns used as clustering inputs (imputed and scaled below)
numerical_features = ['age', 'income']

# Work on a copy so the loaded dataframe is never modified by this cell
X = pofile_df.copy()

# NOTE(review): in the preview rows, age == 118 only appears together with a
# missing gender and income, which suggests 118 may be a placeholder for
# "unknown age" rather than a real value. It is currently clustered as a
# genuine age -- confirm against the data source before relying on segments.

# One-hot encode gender. dummy_na=True adds an explicit indicator column for
# missing gender; dtype=float keeps the matrix numeric regardless of the
# pandas version (the default dummy dtype changed from uint8 to bool in 2.0).
gender_encoded = pd.get_dummies(
    X['gender'], prefix='gender', dummy_na=True, dtype=float
)

# Feature matrix: numeric columns followed by the gender indicators
X_combined = pd.concat([X[numerical_features], gender_encoded], axis=1)

# Fill missing values with the column mean, then standardise every column so
# that no single feature dominates the Euclidean distances used by K-means
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
X_preprocessed = preprocessor.fit_transform(X_combined)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset makes the fitted segments depend
# on the installed library version (and emits a FutureWarning on 1.2-1.3).
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
cluster_labels = kmeans.fit_predict(X_preprocessed)

# Attach the labels to a fresh copy of the profiles; labels are positional,
# one per input row, in the same order as the loaded dataframe
df_with_clusters = pofile_df.assign(customer_type=cluster_labels)

df_with_clusters.head()

Unnamed: 0,gender,age,profile_id,income,become_member_on_date,customer_type
0,,118,68be06ca386d4c31939f3a4f0e3dd783,,2017-02-12,2
1,F,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2017-07-15,4
2,,118,38fe809add3b4fcf9315a9694bb96ff5,,2018-07-12,2
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2017-05-09,4
4,,118,a03223e636434f42ac4c3df47e8bac43,,2017-08-04,2


## Save

In [14]:
# Persist the profiles together with their customer_type column;
# the dataframe's row index is left out of the written file
df_with_clusters.to_csv(
    path_or_buf="../../data/features/profile.csv",
    index=False,
)