# Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training split from a path relative to the notebook's working directory.
df = pd.read_csv('../input/customer-segmentation/Train.csv')
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Count the missing entries of every column and print the per-column totals.

missing_per_column = df.isna().sum()
print('Mising Value Count in Each Category:')
print(missing_per_column)

In [None]:
# Draw the boolean null mask as a heatmap (one row per record, one column per
# feature) so gaps in the data are visible at a glance; no colour bar is drawn.
fig, ax = plt.subplots(figsize=(10, 5))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, ax=ax)
ax.set_xlabel('Feature')
ax.set_ylabel('Index')
ax.set_title('Missing Value Visualization')

plt.show()

In [None]:
# Visualize Target Label Distribution

plt.subplots(figsize=(10,5))
# Pass the labels through `x=`: newer seaborn releases bind the first
# positional argument to `data`, so a bare positional Series is no longer
# read as the variable to count. Sorting first makes the bars appear in
# label order.
sns.countplot(x=df['Segmentation'].sort_values())
plt.xlabel('Target Label')
plt.ylabel('Count')
plt.title('Target Label Comparison')

plt.show()

# Data Preparation

## Exploratory Data Analysis 

In [None]:
# Customer Type Based On Gender
# Rows: Gender values, columns: Segmentation values, cells: number of
# non-zero `ID` entries falling in that (gender, segment) pair.

gender_based = df.pivot_table(
    values='ID',
    index=['Gender'],
    columns=['Segmentation'],
    aggfunc=np.count_nonzero,
)

In [None]:
# Display the Gender x Segmentation table.
gender_based

In [None]:
# Grouped bar chart: one cluster per segment, one bar per gender.
x = np.arange(len(gender_based.columns))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 5))
for position, (gender, counts) in enumerate(gender_based.iterrows()):
    # Shift every row's bars by one bar width so they sit side by side.
    ax.bar(x + (position * width), counts, width=width, label=gender)

# Place each tick under the middle of its cluster.
cluster_center = (len(gender_based) / 2 - 0.5) * width
ax.set_xticks(x + cluster_center)
ax.set_xticklabels(gender_based.columns)

ax.set_title('Consumer Type Based On Gender')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Count')

ax.legend(loc='best', frameon=True)
plt.show()

In [None]:
# Customer Type Based On Ever Married or not
# Rows: Ever_Married values, columns: Segmentation values, cells: number of
# non-zero `ID` entries in that combination.

married_based = df.pivot_table(
    values='ID',
    index=['Ever_Married'],
    columns=['Segmentation'],
    aggfunc=np.count_nonzero,
)

In [None]:
# Display the Ever_Married x Segmentation table.
married_based

In [None]:
# Grouped bar chart: one cluster per segment, one bar per Ever_Married value.
x = np.arange(len(married_based.columns))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 5))
for offset, status in enumerate(married_based.index):
    # Each row is drawn one bar width to the right of the previous one.
    ax.bar(x + (offset * width), married_based.loc[status], width=width, label=status)

# Centre the tick of every cluster under its bars.
tick_shift = (len(married_based) / 2 - 0.5) * width
ax.set_xticks(x + tick_shift)
ax.set_xticklabels(married_based.columns)

ax.set_title('Consumer Type Based On Ever Married?')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Count')

ax.legend(loc='best', frameon=True)
plt.show()

In [None]:
# Customer Type Based On Age
# Mean age per segment, kept as a one-column frame named 'Age'.

age_based = df.groupby('Segmentation')['Age'].mean().to_frame()

In [None]:
# Display the per-segment mean age.
age_based

In [None]:
# One bar per segment, bar height = mean age of that segment.
fig, ax = plt.subplots(figsize=(10, 5))
segments = age_based.index
mean_age = age_based['Age']
ax.bar(segments, mean_age, width=0.5)

ax.set_title('Consumer Type Based On Age')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Age Average')
plt.show()

In [None]:
# Customer Type Based On Have Been Graduated or not
# Rows: Graduated values, columns: Segmentation values, cells: number of
# non-zero `ID` entries in that combination.

grad_based = df.pivot_table(
    values='ID',
    index=['Graduated'],
    columns=['Segmentation'],
    aggfunc=np.count_nonzero,
)

In [None]:
# Display the Graduated x Segmentation table.
grad_based

In [None]:
# Grouped bar chart: one cluster per segment, one bar per Graduated value.
x = np.arange(len(grad_based.columns))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 5))
for row_no, row_label in enumerate(grad_based.index):
    # Row `row_no` is shifted right by `row_no` bar widths inside each cluster.
    ax.bar(x + (row_no * width), grad_based.iloc[row_no], width=width, label=row_label)

# Ticks go under the centre of each cluster.
centre = (len(grad_based) / 2 - 0.5) * width
ax.set_xticks(x + centre)
ax.set_xticklabels(grad_based.columns)

ax.set_title('Consumer Type Based On Have Graduated?')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Count')

ax.legend(loc='best', frameon=True)
plt.show()

In [None]:
# Customer Type Based On Profession
# Rows: Profession values, columns: Segmentation values, cells: number of
# non-zero `ID` entries in that combination.

prof_based = df.pivot_table(
    values='ID',
    index=['Profession'],
    columns=['Segmentation'],
    aggfunc=np.count_nonzero,
)

In [None]:
# Display the Profession x Segmentation table.
prof_based

In [None]:
# Grouped bar chart: one cluster per segment, one (narrow) bar per profession.
x = np.arange(len(prof_based.columns))
width = 0.1

fig, ax = plt.subplots(figsize=(10, 5))
for slot, (profession, counts) in enumerate(prof_based.iterrows()):
    # Consecutive professions are drawn one bar width apart.
    ax.bar(x + (slot * width), counts, width=width, label=profession)

# Centre the tick of every cluster under its bars.
cluster_middle = (len(prof_based) / 2 - 0.5) * width
ax.set_xticks(x + cluster_middle)
ax.set_xticklabels(prof_based.columns)

ax.set_title('Consumer Type Based On Profession')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Count')

# The legend is anchored through an explicit bounding box instead of being
# left entirely to automatic placement.
ax.legend(loc='best', bbox_to_anchor=(0.92, 0.40, 0.3, 0.6), frameon=True)
plt.show()

In [None]:
# Customer Type Based On Work Experience
# Mean work experience per segment, kept as a one-column frame named
# 'Work_Experience'.

workexp_based = df.groupby('Segmentation')['Work_Experience'].mean().to_frame()

In [None]:
# Display the per-segment mean work experience.
workexp_based

In [None]:
# One bar per segment, bar height = mean work experience of that segment.
fig, ax = plt.subplots(figsize=(10, 5))
segments = workexp_based.index
mean_experience = workexp_based['Work_Experience']
ax.bar(segments, mean_experience, width=0.5)

ax.set_title('Consumer Type Based On Work Experience')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Work Experience Average')
plt.show()

In [None]:
# Customer Type Based On Spending Score
# Rows: Spending_Score values, columns: Segmentation values, cells: number of
# non-zero `ID` entries in that combination.

spend_based = df.pivot_table(
    values='ID',
    index=['Spending_Score'],
    columns=['Segmentation'],
    aggfunc=np.count_nonzero,
)

In [None]:
# Order the spending-score rows by their count in segment 'A', largest first.
spend_based = spend_based.sort_values(by='A', ascending=False)

In [None]:
# Display the Spending_Score x Segmentation table.
spend_based

In [None]:
# Grouped bar chart: one cluster per segment, one bar per spending score.
x = np.arange(len(spend_based.columns))
width = 0.2

fig, ax = plt.subplots(figsize=(10, 5))
for rank, score in enumerate(spend_based.index):
    # Rows are drawn in table order, each one bar width further right.
    ax.bar(x + (rank * width), spend_based.loc[score], width=width, label=score)

# Place each tick under the middle of its cluster.
mid_shift = (len(spend_based) / 2 - 0.5) * width
ax.set_xticks(x + mid_shift)
ax.set_xticklabels(spend_based.columns)

ax.set_title('Consumer Type Based On Spending Score')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Count')

ax.legend(loc='best', frameon=True)
plt.show()

In [None]:
# Customer Type Based On Family Size

# Most frequent family size per segment. `Series.mode()` returns every tied
# value, so a segment with two equally common sizes would not aggregate to a
# single number; taking the smallest mode guarantees one scalar per segment
# (and gives NaN for a segment with no recorded family size).
famsize_based = pd.DataFrame(
    df.groupby('Segmentation')['Family_Size'].agg(lambda sizes: sizes.mode().min())
)

In [None]:
# Display the per-segment most frequent family size.
famsize_based

In [None]:
# One bar per segment, bar height = the Family_Size value stored for it.
fig, ax = plt.subplots(figsize=(10, 5))
segments = famsize_based.index
typical_size = famsize_based['Family_Size']
ax.bar(segments, typical_size, width=0.5)

ax.set_title('Consumer Type Based On Family Size')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Family Size Most Frequent')
plt.show()

In [None]:
# Customer Type Based On Var 1
# Rows: Var_1 values, columns: Segmentation values, cells: number of
# non-zero `ID` entries in that combination.

var_based = df.pivot_table(
    values='ID',
    index=['Var_1'],
    columns=['Segmentation'],
    aggfunc=np.count_nonzero,
)

In [None]:
# Display the Var_1 x Segmentation table.
var_based

In [None]:
# Grouped bar chart: one cluster per segment, one (narrow) bar per Var_1 value.
x = np.arange(len(var_based.columns))
width = 0.1

fig, ax = plt.subplots(figsize=(10, 5))
for row_no, category in enumerate(var_based.index):
    # Row `row_no` is shifted right by `row_no` bar widths inside each cluster.
    ax.bar(x + (row_no * width), var_based.iloc[row_no], width=width, label=category)

# Ticks go under the centre of each cluster.
centre_shift = (len(var_based) / 2 - 0.5) * width
ax.set_xticks(x + centre_shift)
ax.set_xticklabels(var_based.columns)

ax.set_title('Consumer Type Based On Var 1')
ax.set_xlabel('Consumer Type')
ax.set_ylabel('Count')

# The legend is anchored through an explicit bounding box instead of being
# left entirely to automatic placement.
ax.legend(loc='best', bbox_to_anchor=(0.84, 0.42, 0.3, 0.6), frameon=True)
plt.show()

## Preprocessing 

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Extract Var 1 Code Number

In [None]:
def var_code(x):
    """Return the code that follows the first underscore of each label.

    Each string in ``x`` is split on ``"_"`` and its second piece is kept,
    e.g. ``"Cat_6"`` becomes ``6.0``.

    Args:
        x: pandas Series of strings; missing entries are allowed.

    Returns:
        A Series aligned with ``x``. Entries that are missing or contain no
        underscore become NaN. The result is ``float64`` when every extracted
        piece is numeric; otherwise the extracted strings are returned
        unconverted (``errors='ignore'``).
    """
    # Element-wise `.str.get(1)` yields NaN for labels without an underscore,
    # whereas selecting column 1 of an expanded split raises KeyError when no
    # label in the whole Series contains one.
    pieces = x.str.split("_")
    y = pieces.str.get(1).astype('float64',errors='ignore')
    return y

In [None]:
# Replace the raw Var_1 labels with the code extracted by var_code.
df['Var_1'] = var_code(df['Var_1'])

### Missing Value Treatment in Categorical Columns

In [None]:
def null_drop(df,null_col_drop):
    """Return a copy of ``df`` without rows that miss a required column.

    Args:
        df: Input frame; it is not modified.
        null_col_drop: Column labels that must all be non-null for a row to
            be kept.

    Returns:
        The surviving rows with a fresh 0..n-1 index.
    """
    complete_rows = df.dropna(subset=null_col_drop)
    return complete_rows.reset_index(drop=True)

In [None]:
# Columns whose missing values are handled by dropping the whole row (passed to null_drop).
null_col_drop = ['Ever_Married','Graduated','Profession','Var_1']

In [None]:
# Drop training rows that lack a value in any column listed in null_col_drop.
df = null_drop(df,null_col_drop)

### Target Encoding 

In [None]:
# Encoder for the target: the explicit category list fixes the mapping to
# A -> 0, B -> 1, C -> 2, D -> 3 regardless of the order labels appear in.
# `categories` is passed by keyword because recent scikit-learn releases
# accept the encoder's constructor arguments as keywords only.
target_enc = OrdinalEncoder(categories=[
        ['A', 'B',  'C', 'D']])

In [None]:
def labelencoder(df,target_column):
    """Ordinal-encode the target column(s) of ``df`` and return ``df``.

    The module-level ``target_enc`` is refitted on ``df[target_column]`` on
    every call, and the selected columns of ``df`` are overwritten in place
    with the encoded values.

    Args:
        df: Frame holding the target column(s); modified in place.
        target_column: List of column labels to encode.

    Returns:
        The same ``df`` object, with ``target_column`` replaced by its codes.
    """
    labels = df[target_column]
    df[target_column] = target_enc.fit(labels).transform(labels)
    return df

In [None]:
# Target column name, wrapped in a list so that df[target_column] selects a DataFrame rather than a Series.
target_column = ['Segmentation']

In [None]:
# Replace the training target labels with their ordinal codes.
df = labelencoder(df,target_column)

In [None]:
# Preview the frame after the preprocessing steps applied so far.
df.head()

### Dataset Splitting 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the row identifier and the target.
y = df['Segmentation']
X = df.drop(columns=['ID', 'Segmentation'])

# Hold out 20% of the rows, stratified on the target, with a fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Show the shapes of the four parts.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

### Pipeline Building

In [None]:
# List the feature columns available to the preprocessing step.
X_train.columns

In [None]:
# Per-column-type preprocessing pipelines.

# Categorical columns mapped to integer codes.
cat_ord_pipe = Pipeline(steps=[('encoder', OrdinalEncoder())])

# Categorical columns expanded to one indicator column per category.
cat_one_pipe = Pipeline(steps=[('encoder', OneHotEncoder())])

# Numeric columns: fill missing values with the column median.
num_pipe = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

In [None]:
# Route each group of columns to its pipeline: numeric columns are imputed,
# categorical columns are ordinal-encoded.
numeric_columns = ['Age', 'Work_Experience', 'Family_Size']
ordinal_columns = ['Gender', 'Ever_Married', 'Graduated', 'Var_1', 'Profession', 'Spending_Score']

prepro = ColumnTransformer(transformers=[
    ('numeric', num_pipe, numeric_columns),
    ('categoric_ord', cat_ord_pipe, ordinal_columns),
])

# Modeling

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Full model: preprocessing followed by a random forest. The step names
# ('prepro', 'algo') are the prefixes used by the hyper-parameter grid.
forest = RandomForestClassifier(n_jobs=-1, random_state=42)
pipeline = Pipeline(steps=[
    ('prepro', prepro),
    ('algo', forest),
])

In [None]:
# Per-class weights for the forest, keyed by the integer class label.
segment_weights = {0: 0.34, 1: 0.48, 2: 0.39, 3: 0.2}

# Hyper-parameter grid for the 'algo' pipeline step. Every entry lists a
# single candidate, so the grid contains exactly one combination.
param_rf = dict(
    algo__n_estimators=[200],
    algo__max_depth=[6],
    algo__max_features=[0.5],
    algo__min_samples_leaf=[63],
    algo__class_weight=[segment_weights],
)

In [None]:
# Fit the pipeline through a 5-fold cross-validated grid search, using all
# available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_rf,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

# Report the selected parameters and the score on both parts of the split.
print(model.best_params_)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Train data accuracy score: ", train_score)
print("Test data accuracy score: ", test_score)

# Prediction

In [None]:
# Load the separate test split from a path relative to the notebook's working directory.
df_test = pd.read_csv('../input/customer-segmentation/Test.csv')
# Preview the first rows.
df_test.head()

In [None]:
# Apply the same Var_1 code extraction that was applied to the training frame.
df_test['Var_1'] = var_code(df_test['Var_1'])

In [None]:
# Drop test rows that lack a value in any column listed in null_col_drop.
df_test = null_drop(df_test,null_col_drop)

In [None]:
# Encode the test target labels; labelencoder refits target_enc on this frame before transforming it.
df_test = labelencoder(df_test,target_column)

In [None]:
# Split the prepared test frame into labels and features. Both must come from
# `df_test`: taking them from `df` would evaluate the model on its own
# training rows. The labels are read first because the next line removes the
# 'Segmentation' column from `df_test`.
y_df_test = df_test.Segmentation
df_test = df_test.drop(columns=['ID','Segmentation'])

# Evaluation

In [None]:
from sklearn.metrics import plot_confusion_matrix, classification_report

In [None]:
# Predict the encoded segment for every row of df_test.
y_test_pred = model.predict(df_test)

In [None]:
# Per-class predicted probabilities for every row of df_test.
y_test_proba = model.predict_proba(df_test)
# Show them as a table (one row per sample, one column per class).
pd.DataFrame(y_test_proba)

In [None]:
# Score the fitted search object on df_test against y_df_test and print the result.
print("Test data prediction accuracy score: ", model.score(df_test,y_df_test))

In [None]:
# Classification Report

# scikit-learn's signature is classification_report(y_true, y_pred). Passing
# the predictions first swaps the roles of the two arrays, so the columns
# labelled precision and recall would be exchanged and the support counts
# would describe the predictions instead of the true labels.
print(classification_report(y_df_test,y_test_pred))

In [None]:
# Confusion Matrix

# `plot_confusion_matrix` no longer exists in current scikit-learn releases.
# The same row-normalized plot is built here from `confusion_matrix` and
# `ConfusionMatrixDisplay`, which are available in both older and current
# versions. The import is local so this cell does not depend on other cells
# for these two names.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

fig,ax = plt.subplots(figsize=(10,5))
# normalize='true' divides each row by the number of samples of that true
# class, so each row sums to 1.
cm = confusion_matrix(
    y_df_test,
    model.predict(df_test),
    labels=model.classes_,
    normalize='true',
)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot(
    cmap=plt.cm.Blues,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Normalized)')