In [None]:
#Import required libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Data Visualization
import seaborn as sns #Data Visualization
from pandas.api.types import is_string_dtype #Check Datatype
from pandas.api.types import is_numeric_dtype #Check Datatype
from sklearn.preprocessing import LabelEncoder #preProcessing data
import os

# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the training split of the customer-segmentation dataset and preview
# its first five rows
data = pd.read_csv('../input/customer-segmentation/Train.csv')
data.head(n=5)

In [None]:
# Share of missing values per column, as a percentage rounded to 2 decimals
missing_counts = data.isna().sum()
(missing_counts / len(data) * 100).round(2)

In [None]:
# Study the data: print the column names, dtypes and non-null counts so that
# columns with missing values stand out
data.info()

As shown above, the data contains some missing values. There are 8068 entries in total, but some columns have fewer than 8068 non-null entries, which means they have missing values.

In [None]:
# Show the dimensions as (rows, columns); at the time of writing the author
# recorded 8068 rows and 11 columns.
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Columns 'Work_Experience' and 'Family_Size' have a large number of null values. Since they are continuous features, their nulls are replaced with the column mean. 'Ever_Married', 'Graduated' and 'Profession' have only a small number of null values, which are replaced with the most frequent value (the mode) in the next cell.

In [None]:
# Impute missing values in place: continuous columns get their mean,
# categorical columns get their most frequent value (mode).
mean_imputed = ["Work_Experience", "Family_Size"]
mode_imputed = ["Ever_Married", "Graduated", "Profession"]

fill_values = {}
for column in mean_imputed:
    fill_values[column] = data[column].mean()
for column in mode_imputed:
    fill_values[column] = data[column].mode()[0]

data.fillna(value=fill_values, inplace=True)

In [None]:
# Remove the "Var_1" and "ID" columns in place, then re-inspect the frame
data.drop(columns=["Var_1", "ID"], inplace=True)
data.info()

# Univariate Analysis

In [None]:
# Univariate plots: a histogram and a box plot for every numeric column,
# and a frequency-ordered count plot for every string column.
for col in data.columns:
    if is_numeric_dtype(data[col]):
        # Histogram of the distribution
        plt.figure(figsize=(5, 5))
        plt.title(col)
        data[col].plot(kind="hist")
        plt.show()

        # Box plot on its own titled figure. The histogram figure has already
        # been shown, so without an explicit figure/show pair the box plot
        # would end up on an implicit, untitled, default-sized figure.
        plt.figure(figsize=(5, 5))
        plt.title(col)
        sns.boxplot(data=data, x=col)
        plt.show()
    elif is_string_dtype(data[col]):
        plt.figure(figsize=(5, 5))
        plt.title(col)
        # Bars ordered from the most to the least frequent category
        sns.countplot(x=col, data=data, order=data[col].value_counts().index)
        plt.show()

### Findings:

* There are more males than females in the dataset
* There are more married than un-married people
* The dataset contains more people in the 30-50 age range; age is not normally distributed and has some outliers. The same holds for work experience and family size: most people have 0-4 years of experience, and families mainly have 2-4 people.
* There are more graduated people & artist in the dataset
* People tend to have lower spendings in this dataset.
* The dataset is balanced across all segments, as shown in the last graph.

There are not many outliers, so I would leave them

# Bi-Variate Analysis

### For numerical variables

In [None]:
# Pairwise plots of the numerical variables, coloured by segment, to look for
# relationships between them
numeric_features = ['Age', 'Work_Experience', 'Family_Size']
sns.pairplot(
    data,
    hue='Segmentation',
    vars=numeric_features,
    diag_kind='hist',
    palette="bright",
)

I don't see any kind of relationship between numerical variables.

### With numerical & categorical variables

In [None]:
# Box plots of every numeric feature per segment, split by one categorical
# feature at a time.
for ycolumns in ['Age','Work_Experience','Family_Size']:
    for hcolumns in ['Gender','Profession','Spending_Score', 'Ever_Married']:
        # Open a fresh 5x5 figure for every plot. Creating it only once per
        # numeric feature meant that, after the first plt.show(), the remaining
        # plots were drawn on implicit default-sized figures.
        plt.figure(figsize=(5, 5))
        # Title each plot so the twelve figures can be told apart
        plt.title("{} by Segmentation and {}".format(ycolumns, hcolumns))
        sns.boxplot(data=data, x="Segmentation", y=ycolumns, hue=hcolumns)
        plt.show()

Age - 
Segment D has people from a lower age group (20-40), while A, B and C are mostly in the 40-60 range. A & B have more males than females, while D & C both have almost equal numbers of males and females.

Profession & Age: Lawyers belong to the higher age group across all 4 segments. In A & B, engineers also belong to the higher age group, with B having older engineers than A.

Spending & Age: Across all 4 groups, people with high spending power have wide age distribution. Youngest people with low spending power mainly belongs to segment D. Segment C has less overlapping age groups across different spending score groups.

Married & Age: Across all segments, people who are married are older than people who are not, which is expected. Unmarried people in group D are the youngest. Group B also has some unmarried people who are around 45 years old.

Work Experience & Gender: Females are significantly more experienced than males in segments D and B, whereas in segments A and C females are only slightly more experienced than males.

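# Feature Encoding

LabelEncoder encodes labels with values between 0 and n_classes-1, where n_classes is the number of distinct labels. For the ordered feature (Spending_Score) we use LabelEncoder() from sklearn; for the unordered features we use get_dummies() from pandas. Note that LabelEncoder assigns its codes in sorted (alphabetical) label order, not in any semantic order.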

In [None]:
# Keep an untouched copy of the cleaned frame (original category labels) for
# the plots further down.
data_copy = data.copy()

# Spending_Score is ordinal, so encode it with an explicit ranking.
# LabelEncoder assigns codes in sorted label order (Average=0, High=1, Low=2),
# which does not preserve Low < Average < High.
# NOTE(review): assumes the levels are spelled "Low", "Average" and "High";
# the guard below fails loudly if the column holds anything else.
spending_rank = {"Low": 0, "Average": 1, "High": 2}
unknown_scores = set(data["Spending_Score"].dropna().unique()) - set(spending_rank)
if unknown_scores:
    raise ValueError(
        "Unexpected Spending_Score values: {}".format(sorted(unknown_scores, key=str))
    )
data["Spending_Score"] = data["Spending_Score"].map(spending_rank)

# One-hot encode the nominal (unordered) features, replacing each original
# column with its dummy columns.
for columns in ["Gender", "Ever_Married", "Graduated", "Profession"]:
    new_data = pd.get_dummies(data[columns], prefix=columns)
    data = pd.concat([data, new_data], axis=1)
    data.drop([columns], axis=1, inplace=True)

# Drop one dummy per feature as the reference level; it is fully determined by
# the remaining dummies of the same feature.
data.drop(
    ["Gender_Female", "Ever_Married_No", "Graduated_No", "Profession_Marketing"],
    axis=1,
    inplace=True,
)
print(data.head())
    

### Correlation between features

In [None]:
# Heatmap of the pairwise correlations between the features.
# 'Segmentation' still holds string labels at this point; recent pandas
# versions raise on non-numeric columns in DataFrame.corr() instead of
# silently skipping them, so the target is dropped explicitly first.
plt.figure(figsize=(10, 10))
feature_corr = data.drop(columns=["Segmentation"]).corr()
sns.heatmap(feature_corr, annot=True)
plt.show()

In [None]:
# Segment counts split by spending score, drawn once for married ("Yes") and
# once for unmarried ("No") customers.
# Fix the bar and hue order up front so both plots use the same layout and
# colours and can be compared directly.
segment_order = sorted(data_copy["Segmentation"].dropna().unique())
spending_levels = list(data_copy["Spending_Score"].dropna().unique())

for val in ["Yes", "No"]:
    # Filter the whole frame once so x and hue come from the same rows,
    # instead of pairing a filtered x Series with an unfiltered hue Series and
    # relying on implicit index alignment.
    subset = data_copy[data_copy["Ever_Married"] == val]
    plt.title(val)
    sns.countplot(
        data=subset,
        x="Segmentation",
        hue="Spending_Score",
        order=segment_order,
        hue_order=spending_levels,
    )
    plt.show()

It is clear that people who are not married have a low spending score and mainly belong to segment D. On the other hand, those who are married and have a low spending score mainly belong to segments A & D, while B & C have more people with an average spending score.

In [None]:
# Age distribution with a KDE overlay, coloured by marital status
marital_status = data_copy["Ever_Married"]
sns.displot(data=data_copy, x="Age", hue=marital_status, kde=True)
plt.show()

# Modelling Data - Logistic Regression

In [None]:
# Import libraries related to modelling
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Independent features (still unscaled) and the dependent feature y
features = data.drop(['Segmentation'], axis=1)
y = data['Segmentation']

# Divide the dataset into training (80%) and testing (20%) BEFORE scaling, so
# the scaler never sees the held-out rows (fitting it on the full dataset
# leaks test-set statistics into training).
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Fit the standard scaler on the training rows only, then apply it everywhere
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)
X_test = scaler.transform(features_test)
# Scaled full feature matrix, kept under its original name for any later cell
X = scaler.transform(features)

# 10-fold cross-validation splitter shared by all models
Kfold_val = KFold(10)

# Initialize Models
# As it is a multiclass problem, LR is configured one-vs-rest with liblinear.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases -
# confirm against the installed version.
LogReg = LogisticRegression(penalty='l2', max_iter=5000, multi_class='ovr', solver='liblinear')
dtc = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(max_depth=8, random_state=0)
svm_c = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo')
knn = KNeighborsClassifier(n_neighbors=3)
bnb = BernoulliNB()
models = [LogReg, dtc, rfc, svm_c, knn, bnb]
models_name = {0: 'Logistic Regression', 1: 'Decision Tree Classification', 2: 'Random Forest Classification', 3: 'SVM', 4: 'KNN', 5: 'Bernouli Naive'}

# Mean cross-validation accuracy per model, keyed by display name
model_sc = {}


def model_pred(model, i):
    """Fit ``model``, report its hold-out metrics and record its CV accuracy.

    The model is fitted on the training split and evaluated on the test split
    (confusion matrix, classification report, accuracy). It is then
    cross-validated on the full dataset with ``Kfold_val``; the mean fold
    accuracy is stored in ``model_sc`` under ``models_name[i]``.

    Args:
        model: An unfitted scikit-learn classifier.
        i: Key of the model's display name in ``models_name``.
    """
    name = models_name[i]

    model.fit(X_train, y_train)
    model_predc = model.predict(X_test)

    # Cross-validate on the unscaled features through a pipeline so the scaler
    # is re-fitted on each fold's training part only. cross_val_score clones
    # the estimator, so the model fitted above is left untouched.
    cv_estimator = make_pipeline(StandardScaler(), model)
    model_cv = cross_val_score(cv_estimator, features, y, cv=Kfold_val)
    model_cv_sc = np.mean(model_cv)
    model_sc[name] = model_cv_sc

    print("Confusion Matrix for {} is: \n {}".format(name, confusion_matrix(y_test, model_predc)))
    print(classification_report(y_test, model_predc))
    print(model_cv)
    print(model_cv_sc)
    print("Accuracy Score is {}".format(accuracy_score(y_test, model_predc)))


for i, model_s in enumerate(models):
    model_pred(model_s, i)

print(model_sc)

In [None]:
# Report every model's mean cross-validation accuracy as a whole-number percentage
for model_label, cv_accuracy in model_sc.items():
    print(model_label, ':', round(cv_accuracy * 100))

# Conclusion

From the cross-validation results, we can choose Random Forest, with about 53% mean accuracy, as our model. It likely gave more accurate results because it is an ensemble model.

Thanks for reading, I appreciate feedback!
