In [70]:
# !pip install plotnine
# !pip install yellowbrick

In [71]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

from sklearn.cluster import DBSCAN, KMeans, KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.compose import ColumnTransformer

import plotly.express as px

plt.figure(figsize=(15,8))
sns.set(rc = {'figure.figsize':(15,8)})

In [72]:
def plot_DBSCAN(df, eps, min_samples, plot=False):
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(df).labels_
    hue = [str(i) for i in labels]
    print(silhouette_score(df, labels))
    if plot:
        sns.scatterplot(x=df[:, 0], y=df[:, 1], hue=hue)
        plt.show()
    return labels

def plot_KMeans(df, n_clusters, plot=False):
    kmeans = KMeans(n_clusters=n_clusters).fit(df)
    hue = [str(i) for i in kmeans.labels_]
    print(silhouette_score(df, kmeans.labels_))
    if plot:
        sns.scatterplot(x=df[:, 0], y=df[:, 1], hue=hue)
        plt.show()
    return kmeans.labels_


def plot_Agg(df, n_clusters=2, plot=False):
    clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(df)
    hue = [str(i) for i in clustering.labels_]
    print(silhouette_score(df, clustering.labels_))
    if plot:
        sns.scatterplot(x=df[:, 0], y=df[:, 1], hue=hue)
        plt.show()
    return clustering.labels_

def plot_gaussian(df_scaled, k):
    gm = GaussianMixture(k)
    gm.fit(df_scaled)
    labels = gm.predict(df_scaled)
    print(silhouette_score(df_scaled, labels))
    return labels

def elbow(df, k=7):
    model = KElbowVisualizer(KMeans(), k=k)
    model.fit(df)
    model.show()


def plot_1d(pca_df, labels):
    sns.scatterplot(pca_df[:, 0], np.zeros_like(pca_df[:, 0]), hue=labels)
    plt.show()

In [73]:
train_df = pd.read_csv("Data.csv")
test_df = pd.read_csv("Test.csv")

In [74]:
train_df.head()

In [75]:
test_df.head()

In [43]:
class Data_Analysis:
    def view_dataset(self, df):
        print(df.head())
    def checkk_nan(self, df):
        print(df.isna().sum())
    
    def remove_nan(self, df):
        print("Shape of dataset before removing missing values: ", df.shape)
        df.dropna(inplace=True)
        print("Shape of dataset after removing missing values: ", df.shape)
        return df
        
da = Data_Analysis()

In [44]:
# da.view_dataset(train_df)
# da.checkk_nan(train_df)
train_df = da.remove_nan(train_df)


In [45]:
train_df.describe()

In [46]:
train_df.info()

In [47]:
train_df.describe()

In [48]:
train_df.columns

<h1>Uni Variate Analysis</h1>

In [49]:
plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
sns.countplot(x=train_df['Gender'])
plt.title("Distributions of Gender")

plt.subplot(1,2,2)
sns.countplot(x=train_df['Ever_Married'])
plt.title("Distributions of Ever Married")
plt.show()

In [50]:
sns.countplot(x=train_df['Graduated'])
plt.title("Distributions of Graduated")
plt.show()

In [51]:
sns.countplot(x=train_df['Profession'])
plt.title("Distributions of Profession")
plt.xticks(rotation=45)
plt.show()

In [52]:
sns.displot(data = train_df, x='Age', kde=True, height=8, aspect=15/8)

In [53]:
plt.figure(figsize=(10,5))
sns.histplot(train_df['Work_Experience'])
plt.title("Work Experience Distributions")
plt.show()

In [54]:
plt.figure(figsize=(6,4))
sns.histplot(train_df['Family_Size'])
plt.title("Family Size Distributions")
plt.show()

In [55]:
sns.countplot(x=train_df['Spending_Score'])
plt.title("Spending Score Distributions")
plt.show()

In [56]:
sns.countplot(x=train_df['Var_1'])
plt.title("Variable Distributions")
plt.show()

In [57]:
train_df['Var_1'].value_counts().plot(kind="pie",autopct="%.1f%%")
plt.title("Variable Distributions")
plt.legend()
plt.show()

In [58]:
sns.countplot(x=train_df['Segmentation'])
plt.title("Segmentation Distributions")
plt.show()

In [59]:
train_df['Segmentation'].value_counts().plot(kind="pie",autopct="%.1f%%")
plt.title("Segmentation Distributions")
plt.show()

<h1>Bi-Variate Analysis</h1>

In [60]:
sns.pairplot(data=train_df[['Age', 'Graduated', 'Work_Experience', 'Family_Size',
       'Segmentation']],hue='Segmentation',palette=['Red','Blue','Green','Yellow']);
plt.show()

<h1>Age</h2>

In [61]:
print(train_df['Age'].min())
print(train_df['Age'].max())
print(train_df['Age'].mean())
print(train_df['Age'].median())
print(train_df['Age'].mode()[0])

In [62]:
sns.displot(data = train_df, x='Age', row='Profession', kde=True, height=8, aspect=15/8)


In [63]:
dummy_df = train_df.copy()
dummy_df['Age'] = pd.cut(dummy_df['Age'],
                     bins=[0, 15, 25, 35, 45, 55, 65, 75, 85, np.inf],
                     labels=['0-15', '15-25', '25-35', '35-45', '45-55', '55-65', '65-75', '75-85', '85+'])

# Convert 'Age' column to a categorical data type with the specified order
age_order = ['0-15', '15-25', '25-35', '35-45', '45-55', '55-65', '65-75', '75-85', '85+']
dummy_df['Age'] = pd.Categorical(dummy_df['Age'], categories=age_order, ordered=True)

# Create a countplot
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title('Age Distribution', fontsize=20)

# Sort the age ranges for better readability
sns.countplot(x='Age', data=dummy_df, order=age_order)

ax.set_ylabel('Number', fontsize=15)
ax.set_xlabel('Age Ranges', fontsize=15)

# Add percentage values on top of the bars
total = len(dummy_df['Age'])
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    x = p.get_x() + p.get_width() / 2 - 0.15
    y = p.get_height() + 10
    ax.annotate(percentage, (x, y), fontsize=12)

plt.xticks(rotation=45)
plt.show()

In [64]:
sns.countplot(data=train_df, x='Gender', hue='Spending_Score')

In [65]:
sns.countplot(data = train_df, x='Profession', hue='Graduated')
plt.xticks(rotation=45)


In [66]:
sns.countplot(data = train_df, x='Profession', hue='Spending_Score')
plt.xticks(rotation=45)


In [67]:
sns.countplot(data = train_df, x='Family_Size', hue='Spending_Score')
plt.xticks(rotation=45)


<h2>Corelation ANalysis</h2>

In [68]:
train_df.reset_index(inplace=True, drop=True)

In [69]:
train_df.head()

In [36]:
make_cat = ['Profession', 'Spending_Score']
make_dummies = ['Gender', 'Ever_Married', 'Graduated'] 

train_df = pd.get_dummies(train_df, columns=make_dummies)

for col in make_cat:
    train_df[col] =train_df[col].astype('category')

train_df[make_cat] = OrdinalEncoder().fit_transform(train_df[make_cat])
columns_to_drop = ["ID","Var_1","Segmentation"]
train_df.drop(columns_to_drop, axis=1, inplace=True)
train_df.info()

KeyError: "['ID', 'Var_1', 'Segmentation'] not found in axis"

In [35]:
train_df = train_df.astype({'Gender_Female': int, 'Gender_Male': int, "Ever_Married_No": int, "Ever_Married_Yes": int, "Graduated_No": int,"Graduated_Yes": int})
correlation_matrix = train_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', cbar=True, linewidths=.5)
plt.show()

KeyError: "Only a column name can be used for the key in a dtype mappings argument. 'Gender_Female' not found in columns."

<h2>Outlier Analysis</h2>

In [None]:
def create_outlier_charT(column_name):
    Q1 = train_df[column_name].quantile(0.25)
    Q3 = train_df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = train_df[(train_df[column_name] < lower_bound) | (train_df[column_name] > upper_bound)]
    plt.figure(figsize=(8, 6))
    plt.scatter(train_df.index, train_df[column_name], label='Data')
    plt.scatter(outliers.index, outliers[column_name], color='r', label='Outliers')
    plt.axhline(y=lower_bound, color='g', linestyle='--', label='Lower Bound')
    plt.axhline(y=upper_bound, color='g', linestyle='--', label='Upper Bound')
    plt.xlabel('Index')
    plt.ylabel(column_name)
    plt.legend()
    plt.title('Identifying and Plotting Outliers')
    plt.show()

In [None]:
create_outlier_charT("Age")

In [None]:
create_outlier_charT("Work_Experience")

In [None]:
create_outlier_charT("Spending_Score")

In [None]:
create_outlier_charT("Family_Size")

<h2>Initiate clustering</h2>

In [None]:
df = pd.read_csv("Data.csv")
df.dropna(inplace=True)
make_dummies = ['Gender', 'Ever_Married', 'Graduated'] 
make_cat = ['Profession', 'Spending_Score']
df = pd.get_dummies(df, columns=make_dummies)

for col in make_cat:
    df[col] = df[col].astype('category')

df[make_cat] = OrdinalEncoder().fit_transform(df[make_cat])
columns_to_drop = ["ID","Var_1","Segmentation"]
df.drop(columns_to_drop, axis=1, inplace=True)
df.info()

In [None]:
def remove_outliers(column_name, df):
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[~((df[column_name] < lower_bound) | (df[column_name] > upper_bound))]
    return df

In [None]:
df = remove_outliers("Age", df)
df = remove_outliers("Work_Experience", df)
df = remove_outliers("Family_Size", df)


In [None]:
df.head()

<h2>K Means Clustering Elbow</h2>

In [None]:
columns_to_scale = ['Age', 'Profession', 'Work_Experience', 'Spending_Score', 'Family_Size']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('passthrough', 'passthrough', df.columns.difference(columns_to_scale).tolist())
    ])
scaled_data = preprocessor.fit_transform(df)
# df = pd.DataFrame(scaled_data, columns=df.columns)


<h3>Implement PCA</h3>

In [None]:
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pc_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])


In [None]:
pc_df.head()

In [None]:
elbow(pc_df, k=15)

<h2>KMeans Visualization </h2>

In [None]:
kmeans = KMeans(n_clusters=6).fit(pc_df)
hue = [str(i) for i in kmeans.labels_]
print(silhouette_score(pc_df, kmeans.labels_))
sns.scatterplot(x=pc_df.iloc[:, 0], y=pc_df.iloc[:, 1], hue=hue)
plt.show()


<h2>Agglomorative Clustering</h2>

In [None]:
x = []
y = []
for i in range(2, 11):
    clustering = AgglomerativeClustering(n_clusters=i).fit(pc_df)
    x.append(silhouette_score(pc_df, clustering.labels_))
    y.append(i)

plt.plot(y, x)
plt.show()

In [None]:
clustering = AgglomerativeClustering(n_clusters=8).fit(pc_df)
hue = [str(i) for i in clustering.labels_]
print(silhouette_score(pc_df, clustering.labels_))
sns.scatterplot(x=pc_df.iloc[:, 0], y=pc_df.iloc[:, 1], hue=hue)
plt.show()

<h2>DBSCAN</h2>

In [None]:
epsilons = np.arange(1, 2, 0.02)
silhouettes = np.zeros(shape=(len(epsilons), 2))
index = 0
for i in epsilons:
    eps = round(i, 2)
    labels = DBSCAN(eps=eps, min_samples=20).fit(pc_df).labels_
    silhouettes[index, 0] = eps
    silhouettes[index, 1] = silhouette_score(pc_df, labels)
    index+=1

In [None]:
plt.plot(silhouettes[:, 0], silhouettes[:, 1])


In [None]:
labels = DBSCAN(eps=2, min_samples=4).fit(pc_df).labels_
hue = [str(i) for i in labels]
print(silhouette_score(pc_df, labels))
sns.scatterplot(x=pc_df.iloc[:, 0], y=pc_df.iloc[:, 1], hue=hue)
plt.show()