In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:

# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('Train.csv')

In [4]:


# Keep only complete rows and discard the ID column so it is not used as a feature.
df_cleaned = df.dropna().drop(columns=["ID"])

# Target first, then every remaining column as a predictor.
y = df_cleaned["Segmentation"]
X = df_cleaned.drop(columns=["Segmentation"])

# 80/20 split, stratified on the target so both parts keep the same class
# proportions; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display the shape of each split as the cell output.
tuple(split.shape for split in (X_train, X_val, y_train, y_val))


((5332, 9), (1333, 9), (5332,), (1333,))

In [4]:
# Encode every object-dtype feature column as integer codes.
#
# Work on copies so the column assignments below do not write into the frames
# handed back by train_test_split (guards against pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_val = X_val.copy()

label_encoders = {}
for col in X_train.select_dtypes(include=["object"]).columns:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])  # Fit on the training data only
    label_encoders[col] = encoder  # Store the fitted encoder for future use

# Apply the training-time encoders to the validation features.
# LabelEncoder.transform raises ValueError on a category it never saw during fit.
for col, encoder in label_encoders.items():
    X_val[col] = encoder.transform(X_val[col])

# Encode the target with its OWN encoder. Refitting the loop's encoder here
# would overwrite the mapping stored for the last feature column, and would
# fail with a NameError when there are no object-dtype feature columns.
# `le` is kept as the name of the target encoder so predictions can be decoded
# with le.inverse_transform(...).
if y_train.dtype == "object":
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)  # Learn the class -> integer mapping
    y_val = le.transform(y_val)  # Same mapping for the validation labels

# Show which columns have a stored encoder.
label_encoders

{'Gender': LabelEncoder(),
 'Ever_Married': LabelEncoder(),
 'Graduated': LabelEncoder(),
 'Profession': LabelEncoder(),
 'Spending_Score': LabelEncoder(),
 'Var_1': LabelEncoder()}

In [5]:
# Random forest of 100 trees; the fixed seed keeps repeated runs comparable.
rfc = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Fit on the training split (the fitted estimator is shown as the cell output).
rfc.fit(X_train, y_train)

In [6]:
# Predict a class label for every row of the validation features.
y_pred = rfc.predict(X_val)
# Bare name on the last line: the notebook displays the prediction array.
y_pred

array([0, 3, 2, 0, 0, 0, 2, 0, 3, 3, 1, 3, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0,
       1, 0, 3, 3, 0, 0, 0, 3, 0, 1, 3, 3, 1, 0, 2, 1, 3, 3, 0, 0, 3, 2,
       0, 2, 2, 2, 3, 2, 0, 0, 0, 3, 3, 3, 1, 0, 0, 3, 3, 0, 0, 3, 3, 3,
       2, 3, 3, 1, 3, 0, 0, 2, 0, 0, 0, 0, 3, 1, 3, 0, 3, 1, 2, 2, 0, 1,
       0, 2, 1, 0, 0, 3, 3, 1, 0, 3, 0, 1, 3, 0, 3, 0, 1, 1, 3, 1, 3, 3,
       2, 2, 3, 0, 3, 2, 0, 0, 0, 3, 3, 3, 0, 3, 1, 2, 1, 1, 0, 3, 1, 0,
       1, 3, 1, 3, 0, 0, 2, 0, 0, 0, 1, 0, 3, 0, 0, 1, 1, 0, 3, 3, 0, 1,
       0, 0, 0, 0, 0, 3, 1, 1, 2, 3, 2, 1, 1, 0, 1, 0, 0, 3, 3, 3, 2, 0,
       1, 2, 0, 0, 0, 3, 0, 3, 3, 0, 0, 2, 3, 0, 3, 2, 0, 2, 0, 0, 3, 3,
       0, 3, 2, 1, 0, 3, 0, 0, 1, 0, 0, 0, 3, 2, 0, 0, 1, 3, 3, 1, 0, 1,
       1, 0, 1, 3, 0, 3, 3, 2, 0, 3, 0, 2, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       2, 2, 3, 0, 3, 2, 3, 3, 0, 0, 0, 1, 3, 0, 0, 0, 3, 0, 0, 0, 1, 0,
       3, 1, 0, 2, 3, 3, 0, 1, 1, 3, 0, 0, 2, 0, 1, 3, 2, 0, 3, 3, 2, 0,
       3, 3, 0, 2, 0, 0, 1, 0, 1, 3, 0, 0, 3, 0, 1,

In [7]:
# Evaluate the validation predictions. accuracy_score and classification_report
# come from the notebook's import cell at the top.

# Calculate Accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Detailed Classification Report.
# Built as one string: passing the report as a second print() argument inserts
# a separator space after the newline, which shifts the header row one column
# to the right of the rows beneath it.
print(f"Classification Report:\n{classification_report(y_val, y_pred)}")

# Get prediction probabilities (one row per validation sample, one column per class)
probs = rfc.predict_proba(X_val)

# Confidence = max probability among all classes
confidence = probs.max(axis=1)

# Print first 5 confidence scores
print(confidence[:5])

Accuracy: 0.3364
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.42      0.37       139
           1       0.27      0.24      0.26        90
           2       0.27      0.18      0.22        76
           3       0.43      0.40      0.41       126

    accuracy                           0.34       431
   macro avg       0.32      0.31      0.31       431
weighted avg       0.33      0.34      0.33       431

[0.54 0.54 0.42 0.34 0.28]


In [8]:
# Count how many training rows carry each value of the Profession column
# (integer codes if the column was label-encoded by an earlier cell).
print(X_train['Profession'].value_counts())


Profession
0    566
5    261
3    205
1    175
2    149
7    139
4    115
8     73
6     40
Name: count, dtype: int64


In [9]:
# Score the model on the data it was fitted on, for comparison with the
# validation result.
# NOTE: this rebinds y_pred, accuracy, probs and confidence, replacing the
# validation-set values computed earlier in the notebook.
y_pred = rfc.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy:.4f}")
probs = rfc.predict_proba(X_train)

# Confidence = max probability among all classes
confidence = probs.max(axis=1)

# Print first 5 confidence scores (sliced so the full per-row array is not dumped)
print(confidence[:5])

Accuracy: 1.0000
[0.74 0.62 0.62 0.78 0.79]


In [10]:
# NOTE(review): `train_df` is not assigned in any cell visible here; on a fresh
# kernel this raises NameError unless it is defined elsewhere. Confirm which
# frame was intended (presumably a separately loaded dataset; verify).
len(train_df)

2627

In [11]:
# Pairwise correlation matrix of the training feature columns.
# NOTE(review): the recorded output lists an ID column although ID is dropped
# before the split; the saved output looks stale. Re-run to confirm.
X_train.corr()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
ID,1.0,0.005971,0.057937,0.000509,0.00234,-0.021842,-0.017901,-0.032916,0.016408,-0.029881
Gender,0.005971,1.0,0.110279,-0.017965,-0.070051,0.055075,-0.067984,-0.061581,0.078752,0.020661
Ever_Married,0.057937,0.110279,1.0,0.585226,0.201802,-0.097406,-0.130992,-0.615285,-0.095562,0.074946
Age,0.000509,-0.017965,0.585226,1.0,0.260847,0.072337,-0.188712,-0.301206,-0.291188,0.188204
Graduated,0.00234,-0.070051,0.201802,0.260847,1.0,-0.33124,0.046856,-0.166277,-0.299888,0.162347
Profession,-0.021842,0.055075,-0.097406,0.072337,-0.33124,1.0,-0.03362,0.213872,0.11003,-0.022067
Work_Experience,-0.017901,-0.067984,-0.130992,-0.188712,0.046856,-0.03362,1.0,0.083217,-0.081749,0.013305
Spending_Score,-0.032916,-0.061581,-0.615285,-0.301206,-0.166277,0.213872,0.083217,1.0,-0.100159,-0.023436
Family_Size,0.016408,0.078752,-0.095562,-0.291188,-0.299888,0.11003,-0.081749,-0.100159,1.0,-0.199359
Var_1,-0.029881,0.020661,0.074946,0.188204,0.162347,-0.022067,0.013305,-0.023436,-0.199359,1.0


In [12]:
# Training-set accuracy and prediction confidence.
# NOTE: this cell repeats the training-set check from an earlier cell and
# rebinds y_pred, accuracy, probs and confidence to training-set values.
y_pred = rfc.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy:.4f}")
probs = rfc.predict_proba(X_train)

# Confidence = max probability among all classes
confidence = probs.max(axis=1)

# Print first 5 confidence scores (sliced so the full per-row array is not dumped)
print(confidence[:5])