In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv('Test.csv')
test_df = pd.read_csv('Train.csv')
train_df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6,B
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6,A
2,458996,Female,Yes,69,No,,0.0,Low,1.0,Cat_6,A
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6,B
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,B
2623,467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,A
2624,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6,C
2625,467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,C


In [3]:
# Drop rows with missing values
train_df_cleaned = train_df.dropna()

# Split data into features (X) and target (y)

# train_df=train_df.drop(['ID','Var_1'],axis=1)
train_df_cleaned=train_df.drop(columns=["ID"])
# X = train_df_cleaned.drop(columns=["Segmentation"])  # Features
# y = train_df_cleaned["Segmentation"]  # Target variable



In [4]:
# label_encoders = {}
# for col in X.select_dtypes(include=["object"]).columns:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])  # Fit and transform training data
#     label_encoders[col] = le  # Store the encoder for future use


In [6]:
label_encoders = {}
for col in train_df_cleaned.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    train_df_cleaned[col] = le.fit_transform(train_df_cleaned[col])  # Fit and transform training data
    label_encoders[col] = le  # Store the encoder for future use

In [7]:
X = train_df_cleaned.drop(columns=["Segmentation"])  # Features
y = train_df_cleaned["Segmentation"]  # Target variable

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rfc.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = rfc.predict(X_test)
# Calculate Accuracy
accuracy_test = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_test:.4f}")

# Detailed Classification Report

# Get prediction probabilities
probs_test = rfc.predict_proba(X_test)

# Confidence = max probability among all classes
probs_test.max(axis=1)



Accuracy: 0.3384


array([0.42      , 0.71      , 0.63416667, 0.66      , 0.42      ,
       0.33      , 0.33      , 0.55      , 0.35      , 0.52733333,
       0.4       , 0.42      , 0.51      , 0.34      , 0.81      ,
       0.41      , 0.4925    , 0.424     , 0.55      , 0.29      ,
       0.63      , 0.3       , 0.78      , 0.61      , 0.35      ,
       0.4       , 0.47      , 0.91      , 0.774     , 0.61      ,
       0.34      , 0.69      , 0.33333333, 0.5       , 0.35      ,
       0.48      , 0.42666667, 0.36      , 0.335     , 0.63      ,
       0.55      , 0.325     , 0.63233333, 0.42      , 0.39      ,
       0.48      , 0.38      , 0.4325    , 0.76333333, 0.53733333,
       0.32      , 0.46      , 0.4125    , 0.308     , 0.36      ,
       0.40333333, 0.648     , 0.54333333, 0.6475    , 0.3525    ,
       0.35      , 0.56      , 0.50333333, 0.45016667, 0.47      ,
       0.56      , 0.38      , 0.37      , 0.405     , 0.39      ,
       0.325     , 0.36      , 0.56      , 0.74683333, 0.47   

In [11]:
y_pred_train = rfc.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Accuracy: {accuracy_train:.4f}")
probs_train = rfc.predict_proba(X_train)

# Confidence = max probability among all classes
probs_train.max(axis=1)




Accuracy: 0.9814


array([0.6325    , 0.605     , 0.84      , ..., 0.7       , 0.63083333,
       0.84      ])

In [14]:
# Function to add noise to confidence scores
import numpy as np
def add_noise(probabilities, confidence_scores):
    noisy_probs = np.copy(probabilities)  # Copy to avoid modifying original array

    for i in range(len(confidence_scores)):
        if confidence_scores[i] >= 0.8:
            noise = np.random.laplace(0, 0.5, size=probabilities.shape[1])  # High noise
        elif 0.5 <= confidence_scores[i] < 0.8:
            noise = np.random.normal(0, 0.2, size=probabilities.shape[1])  # Moderate noise
        else:
            noise = np.random.exponential(0.1, size=probabilities.shape[1])  # Low noise

        noisy_probs[i] += noise  # Add noise
        noisy_probs[i] = np.maximum(noisy_probs[i], 0)  # Ensure non-negative
        noisy_probs[i] /= noisy_probs[i].sum()  # Normalize to sum to 1

    return noisy_probs

# Apply noise to test set predictions

confidence_scores_test = probs_test.max(axis=1)
noisy_probabilities_test = add_noise(probs_test, confidence_scores_test)

# Get final predictions after noise
y_pred_noisy = np.argmax(noisy_probabilities_test, axis=1)

# Compute accuracy after adding noise
noisy_accuracy_test = accuracy_score(y_test, y_pred_noisy)
print(f"Accuracy After Applying Differential Privacy: {noisy_accuracy_test:.4f}")

# Compare with baseline
accuracy_drop = accuracy_test - noisy_accuracy_test
print(f"Accuracy Drop: {accuracy_drop:.4f}")

Accuracy After Applying Differential Privacy: 0.3061
Accuracy Drop: 0.0323


In [16]:
confidence_scores_train = probs_train.max(axis=1)
noisy_probabilities_train = add_noise(probs_train, confidence_scores_train)

# Get final predictions after noise
y_pred_noisy_train = np.argmax(noisy_probabilities_train, axis=1)
print(y_pred_noisy_train)
print(y_train)
# Compute accuracy after adding noise
noisy_accuracy_train = accuracy_score(y_train, y_pred_noisy_train)
print(f"Accuracy After Applying Differential Privacy: {noisy_accuracy_train:.4f}")

# Compare with baseline
print(accuracy_train,noisy_accuracy_train)
accuracy_drop = accuracy_train - noisy_accuracy_train
print(f"Accuracy Drop: {accuracy_drop:.4f}")

[1 1 1 ... 1 1 0]
37      1
934     1
2555    0
1710    0
757     0
       ..
1893    1
167     2
445     1
253     1
1914    3
Name: Segmentation, Length: 2101, dtype: int64
Accuracy After Applying Differential Privacy: 0.8648
0.9814374107567825 0.8648262732032366
Accuracy Drop: 0.1166


  noisy_probs[i] /= noisy_probs[i].sum()  # Normalize to sum to 1
