# EDA and Explainability
This notebook loads the synthetic dataset, does basic EDA, trains (or loads) the model, and shows feature importance plots.

In [61]:
import pandas as pd
import numpy as np
from pathlib import Path

from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path("C:/Users/shinejose/Downloads/sovanta_streamlit_app_fixed2")
data_path = PROJECT_ROOT / "data" / "synthetic_customers.csv"

df = pd.read_csv(data_path)
print(df.head())



# -----------------------------
# Basic EDA with NumPy
# -----------------------------
print("Dataset summary (mean, std, min, max):")
summary = df.describe()
print(summary)

print("\nTarget value counts:")
print(df['target_highvalue'].value_counts())

# -----------------------------
# Histogram using NumPy
# -----------------------------
age_data = df['age'].to_numpy()
hist_bins = 10
hist_counts, bin_edges = np.histogram(age_data, bins=hist_bins)

print("\nAge histogram bins:", bin_edges)
print("Age histogram counts:", hist_counts)

# -----------------------------
# Feature importance example
# -----------------------------
# If you have a RandomForest or similar model:
# feat_names = ['age','years_experience','education','salary']
# importances = clf.feature_importances_  # numpy array
# print("Feature importances:")
# for f, imp in zip(feat_names, importances):
#     print(f"{f}: {imp:.3f}")


   age  years_experience  education  salary  target_highvalue
0   20                 3          2   32236                 1
1   23                 1          1   31317                 1
2   23                 0          2   36206                 1
3   59                18          3   81300                 1
4   29                 8          2   51883                 1
Dataset summary (mean, std, min, max):
              age  years_experience   education         salary  \
count  800.000000        800.000000  800.000000     800.000000   
mean    39.390000         10.386250    1.843750   53440.037500   
std     11.767689          6.177966    0.774652   17610.488861   
min     20.000000          0.000000    1.000000    2223.000000   
25%     29.000000          5.000000    1.000000   39855.250000   
50%     39.000000         10.000000    2.000000   53371.500000   
75%     50.000000         16.000000    2.000000   65886.250000   
max     59.000000         23.000000    3.000000  102928.00000

In [63]:
df.describe()

Unnamed: 0,age,years_experience,education,salary,target_highvalue
count,800.0,800.0,800.0,800.0,800.0
mean,39.39,10.38625,1.84375,53440.0375,1.0
std,11.767689,6.177966,0.774652,17610.488861,0.0
min,20.0,0.0,1.0,2223.0,1.0
25%,29.0,5.0,1.0,39855.25,1.0
50%,39.0,10.0,2.0,53371.5,1.0
75%,50.0,16.0,2.0,65886.25,1.0
max,59.0,23.0,3.0,102928.0,1.0


In [65]:
import numpy as np

# Histogram of 'age' using NumPy
ages = df['age'].to_numpy()
hist, bin_edges = np.histogram(ages, bins=10)  # 10 bins
print("Bin edges:", bin_edges)
print("Counts:", hist)

# Optional: simple text-based histogram
for i in range(len(hist)):
    print(f"{bin_edges[i]:.0f}-{bin_edges[i+1]:.0f}: {'#'*hist[i]}")


Bin edges: [20.  23.9 27.8 31.7 35.6 39.5 43.4 47.3 51.2 55.1 59. ]
Counts: [90 85 75 69 82 76 79 79 82 83]
20-24: ##########################################################################################
24-28: #####################################################################################
28-32: ###########################################################################
32-36: #####################################################################
36-40: ##################################################################################
40-43: ############################################################################
43-47: ###############################################################################
47-51: ###############################################################################
51-55: ##################################################################################
55-59: ###################################################################################


In [67]:
# Train a quick RandomForest and show feature_importances_
import joblib, os
model_path = os.path.join('models','model.joblib')
if os.path.exists(model_path):
    clf = joblib.load(model_path)
    import numpy as np
    feat_names = ['age','years_experience','education','salary']
    import matplotlib.pyplot as plt
    plt.figure()
    plt.bar(feat_names, clf.feature_importances_)
    plt.title('Feature importances')
    plt.show()
else:
    print('Model not found; run src.train_model to train the model first.')


Model not found; run src.train_model to train the model first.


In [69]:
import numpy as np

feat_importances = np.array([0.2, 0.5, 0.1, 0.2])
feat_names = ['age','experience','education','salary']

for name, val in zip(feat_names, feat_importances):
    print(f"{name}: {'#'*int(val*50)} ({val:.2f})")


age: ########## (0.20)
experience: ######################### (0.50)
education: ##### (0.10)
salary: ########## (0.20)
