In [2]:
import os

import numpy as np
import pandas as pd
# Load dataset.
# The CSV location can be overridden through the SKIN_CANCER_DATA_PATH
# environment variable so the notebook is not tied to a single machine.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "SKIN_CANCER_DATA_PATH",
    r"C:\Users\rajak\Downloads\skin_cancer_prediction_clinical_data.csv",
)
df = pd.read_csv(DATA_PATH)

# Numerical features for NumPy analysis.
features = [
    "Mole_Count",
    "Diameter_mm",
    "BMI",
]

# Feature matrix: one row per record in `df`, one column per entry in `features`.
X = df[features].values

print("Feature matrix shape:", X.shape)


Feature matrix shape: (3000, 3)


In [3]:
# Indicator columns (described in this notebook as binary) that are added
# together to form the severity index score.
severity_features = [
    "Itching", "Bleeding", "Asymmetry",
    "Border_Irregularity", "Color_Variation", "Evolving",
    "Pain", "Scaliness", "Oozing",
]

# Row-wise total of the indicator columns: one score per row of `df`,
# stored as a new column on the frame.
indicator_matrix = df[severity_features].values
df["Severity_Indicator_Score"] = indicator_matrix.sum(axis=1)

# Quick check: show the score beside the patient identifier.
print("Severity Indicator Score (first 5 patients):")
preview_columns = ["Patient_ID", "Severity_Indicator_Score"]
print(df[preview_columns].head())


Severity Indicator Score (first 5 patients):
  Patient_ID  Severity_Indicator_Score
0  PID100000                         1
1  PID100001                         1
2  PID100002                         1
3  PID100003                         0
4  PID100004                         0


In [4]:

# Numerical columns for the NumPy analysis, now including the derived
# Severity_Indicator_Score column.
features = ["Mole_Count", "Diameter_mm", "BMI", "Severity_Indicator_Score"]

# Feature matrix: one row per record in `df`, one column per entry in `features`.
X = df.loc[:, features].values

print("Feature matrix shape:", X.shape)

Feature matrix shape: (3000, 4)


In [5]:
# Column-wise summary statistics of the feature matrix.
# The standard deviation uses NumPy's default ddof (population form).
mean_values = X.mean(axis=0)
std_values = X.std(axis=0)
min_values = X.min(axis=0)
max_values = X.max(axis=0)

# NOTE(review): the recorded output of this cell shows a Diameter_mm minimum
# of -0.97; a negative value in a millimetre column looks like a data-quality
# issue -- confirm before relying on these statistics.
print("\nBasic Numerical Statistics:\n")
stat_rows = (
    ("  Mean = ", mean_values),
    ("  Std  = ", std_values),
    ("  Min  = ", min_values),
    ("  Max  = ", max_values),
)
for col, feature in enumerate(features):
    print(f"{feature}:")
    for label, values in stat_rows:
        print(f"{label}{values[col]}")



Basic Numerical Statistics:

Mole_Count:
  Mean = 14.981333333333334
  Std  = 3.923899194537098
  Min  = 3.0
  Max  = 32.0
Diameter_mm:
  Mean = 4.99143
  Std  = 1.9600055072456641
  Min  = -0.97
  Max  = 11.67
BMI:
  Mean = 24.855293333333336
  Std  = 4.062071250887764
  Min  = 10.87
  Max  = 38.78
Severity_Indicator_Score:
  Mean = 1.3966666666666667
  Std  = 1.077646612866306
  Min  = 0.0
  Max  = 6.0


In [7]:
# Distributional analysis: 25th / 50th / 75th percentile of every feature column.
# `percentiles` has one row per requested level and one column per feature.
quartile_levels = [25, 50, 75]
percentiles = np.percentile(X, quartile_levels, axis=0)

print("\nPercentile Analysis:\n")
quartile_labels = (
    "  25th percentile = ",
    "  Median (50th)   = ",
    "  75th percentile = ",
)
for col, feature in enumerate(features):
    print(f"{feature}:")
    for row, label in enumerate(quartile_labels):
        print(f"{label}{percentiles[row, col]}")



Percentile Analysis:

Mole_Count:
  25th percentile = 12.0
  Median (50th)   = 15.0
  75th percentile = 18.0
Diameter_mm:
  25th percentile = 3.6275
  Median (50th)   = 4.96
  75th percentile = 6.3125
BMI:
  25th percentile = 22.185000000000002
  Median (50th)   = 24.94
  75th percentile = 27.512500000000003
Severity_Indicator_Score:
  25th percentile = 1.0
  Median (50th)   = 1.0
  75th percentile = 2.0


In [10]:
# Clinical interpretability: bucket records by their severity score.
severity = df["Severity_Indicator_Score"].values

# Score bands: low is <= 1, moderate is > 1 and <= 3, high is > 3.
low_risk = severity <= 1
moderate_risk = (severity > 1) & (severity <= 3)
high_risk = severity > 3

print("\nSeverity-Based Risk Stratification:")
risk_groups = (
    ("Low-risk patients:", low_risk),
    ("Moderate-risk patients:", moderate_risk),
    ("High-risk patients:", high_risk),
)
for label, mask in risk_groups:
    # Summing a boolean mask counts its True entries.
    print(label, mask.sum())



Severity-Based Risk Stratification:
Low-risk patients: 1741
Moderate-risk patients: 1159
High-risk patients: 100


In [11]:
# Variability and dispersion measures, computed per feature column.
variance = X.var(axis=0)

# Range is max - min of each column.
feature_range = np.ptp(X, axis=0)

# Coefficient of variation: CV = std / mean.
std_dev = X.std(axis=0)
mean_val = X.mean(axis=0)
cv = np.divide(std_dev, mean_val)

print("Variability & Dispersion Analysis:\n")
# (label, values, trailer): the CV line carries an extra newline so that a
# blank line separates consecutive features in the printed report.
dispersion_rows = (
    ("  Variance = ", variance, ""),
    ("  Range    = ", feature_range, ""),
    ("  CV       = ", cv, "\n"),
)
for col, feature in enumerate(features):
    print(f"{feature}:")
    for label, values, trailer in dispersion_rows:
        print(f"{label}{values[col]}{trailer}")


Variability & Dispersion Analysis:

Mole_Count:
  Variance = 15.396984888888888
  Range    = 29.0
  CV       = 0.26191922355845704

Diameter_mm:
  Variance = 3.8416215884333336
  Range    = 12.64
  CV       = 0.3926741449335489

BMI:
  Variance = 16.50042284728889
  Range    = 27.910000000000004
  CV       = 0.16342881962451583

Severity_Indicator_Score:
  Variance = 1.1613222222222221
  Range    = 6.0
  CV       = 0.7715846870164482



In [12]:
# Symptom-related indicator columns (described in this notebook as binary).
symptom_features = [
    "Itching", "Bleeding", "Asymmetry",
    "Border_Irregularity", "Color_Variation", "Evolving",
    "Pain", "Scaliness", "Oozing",
]

# Matrix with one row per record in `df` and one column per symptom.
symptom_array = df[symptom_features].values

# Column totals: how often each symptom is flagged.
symptom_frequency = symptom_array.sum(axis=0)

# Percentage relative to the number of rows in `df` (not to the total number
# of flagged symptoms), so the values across symptoms need not add up to 100.
row_count = symptom_array.shape[0]
symptom_percentage = (symptom_frequency / row_count) * 100

print("Feature Contribution to Severity:\n")
for pos, feature in enumerate(symptom_features):
    freq = symptom_frequency[pos]
    perc = symptom_percentage[pos]
    print(f"{feature}:")
    print(f"  Occurrence Count = {freq}")
    print(f"  Percentage Contribution = {perc:.2f}%\n")


Feature Contribution to Severity:

Itching:
  Occurrence Count = 450
  Percentage Contribution = 15.00%

Bleeding:
  Occurrence Count = 282
  Percentage Contribution = 9.40%

Asymmetry:
  Occurrence Count = 591
  Percentage Contribution = 19.70%

Border_Irregularity:
  Occurrence Count = 607
  Percentage Contribution = 20.23%

Color_Variation:
  Occurrence Count = 768
  Percentage Contribution = 25.60%

Evolving:
  Occurrence Count = 437
  Percentage Contribution = 14.57%

Pain:
  Occurrence Count = 293
  Percentage Contribution = 9.77%

Scaliness:
  Occurrence Count = 618
  Percentage Contribution = 20.60%

Oozing:
  Occurrence Count = 144
  Percentage Contribution = 4.80%



In [5]:
# Mean of Severity_Indicator_Score and Diameter_mm within each
# UV_Exposure_Level group.
uv_metric_columns = ["Severity_Indicator_Score", "Diameter_mm"]
uv_groups = df.groupby("UV_Exposure_Level")
uv_group_analysis = uv_groups[uv_metric_columns].mean()

print("Group-wise analysis by UV Exposure Level:\n")
print(uv_group_analysis)

Group-wise analysis by UV Exposure Level:

                   Severity_Indicator_Score  Diameter_mm
UV_Exposure_Level                                       
High                               1.331318     4.952394
Low                                1.430270     4.936843
Moderate                           1.415064     5.057756


In [6]:
# Mean of Severity_Indicator_Score and Diameter_mm within each Location group.
location_metric_columns = ["Severity_Indicator_Score", "Diameter_mm"]
location_group_analysis = df.groupby("Location")[location_metric_columns].mean()

# Mean of Severity_Indicator_Score within each Immune_Status group
# (double brackets keep the result as a one-column DataFrame).
immune_metric_columns = ["Severity_Indicator_Score"]
Immune_Status_group_analysis = df.groupby("Immune_Status")[immune_metric_columns].mean()

group_reports = (
    ("Group-wise analysis by location Level:\n", location_group_analysis),
    ("Group-wise analysis by Immune_Status:\n", Immune_Status_group_analysis),
)
for heading, table in group_reports:
    print(heading)
    print(table)

Group-wise analysis by location Level:

          Severity_Indicator_Score  Diameter_mm
Location                                       
Arms                      1.339960     4.918946
Back                      1.354049     5.061299
Chest                     1.393305     5.047155
Face                      1.379958     4.803653
Legs                      1.449393     5.013745
Scalp                     1.464078     5.091709
Group-wise analysis by Immune_Status:

               Severity_Indicator_Score
Immune_Status                          
Normal                         1.402208
Suppressed                     1.366379
