In [2]:
import pandas as pd

# Load the training and testing data
train_df = pd.read_csv("X_train_labeled.csv")
test_df = pd.read_csv("X_test_labeled.csv")

# Separate features (X) and labels (y)
X_train = train_df.drop("diagnosis", axis=1)
y_train = train_df["diagnosis"]

X_test = test_df.drop("diagnosis", axis=1)
y_test = test_df["diagnosis"]

  test_df = pd.read_csv("X_test_labeled.csv")


In [4]:
# Check columns with non-numeric values
non_numeric_cols = test_df.select_dtypes(exclude=["float", "int"]).columns
print("🧪 Non-numeric columns:", list(non_numeric_cols))

# See sample values from one of them
if len(non_numeric_cols) > 0:
    print(test_df[non_numeric_cols[0]].unique()[:10])  # See first 10 unique values

🧪 Non-numeric columns: ['diagnosis']
[nan 'pro' 'early' 'benign' 'pre']


In [7]:
print("NaNs in X_train:", X_train.isnull().sum().sum())
print("NaNs in y_train:", y_train.isnull().sum())

NaNs in X_train: 0
NaNs in y_train: 520


In [9]:
# Drop rows in X_train and y_train where y_train is NaN
nan_mask = y_train.notna()
X_train = X_train[nan_mask]
y_train = y_train[nan_mask]

In [11]:
print("NaNs in y_train after cleaning:", y_train.isnull().sum())

NaNs in y_train after cleaning: 0


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

In [16]:
print("NaNs in y_test:", y_test.isnull().sum())

NaNs in y_test: 524


In [18]:
nan_mask = y_test.notna()
X_test = X_test[nan_mask]
y_test = y_test[nan_mask]

In [20]:
print("y_test sample values:", y_test.unique())
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)

y_test sample values: ['pro' 'early' 'benign' 'pre']
y_test shape: (128,)
y_pred shape: (652,)


In [22]:
# Recalculate predictions after cleaning
y_pred = rf_model.predict(X_test)

# Now both shapes will match (128,)

In [24]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict
y_pred = rf_model.predict(X_test)

# Accuracy
print("✅ Accuracy:", accuracy_score(y_test, y_pred))

# Classification report
print("\n🧾 Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ Accuracy: 0.2578125

🧾 Classification Report:
               precision    recall  f1-score   support

      benign       0.00      0.00      0.00        22
       early       0.22      0.42      0.29        33
         pre       0.29      0.44      0.35        32
         pro       0.33      0.12      0.18        41

    accuracy                           0.26       128
   macro avg       0.21      0.25      0.20       128
weighted avg       0.24      0.26      0.22       128


📊 Confusion Matrix:
 [[ 0 13  6  3]
 [ 0 14 14  5]
 [ 0 16 14  2]
 [ 0 22 14  5]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
print(y_test.sample(5))
# Output: 
# 0    pro
# 1    early
# 2    benign

93        pro
292    benign
565       pro
341       pro
270       pro
Name: diagnosis, dtype: object


In [28]:
from sklearn.decomposition import PCA

# Retain 95% of variance with fewer components
pca = PCA(n_components=0.95, svd_solver='full')

# Apply PCA to training and testing sets
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print("Original feature count:", X_train.shape[1])
print("Reduced feature count:", X_train_pca.shape[1])

Original feature count: 50176
Reduced feature count: 1043


In [30]:
rf_model = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
rf_model.fit(X_train_pca, y_train)

In [32]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = rf_model.predict(X_test_pca)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

✅ Accuracy: 0.2578125
              precision    recall  f1-score   support

      benign       0.00      0.00      0.00        22
       early       0.26      0.55      0.35        33
         pre       0.29      0.44      0.35        32
         pro       0.11      0.02      0.04        41

    accuracy                           0.26       128
   macro avg       0.16      0.25      0.18       128
weighted avg       0.17      0.26      0.19       128

[[ 0 13  7  2]
 [ 0 18 11  4]
 [ 0 16 14  2]
 [ 0 23 17  1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
print(y_train.value_counts())

diagnosis
early     631
pre       614
pro       517
benign    322
Name: count, dtype: int64


In [36]:
!pip install imblearn


Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [38]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the PCA-transformed training set
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_pca, y_train)

# Check new class distribution
print("✅ After SMOTE:\n", y_train_balanced.value_counts())

✅ After SMOTE:
 diagnosis
pre       631
benign    631
early     631
pro       631
Name: count, dtype: int64


In [40]:
rf_model = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

In [42]:
# Train with SMOTE-balanced training data
rf_model = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',  # Still helpful even with SMOTE
    random_state=42
)

rf_model.fit(X_train_balanced, y_train_balanced)

In [44]:
y_pred = rf_model.predict(X_test_pca)

In [46]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy
print("✅ Accuracy:", accuracy_score(y_test, y_pred))

# Classification Report
print("\n🧾 Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ Accuracy: 0.234375

🧾 Classification Report:
               precision    recall  f1-score   support

      benign       0.13      0.14      0.13        22
       early       0.22      0.30      0.26        33
         pre       0.20      0.19      0.19        32
         pro       0.37      0.27      0.31        41

    accuracy                           0.23       128
   macro avg       0.23      0.22      0.22       128
weighted avg       0.25      0.23      0.24       128


📊 Confusion Matrix:
 [[ 3 12  3  4]
 [ 6 10 10  7]
 [ 6 12  6  8]
 [ 8 11 11 11]]


In [48]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']
}

# Set up GridSearch
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Run GridSearch on balanced PCA-reduced training data
grid_search.fit(X_train_balanced, y_train_balanced)

# Get best model and params
best_model = grid_search.best_estimator_
print("✅ Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
✅ Best Parameters: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [50]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict using the tuned model
y_pred = best_model.predict(X_test_pca)

# Evaluate
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ Accuracy: 0.1875

📄 Classification Report:
               precision    recall  f1-score   support

      benign       0.04      0.05      0.04        22
       early       0.16      0.21      0.18        33
         pre       0.22      0.25      0.23        32
         pro       0.33      0.20      0.25        41

    accuracy                           0.19       128
   macro avg       0.19      0.18      0.18       128
weighted avg       0.21      0.19      0.19       128


📊 Confusion Matrix:
 [[ 1 10  8  3]
 [ 7  7 10  9]
 [ 6 14  8  4]
 [ 9 13 11  8]]


In [52]:
rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=20,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200,
    random_state=42
)

rf_model.fit(X_train_pca, y_train)

In [54]:
y_pred = rf_model.predict(X_test_pca)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))

✅ Accuracy: 0.2890625

📄 Classification Report:
               precision    recall  f1-score   support

      benign       0.00      0.00      0.00        22
       early       0.25      0.55      0.34        33
         pre       0.32      0.44      0.37        32
         pro       0.42      0.12      0.19        41

    accuracy                           0.29       128
   macro avg       0.25      0.28      0.22       128
weighted avg       0.28      0.29      0.24       128



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
