<a href="https://colab.research.google.com/github/qasimzee/isic2024/blob/main/isic_2024_challenge_table_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!ls "/content/drive/MyDrive/kaggle/isic-2024-data"

cnn.pkl			 test-image.hdf5	   train-image.hdf5
random_forest_model.pkl  test-metadata.csv	   train-metadata.csv
sample_submission.csv	 test-metadata.csv.gsheet  train-metadata-v1.csv


In [5]:
METADATA_FILE = "/content/drive/MyDrive/kaggle/isic-2024-data/train-metadata.csv"
metadata_df = pd.read_csv(METADATA_FILE)


  metadata_df = pd.read_csv(METADATA_FILE)


In [6]:
train_df, test_df = train_test_split(metadata_df, test_size=0.2, random_state=42)

train_df


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
18085,ISIC_0517312,0,IP_5299455,60.0,female,posterior torso,2.86,TBP tile: close-up,3D: XP,20.465960,...,,Benign,Benign,,,,,,,99.739540
171181,ISIC_4318651,0,IP_3923321,70.0,female,anterior torso,2.61,TBP tile: close-up,3D: white,19.067292,...,,Benign,Benign,,,,,,,99.990857
284095,ISIC_7110993,0,IP_1544679,55.0,male,posterior torso,2.79,TBP tile: close-up,3D: XP,24.721678,...,,Benign,Benign,,,,,,,100.000000
354900,ISIC_8861121,0,IP_1116526,80.0,male,lower extremity,6.68,TBP tile: close-up,3D: XP,16.342123,...,,Benign,Benign,,,,,,,95.420188
77679,ISIC_1994958,0,IP_5067302,70.0,male,posterior torso,3.63,TBP tile: close-up,3D: XP,20.388077,...,,Benign,Benign,,,,,,,99.998164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,ISIC_6502755,0,IP_7965642,60.0,male,upper extremity,3.94,TBP tile: close-up,3D: XP,14.689290,...,,Benign,Benign,,,,,,,94.829340
365838,ISIC_9129069,0,IP_2889176,45.0,female,posterior torso,2.74,TBP tile: close-up,3D: XP,24.387236,...,,Benign,Benign,,,,,,,100.000000
131932,ISIC_3342513,0,IP_9577633,50.0,male,lower extremity,3.26,TBP tile: close-up,3D: XP,18.711190,...,,Benign,Benign,,,,,,,99.980790
146867,ISIC_3720179,0,IP_3751225,75.0,male,anterior torso,2.60,TBP tile: close-up,3D: XP,16.110687,...,,Benign,Benign,,,,,,,99.995995


In [7]:

X = train_df.drop(columns=["isic_id", "target", "patient_id", "lesion_id"])
Y = train_df["target"]

# Replace NaN values
X['sex'] = X['sex'].replace(pd.NA, 'unknown')
X['anatom_site_general'] = X['anatom_site_general'].replace(pd.NA, 'unknown')
# X['lesion_id'] = X['lesion_id'].replace(pd.NA, 'unknown')
X['iddx_2'] = X['iddx_2'].replace(pd.NA, 'unknown')
X['iddx_3'] = X['iddx_3'].replace(pd.NA, 'unknown')
X['iddx_4'] = X['iddx_4'].replace(pd.NA, 'unknown')
X['iddx_5'] = X['iddx_5'].replace(pd.NA, 'unknown')
X['mel_mitotic_index'] = X['mel_mitotic_index'].replace(pd.NA, 'unknown')


X.fillna(0, inplace=True)

In [8]:
# Identify columns with mixed types

for column in X.columns:
  if X[column].apply(type).nunique() > 1:
    print(f"{column}: {X[column].apply(type).unique()}")


In [9]:
class SafeLabelEncoder(LabelEncoder):
    def transform(self, y):
        # Map unseen labels to -1 or a specific default value
        unknown_label = -1
        transformed = np.array([self.classes_.tolist().index(x) if x in self.classes_ else unknown_label for x in y])
        return transformed

label_encoders = {}

for column in X.select_dtypes(include=["object"]):
    le = SafeLabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LogisticRegression()
model.fit(X_scaled, Y)


In [10]:
test_df['sex'] = test_df['sex'].replace(pd.NA, 'unknown')
test_df['anatom_site_general'] = test_df['anatom_site_general'].replace(pd.NA, 'unknown')
test_df['lesion_id'] = test_df['lesion_id'].replace(pd.NA, 'unknown')
test_df['iddx_2'] = test_df['iddx_2'].replace(pd.NA, 'unknown')
test_df['iddx_3'] = test_df['iddx_3'].replace(pd.NA, 'unknown')
test_df['iddx_4'] = test_df['iddx_4'].replace(pd.NA, 'unknown')
test_df['mel_mitotic_index'] = test_df['mel_mitotic_index'].replace(pd.NA, 'unknown')

test_df.fillna(0, inplace=True)
# Identify columns with mixed types
for column in test_df.columns:
  if test_df[column].apply(type).nunique() > 1:
    print(f"{column}: {test_df[column].unique()}")


test_isic_ids = test_df['isic_id']
test_df = test_df.drop(columns=["isic_id", "target", "patient_id", "lesion_id"])

In [11]:

# Preprocess the test data just like the training data
for column in test_df.select_dtypes(include=['object']).columns:
    if column not in ["isic_id", "target", "patient_id"]:  # Make sure we don't encode 'isic_id'
        test_df[column] = label_encoders[column].transform(test_df[column])

# Drop 'isic_id' before scaling
test_df_scaled = scaler.transform(test_df)

# Generate predictions
test_predictions = model.predict_proba(test_df_scaled)[:, 1]

# Create the submission DataFrame, using the saved 'isic_id'
submission_df = pd.DataFrame({
    'isic_id': test_isic_ids,
    'target': test_predictions
})

# Save to CSV
submission_df #.to_csv('submission.csv', index=False)




Unnamed: 0,isic_id,target
278442,ISIC_6973879,4.680929e-09
215021,ISIC_5407194,5.582545e-09
209685,ISIC_5273739,7.624672e-09
29648,ISIC_0802250,7.834839e-09
323386,ISIC_8084953,8.410781e-09
...,...,...
2741,ISIC_0139843,6.350302e-09
239797,ISIC_6027282,7.948451e-09
243397,ISIC_6115419,7.374003e-09
86684,ISIC_2218691,5.944593e-09


In [12]:
import pickle
from sklearn.linear_model import LogisticRegression

# Save the model to a pickle file
with open('/content/drive/MyDrive/kaggle/isic-2024-data/logistic_regression.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved successfully!")


Model saved successfully!
