In [3]:
# install sklearn
!pip install scikit-learn



In [1]:
import pandas as pd

# Load the metadata file
metadata_path = 'HAM10000_metadata.csv'
metadata = pd.read_csv(metadata_path)

# Display the first few rows of the metadata
metadata.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [2]:
lesion_types = metadata['dx'].value_counts()
lesion_types

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

### Model with 8x8 images

In [4]:
# Load the 8x8 grayscale images dataset
images_8x8_path = 'hmnist_8_8_L.csv'
images_8x8 = pd.read_csv(images_8x8_path)

# Display the first few rows of the image data
images_8x8.head()

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel0055,pixel0056,pixel0057,pixel0058,pixel0059,pixel0060,pixel0061,pixel0062,pixel0063,label
0,172,182,191,183,180,181,165,164,173,192,...,159,171,181,201,192,184,183,171,157,2
1,98,149,170,193,183,162,164,100,137,175,...,135,83,159,186,185,192,181,143,58,2
2,165,164,179,172,152,163,169,151,168,174,...,169,152,171,185,189,193,176,168,151,2
3,109,159,167,166,163,159,155,96,141,168,...,101,79,146,170,167,158,154,133,45,2
4,173,202,210,194,208,248,243,205,180,142,...,230,160,199,206,208,209,205,200,186,2


In [16]:
# Mapping the numerical labels to actual lesion types
# The 'label' column in the images dataset corresponds to the 'dx' column in the metadata

# Create a mapping from the 'dx' column to the numerical labels
label_mapping = dict(zip(metadata['dx'].unique(), range(len(metadata['dx'].unique()))))

# Reverse the mapping for applying to the image dataset
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Apply the mapping to the 'label' column in the images dataset
images_8x8['lesion_type'] = images_8x8['label'].map(reverse_label_mapping)

# Display the first few rows with the new column
images_8x8[['label', 'lesion_type']].head()


Unnamed: 0,label,lesion_type
0,2,df
1,2,df
2,2,df
3,2,df
4,2,df


In [17]:
# Define a mapping for cancerous and non-cancerous lesions
cancerous_mapping = {
    'mel': 1,    # Melanoma
    'bcc': 1,    # Basal cell carcinoma
    'akiec': 1,  # Actinic keratoses and intraepithelial carcinoma / Bowen's disease
    'nv': 0,     # Nevus
    'bkl': 0,    # Benign keratosis
    'vasc': 0,   # Vascular lesions
    'df': 0      # Dermatofibroma
}

# Apply the mapping to categorize lesions as cancerous or non-cancerous
images_8x8['cancerous'] = images_8x8['lesion_type'].map(cancerous_mapping)

# Display the first few rows with the new column
images_8x8[['lesion_type', 'cancerous']].head()


Unnamed: 0,lesion_type,cancerous
0,df,0
1,df,0
2,df,0
3,df,0
4,df,0


In [28]:
from sklearn.model_selection import train_test_split

# Extracting image data (pixels) and labels (cancerous or not)
X = images_8x8.drop(['label', 'lesion_type', 'cancerous'], axis=1)
y = images_8x8['cancerous']

# Splitting the data into training, validation, and test sets (70%, 15%, 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalizing the pixel values
X_train = X_train / 255.0
X_val = X_val / 255.0
X_test = X_test / 255.0

# Map the lesion types to 'cancerous' or 'non-cancerous'
images_8x8['cancerous'] = images_8x8['lesion_type'].map(cancerous_mapping)

# Extracting image data (pixels) and labels (cancerous or not)
X_8x8 = images_8x8.drop(['label', 'lesion_type', 'cancerous'], axis=1)
y_8x8 = images_8x8['cancerous']

# Splitting the data into training, validation, and test sets (70%, 15%, 15%)
X_train_8, X_temp_8, y_train_8, y_temp_8 = train_test_split(X_8x8, y_8x8, test_size=0.3, random_state=42)
X_val_8, X_test_8, y_val_8, y_test_8 = train_test_split(X_temp_8, y_temp_8, test_size=0.5, random_state=42)

# Normalizing the pixel values
X_train_8 = X_train_8 / 255.0
X_val_8 = X_val_8 / 255.0
X_test_8 = X_test_8 / 255.0

# Checking the shape of the datasets
X_train_8.shape, X_val_8.shape, X_test_8.shape, y_train_8.shape, y_val_8.shape, y_test_8.shape

((7010, 64), (1502, 64), (1503, 64), (7010,), (1502,), (1503,))

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Defining and training the logistic regression model on the 28x28 dataset
log_reg_model_8 = LogisticRegression(max_iter=1000)
log_reg_model_8.fit(X_train_8, y_train_8)

# Making predictions on the test set
y_pred_8 = log_reg_model_8.predict(X_test_8)

# Evaluating the model
accuracy_8 = accuracy_score(y_test_8, y_pred_8)
report_8 = classification_report(y_test_8, y_pred_8)

print(accuracy_8)
print(report_8)


0.8596141051230871
              precision    recall  f1-score   support

           0       0.86      1.00      0.92      1290
           1       0.67      0.02      0.04       213

    accuracy                           0.86      1503
   macro avg       0.76      0.51      0.48      1503
weighted avg       0.83      0.86      0.80      1503



### Model with 28x28 images

In [21]:
# Load the 28x28 grayscale images dataset
images_28x28_path = 'hmnist_28_28_L.csv'
images_28x28 = pd.read_csv(images_28x28_path)

# Display the first few rows of the image data
images_28x28.head()


Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel0775,pixel0776,pixel0777,pixel0778,pixel0779,pixel0780,pixel0781,pixel0782,pixel0783,label
0,169,171,170,177,181,182,181,185,194,192,...,184,186,185,180,157,140,140,159,165,2
1,19,57,105,140,149,148,144,155,170,170,...,172,175,160,144,114,89,47,18,18,2
2,155,163,161,167,167,172,155,152,165,175,...,163,178,157,166,167,148,141,136,115,2
3,25,71,116,139,136,153,148,161,172,162,...,125,135,138,137,111,71,32,16,16,2
4,129,162,181,196,205,208,205,213,225,224,...,210,197,172,190,195,193,181,147,88,2


In [22]:
# Mapping the numerical labels to actual lesion types
# The 'label' column in the images dataset corresponds to the 'dx' column in the metadata

# Create a mapping from the 'dx' column to the numerical labels
label_mapping = dict(zip(metadata['dx'].unique(), range(len(metadata['dx'].unique()))))

# Reverse the mapping for applying to the image dataset
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Apply the mapping to the 'label' column in the images dataset
images_28x28['lesion_type'] = images_28x28['label'].map(reverse_label_mapping)

# Display the first few rows with the new column
images_28x28[['label', 'lesion_type']].head()


Unnamed: 0,label,lesion_type
0,2,df
1,2,df
2,2,df
3,2,df
4,2,df


In [24]:
# Define a mapping for cancerous and non-cancerous lesions
cancerous_mapping = {
    'mel': 1,    # Melanoma
    'bcc': 1,    # Basal cell carcinoma
    'akiec': 1,  # Actinic keratoses and intraepithelial carcinoma / Bowen's disease
    'nv': 0,     # Nevus
    'bkl': 0,    # Benign keratosis
    'vasc': 0,   # Vascular lesions
    'df': 0      # Dermatofibroma
}

# Apply the mapping to categorize lesions as cancerous or non-cancerous
images_28x28['cancerous'] = images_28x28['lesion_type'].map(cancerous_mapping)

# Display the first few rows with the new column
images_28x28[['lesion_type', 'cancerous']].head()

Unnamed: 0,lesion_type,cancerous
0,df,0
1,df,0
2,df,0
3,df,0
4,df,0


In [25]:
from sklearn.model_selection import train_test_split

# Extracting image data (pixels) and labels (cancerous or not)
X = images_28x28.drop(['label', 'lesion_type', 'cancerous'], axis=1)
y = images_28x28['cancerous']

# Splitting the data into training, validation, and test sets (70%, 15%, 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalizing the pixel values
X_train = X_train / 255.0
X_val = X_val / 255.0
X_test = X_test / 255.0

# Map the lesion types to 'cancerous' or 'non-cancerous'
images_28x28['cancerous'] = images_28x28['lesion_type'].map(cancerous_mapping)

# Extracting image data (pixels) and labels (cancerous or not)
X_28x28 = images_28x28.drop(['label', 'lesion_type', 'cancerous'], axis=1)
y_28x28 = images_28x28['cancerous']

# Splitting the data into training, validation, and test sets (70%, 15%, 15%)
X_train_28, X_temp_28, y_train_28, y_temp_28 = train_test_split(X_28x28, y_28x28, test_size=0.3, random_state=42)
X_val_28, X_test_28, y_val_28, y_test_28 = train_test_split(X_temp_28, y_temp_28, test_size=0.5, random_state=42)

# Normalizing the pixel values
X_train_28 = X_train_28 / 255.0
X_val_28 = X_val_28 / 255.0
X_test_28 = X_test_28 / 255.0

# Checking the shape of the datasets
X_train_28.shape, X_val_28.shape, X_test_28.shape, y_train_28.shape, y_val_28.shape, y_test_28.shape



((7010, 784), (1502, 784), (1503, 784), (7010,), (1502,), (1503,))

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Defining and training the logistic regression model on the 28x28 dataset
log_reg_model_28 = LogisticRegression(max_iter=1000)
log_reg_model_28.fit(X_train_28, y_train_28)

# Making predictions on the test set
y_pred_28 = log_reg_model_28.predict(X_test_28)

# Evaluating the model
accuracy_28 = accuracy_score(y_test_28, y_pred_28)
report_28 = classification_report(y_test_28, y_pred_28)

print(accuracy_28)
print(report_28)


0.8569527611443779
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      1290
           1       0.44      0.04      0.07       213

    accuracy                           0.86      1503
   macro avg       0.65      0.51      0.50      1503
weighted avg       0.80      0.86      0.80      1503

