In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pydicom as dicom
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the training metadata and print its schema exactly as it arrives from the CSV.
df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
df.info()

# Shorten the anatomical-site column name and discard the `target` column;
# later cells use `benign_malignant` as the label instead.
df = (
    df
    .rename(columns={'anatom_site_general_challenge': 'site'})
    .drop(columns=["target"])
)
print(df)


In [3]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()


In [4]:

# Impute missing metadata: the mean for the numeric age, the most frequent
# value for the two categorical columns.
#
# Each result is assigned back to its column. Calling
# `df[col].fillna(..., inplace=True)` mutates an intermediate Series; recent
# pandas versions warn about this chained in-place pattern and, with
# copy-on-write enabled, `df` itself is left unchanged.
df["age_approx"] = df["age_approx"].fillna(df["age_approx"].mean())
df["sex"] = df["sex"].fillna(df["sex"].mode()[0])
df["site"] = df["site"].fillna(df["site"].mode()[0])

# Re-count missing values to confirm the imputed columns are now complete.
df.isnull().sum()


**Data Visualization and Analysis**



In [5]:


# Collect the sex of every malignant case and plot the count per sex.
malignant_mask = df["benign_malignant"] == "malignant"
sex_values = df.loc[malignant_mask, "sex"].tolist()

se = ["male", 'female']
val = [sex_values.count(label) for label in se]

plt.bar(se, val)
plt.xlabel("sex")
plt.ylabel("Count")
plt.show()


# Observation: malignant cases are more frequent among males than females.

In [6]:
# Distribution of approximate age among malignant cases only.
malignant_rows = df[df["benign_malignant"] == "malignant"]
ages = malignant_rows["age_approx"].tolist()
sns.displot(ages)

# Observation: most malignant cases fall between ages 50 and 80.

In [7]:
# Anatomical site of every malignant case.
sites = df.loc[df["benign_malignant"] == "malignant", "site"].tolist()

# Pass the values through the `x` keyword: in current seaborn releases the
# first positional parameter of `countplot` is `data`, so a bare positional
# list is not interpreted as the variable to count.
sns.countplot(x=sites)

# Observation: the torso is the most common site among malignant cases.

In [8]:

# The patient identifier is not used as a feature, so remove the column.
df = df.drop(columns=["patient_id"])

ENCODING OF CATEGORICAL FIELDS


In [9]:

# Encode the two binary text columns as integers (male/benign -> 0,
# female/malignant -> 1).
#
# The encoded Series is assigned back to the column. Calling
# `df[col].replace(..., inplace=True)` mutates an intermediate Series; recent
# pandas versions warn about this chained in-place pattern and, with
# copy-on-write enabled, `df` itself is left unchanged. `map` also yields an
# integer column directly; any value outside the mapping would become NaN.
df["sex"] = df["sex"].map({"male": 0, "female": 1})
df["benign_malignant"] = df["benign_malignant"].map({"benign": 0, "malignant": 1})

# One-hot encode the multi-valued columns; `drop_first=True` omits the first
# category of each so the indicator columns are not redundant.
df = pd.get_dummies(df, columns=["site", "diagnosis"], drop_first=True)
print(df)

CORRELATIONS BETWEEN COLUMNS AND BENIGN_MALIGNANT COLUMN

In [10]:
# Pairwise correlations between the numeric and boolean columns.
# Text columns such as `image_name` are excluded explicitly: since pandas 2.0
# `DataFrame.corr()` no longer drops them silently and raises on string data.
# The matrix is computed once and reused for both the plot and the printout.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(corr_matrix)
print(corr_matrix)

In [11]:
# Correlation of every numeric/boolean column with the label, showing the 16
# largest values in ascending order. Text columns such as `image_name` are
# excluded first because `DataFrame.corr()` raises on string data in
# pandas >= 2.0.
label_corr = (
    df.select_dtypes(include=["number", "bool"])
    .corr()[["benign_malignant"]]
    .sort_values('benign_malignant')
    .tail(16)
)
sns.heatmap(label_corr, annot=True)

# Observation: benign_malignant is most correlated with sex,
# site_upper extremity and age_approx.

In [12]:
# Preview one training image: read the DICOM file named in row 91 and render
# its pixel data.
sample_name = df['image_name'][91]
image = '/kaggle/input/siim-isic-melanoma-classification/train/' + sample_name + '.dcm'
ds = dicom.dcmread(image)

plt.imshow(ds.pixel_array)

**TRAINING BASED ON IMAGES**

In [13]:
import tensorflow as tf

TRAIN_IMAGE_DIR = '/kaggle/input/siim-isic-melanoma-classification/train/'


def load_flat_pixels(image_names):
    """Read each named DICOM file and return its pixel data flattened to 1-D.

    Parameters
    ----------
    image_names : iterable of str
        Image identifiers without the directory or the ``.dcm`` extension.

    Returns
    -------
    list
        One flattened ``pixel_array`` per image, in input order.
    """
    return [
        dicom.dcmread(TRAIN_IMAGE_DIR + name + '.dcm').pixel_array.flatten()
        for name in image_names
    ]


def to_fixed_length(flat_images):
    """Force every flattened image to exactly 256 ``int32`` values.

    Shorter sequences are left-padded with zeros. Longer ones are truncated
    from the front (``truncating="pre"``), so only their trailing 256 values
    are kept.

    NOTE(review): for any image with more than 256 values this keeps only a
    small tail of the pixel data - confirm this is the intended feature set.
    """
    return tf.keras.preprocessing.sequence.pad_sequences(
        flat_images,
        maxlen=256,
        dtype="int32",
        padding="pre",
        truncating="pre",
        value=0,
    )


# Training images: the first 270 rows of the metadata table.
images = to_fixed_length(load_flat_pixels(df["image_name"][:270]))

# Evaluation images: the last 100 rows of the metadata table.
test = df.tail(100)
test_images = to_fixed_length(load_flat_pixels(test['image_name']))



In [14]:
from sklearn.linear_model import LogisticRegression

# Pixel-based logistic regression: fit on the first 270 images and evaluate
# on the last 100.
X, y = images, df["benign_malignant"][:270]
X_test, y_test = test_images, df["benign_malignant"].tail(100)

classifier_lr = LogisticRegression(solver='liblinear')
classifier_lr.fit(X, y)
y_pred_lr = classifier_lr.predict(X_test)

# Accuracy followed by the confusion matrix on the evaluation rows.
for metric in (accuracy_score, confusion_matrix):
    print(metric(y_test, y_pred_lr))

In [15]:
from sklearn import svm

# Pixel-based support vector classifier with default settings: fit on the
# first 270 images and evaluate on the last 100.
X, X_test = images, test_images
labels = df["benign_malignant"]
y, y_test = labels[:270], labels.tail(100)

classifier_svm = svm.SVC()
classifier_svm.fit(X, y)
y_pred_svm = classifier_svm.predict(X_test)

# Accuracy followed by the confusion matrix on the evaluation rows.
print(accuracy_score(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Pixel-based decision tree: fit on the first 270 images and evaluate on the
# last 100.
X = images
y = df["benign_malignant"][:270]

# Tree construction is randomised (candidate features are permuted at each
# split), so the seed is fixed to make the reported metrics reproducible
# across runs.
classifier_dt = DecisionTreeClassifier(random_state=42)
classifier_dt.fit(X, y)

X_test = test_images
y_test = df["benign_malignant"].tail(100)

y_pred_dt = classifier_dt.predict(X_test)
print(accuracy_score(y_test, y_pred_dt))
print(confusion_matrix(y_test, y_pred_dt))

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Pixel-based random forest: fit on the first 270 images and evaluate on the
# last 100.
X = images
y = df["benign_malignant"][:270]

# Bootstrap sampling and per-split feature selection are random, so the seed
# is fixed to make the reported metrics reproducible across runs.
classifier_rf = RandomForestClassifier(random_state=42)
classifier_rf.fit(X, y)

X_test = test_images
y_test = df["benign_malignant"].tail(100)
y_pred_rf = classifier_rf.predict(X_test)
print(accuracy_score(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))


**TRAINING BASED ON RELATED FEATURES**

In [18]:
# Correlation of every numeric/boolean column with the label (16 largest
# values), used to choose the tabular features. Text columns such as
# `image_name` are excluded first because `DataFrame.corr()` raises on string
# data in pandas >= 2.0.
label_corr = (
    df.select_dtypes(include=["number", "bool"])
    .corr()[["benign_malignant"]]
    .sort_values('benign_malignant')
    .tail(16)
)
sns.heatmap(label_corr, annot=True)

In [19]:
# Tabular training features for the first 270 rows: three one-hot indicator
# columns plus age divided by the largest age among those rows.
# NOTE(review): `diagnosis_melanoma` comes from the `diagnosis` column, which is
# presumably recorded together with the benign/malignant label - confirm that
# it is a legitimate model input and not label leakage.
train_rows = df.iloc[:270]
ma = train_rows["age_approx"].max()
train = [
    [melanoma, oral_genital, upper_extremity, age / ma]
    for melanoma, oral_genital, upper_extremity, age in zip(
        train_rows["diagnosis_melanoma"],
        train_rows["site_oral/genital"],
        train_rows["site_upper extremity"],
        train_rows["age_approx"],
    )
]


In [20]:
# Tabular evaluation features for the last 100 rows, using the same four
# columns as the training features. `tail(100)` replaces the hard-coded row
# numbers 33026..33126 so the cell does not depend on the exact table length.
test_rows = df.tail(100)

# Scale age by the maximum of the first 270 (training) rows. Dividing the
# evaluation ages by their own maximum would put the two feature sets on
# different scales.
ma = df["age_approx"].iloc[:270].max()

test = [
    [melanoma, oral_genital, upper_extremity, age / ma]
    for melanoma, oral_genital, upper_extremity, age in zip(
        test_rows["diagnosis_melanoma"],
        test_rows["site_oral/genital"],
        test_rows["site_upper extremity"],
        test_rows["age_approx"],
    )
]
    

In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the tabular features: fit on the first 270 rows and
# evaluate on the last 100.
X, X_test = train, test
labels = df["benign_malignant"]
y, y_test = labels[:270], labels.tail(100)

classifier_lr = LogisticRegression(solver='liblinear')
classifier_lr.fit(X, y)
y_pred_lr = classifier_lr.predict(X_test)

# Accuracy followed by the confusion matrix on the evaluation rows.
for metric in (accuracy_score, confusion_matrix):
    print(metric(y_test, y_pred_lr))

In [22]:
from sklearn import svm

# Support vector classifier on the tabular features.
X = train
# Take exactly as many labels as there are feature rows. The previous
# hard-coded `[:250]` slice did not match the 270 rows in `train`, which makes
# `fit` fail with an inconsistent-sample-count error.
y = df["benign_malignant"][:len(X)]
classifier_svm = svm.SVC()
classifier_svm.fit(X, y)

# Evaluate on the last 100 rows.
X_test = test
y_test = df["benign_malignant"].tail(100)
y_pred_svm = classifier_svm.predict(X_test)
print(accuracy_score(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the tabular features: fit on the first 270 rows and
# evaluate on the last 100.
X = train
y = df["benign_malignant"][:270]

# Tree construction is randomised (candidate features are permuted at each
# split), so the seed is fixed to make the reported metrics reproducible
# across runs.
classifier_dt = DecisionTreeClassifier(random_state=42)
classifier_dt.fit(X, y)

X_test = test
y_test = df["benign_malignant"].tail(100)

y_pred_dt = classifier_dt.predict(X_test)
print(accuracy_score(y_test, y_pred_dt))
print(confusion_matrix(y_test, y_pred_dt))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the tabular features: fit on the first 270 rows and
# evaluate on the last 100.
X = train
y = df["benign_malignant"][:270]

# Bootstrap sampling and per-split feature selection are random, so the seed
# is fixed to make the reported metrics reproducible across runs.
classifier_rf = RandomForestClassifier(random_state=42)
classifier_rf.fit(X, y)

X_test = test
y_test = df["benign_malignant"].tail(100)

# Store the forest's predictions under their own name; writing them to
# `y_pred_dt` would overwrite the decision-tree predictions.
y_pred_rf = classifier_rf.predict(X_test)
print(accuracy_score(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))


**PREDICTION OF AN IMAGE**

In [None]:
# Classify a single image from its pixel data.
# Row 91 is among the first 270 rows used for fitting, so this demonstrates the
# prediction call rather than out-of-sample performance.
image_path = '/kaggle/input/siim-isic-melanoma-classification/train/' + df['image_name'][91] +'.dcm'

# Read the image and reduce it to the same 256-value vector layout that the
# pixel-based training matrix `images` uses.
ds = dicom.dcmread(image_path)
pixels = ds.pixel_array
image_to_test = tf.keras.preprocessing.sequence.pad_sequences(
  [pixels.flatten()],
  maxlen = 256,
  dtype = "int32",
  padding = "pre",
  truncating = "pre",
  value = 0
)

# By this point `classifier_lr` has been refit on the four tabular features, so
# it rejects a 256-value pixel vector. Fit a dedicated pixel-based model on the
# 270 training images instead.
# The earlier `np.append(...)` call was removed: it returns a new array that
# was never stored, so the metadata values never reached the model.
image_classifier = LogisticRegression(solver = 'liblinear')
image_classifier.fit(images, df["benign_malignant"][:270])

# `predict` returns one label per input row; take the single label explicitly
# instead of comparing the whole array with a list.
prediction = image_classifier.predict(image_to_test)[0]

plt.imshow(pixels)
if prediction == 1:
    print('Malignant:Need immediate Treatment')
else:
    print('Benign')
