In [None]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import pydicom as dicom
import pandas as pd
import numpy as np
import tensorflow as tf
from statistics import mean
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import model_selection


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Single seed passed as random_state to the sampling calls in this notebook.
SEED = 42

In [None]:
# Read train.csv from the SIIM-ISIC melanoma dataset and preview the first rows.
df = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
df.head()

In [None]:
# Balance the two classes: keep every target == 1 row and draw an equally
# sized, seeded random sample from the target == 0 rows.
train_melanoma = df.loc[df['target'].eq(1)]
train_benign = (
    df.loc[df['target'].eq(0)]
    .sample(n=len(train_melanoma), random_state=SEED)
)

In [None]:
# (rows, columns) of the target == 1 subset; its row count is the size of the benign sample.
train_melanoma.shape

In [None]:
# Rebind df to the balanced subset; ignore_index gives the rows a fresh 0..n-1 index.
df = pd.concat([train_melanoma, train_benign], ignore_index=True)

In [None]:
def pull_images(image_names, image_dir='../input/siim-isic-melanoma-classification/train/', maxlen=720):
    """Read DICOM files and return one fixed-length pixel vector per image.

    Each image's pixel array is flattened and reduced to its last ``maxlen``
    values; shorter arrays are left-padded with zeros. This is the same
    "pre" padding / "pre" truncation the function previously obtained from
    Keras' ``pad_sequences``. It is done with NumPy so that only ``maxlen``
    values per image stay in memory instead of every full-resolution array.

    Args:
        image_names: iterable of image identifiers (file names without ".dcm").
        image_dir: directory prefix, including its trailing separator, that is
            concatenated with each name.
        maxlen: number of values kept per image; must be >= 1.

    Returns:
        int32 ndarray of shape (number of images, maxlen).

    Raises:
        ValueError: if ``maxlen`` is smaller than 1.
    """
    if maxlen < 1:
        raise ValueError("maxlen must be a positive integer")

    tails = []
    for image_name in image_names:
        ds = dicom.dcmread(image_dir + image_name + '.dcm')
        flat = np.asarray(ds.pixel_array).ravel()
        # np.array copies the slice, so the full image can be released as soon
        # as the next file is read.
        tails.append(np.array(flat[-maxlen:], dtype="int32"))

    results = np.zeros((len(tails), maxlen), dtype="int32")
    for row, tail in enumerate(tails):
        if len(tail):
            # Right-align the kept values; the zeros on the left are the padding.
            results[row, -len(tail):] = tail
    return results
        

In [None]:
# Shorten two verbose column names and drop the patient identifier.
# errors="ignore" keeps the cell re-runnable: once patient_id is gone, a second
# execution would otherwise raise KeyError (rename already tolerates re-runs).
df = df.rename(columns={"anatom_site_general_challenge": "site", "age_approx": "age"})
df = df.drop(columns=["patient_id"], errors="ignore")
df.head()

In [None]:
# Missing-value count for each column.
df.isnull().sum()

In [None]:
# Keep only rows that have no missing value in any column.
df = df.dropna(axis=0, how="any")

In [None]:
# Preview the frame now that rows with missing values are gone.
df.head()

In [None]:
# Row counts per sex, with one bar per target value.
sns.countplot(x = 'sex', data = df, hue = 'target')

In [None]:
# Age distribution over all rows currently in df.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot are its successors) — confirm against the installed version.
sns.distplot(df['age'])

In [None]:
# Rows whose target is 1.
cancer_patients = df[df['target'] == 1]

In [None]:
# One-column frame holding only the ages of the target == 1 rows.
cancer_patients_dist = cancer_patients[["age"]]

In [None]:
# Age distribution restricted to the target == 1 rows.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot are its successors) — confirm against the installed version.
sns.distplot(cancer_patients_dist)

In [None]:
# Horizontal count of target == 1 rows per age value, drawn in the right-hand
# slot of a 1x2 grid (this cell never fills the left-hand slot).
plt.subplot(1,2,2)
sns.countplot(y=cancer_patients_dist['age'])

In [None]:
# Preview df before the categorical columns are one-hot encoded.
df.head()

In [None]:
# One-hot encode `site`; no level is dropped, so every site value gets its own column.
dummies = pd.get_dummies(df, columns=['site'])
dummies.head()

In [None]:
# One-hot encode `sex` and `diagnosis` in a single pass, dropping the first
# level of each so that every column keeps one implicit baseline category.
dummies = pd.get_dummies(
    dummies,
    columns=['sex', 'diagnosis'],
    drop_first=True,
)

dummies

In [None]:
def pull_details(X):
    """Return the patient-detail feature columns of ``X`` in a fixed order.

    The result always has exactly the six columns listed below; any of them
    that ``X`` lacks comes back filled with NaN, and every other column of
    ``X`` is left out.
    """
    feature_columns = [
        'sex_male',
        'age',
        'site_lower extremity',
        'site_torso',
        'site_upper extremity',
        'site_head/neck',
    ]
    return X.reindex(columns=feature_columns)

In [None]:
# Feature matrix: the fixed column set selected by pull_details.
X = pull_details(dummies)
X

In [None]:
# Labels: the target column of the same frame X was built from.
y = dummies['target']
y

In [None]:

# Shared 4-fold splitter. Seeding the shuffle keeps fold membership identical
# across kernel restarts, so scores are comparable from run to run.
kf = KFold(n_splits=4, shuffle=True, random_state=SEED)


def kfold_test(X, y, model):
    """Return the mean accuracy of ``model`` over the folds produced by ``kf``.

    Args:
        X: feature table indexed positionally with ``.iloc`` (e.g. a DataFrame).
        y: labels indexed positionally with ``.iloc`` (e.g. a Series).
        model: estimator exposing ``fit`` and ``predict``. It is refitted on
            every fold, so after the call it holds the fit from the last fold.

    Returns:
        The arithmetic mean of the per-fold accuracy scores.
    """
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model = model.fit(X_train, y_train)
        scores.append(accuracy_score(y_test, model.predict(X_test)))
    return mean(scores)
        

In [None]:
# Mean k-fold accuracy of a default-configured logistic regression on the tabular features.
details_lr = kfold_test(X, y, LogisticRegression())

In [None]:
# Mean k-fold accuracy of a default-configured support vector classifier on the tabular features.
details_svc = kfold_test(X, y, svm.SVC())

In [None]:
# A fixed random_state removes the estimator's own run-to-run randomness from this score.
details_dtc = kfold_test(X, y, DecisionTreeClassifier(random_state=SEED))

In [None]:
# A fixed random_state removes the forest's bootstrap/feature-sampling randomness from this score.
details_rfc = kfold_test(X, y, RandomForestClassifier(random_state=SEED))

In [None]:


# A fixed random_state removes the estimator's own run-to-run randomness from this score.
details_gbc = kfold_test(X, y, GradientBoostingClassifier(random_state=SEED))

In [None]:

# A fixed random_state removes the estimator's own run-to-run randomness from this score.
details_abc = kfold_test(X, y, AdaBoostClassifier(random_state=SEED))

In [None]:
# Model labels and their mean k-fold accuracies on the tabular features, in matching order.
x_l = ['Logistic Regression', 'Support Vector Machine', 'Decison Tree', 'Random Forest', 'Gradient Boosting', 'Adaptive Boosting']
y_l = [details_lr, details_svc, details_dtc, details_rfc, details_gbc, details_abc]

print(list(zip(x_l, y_l)))
# seaborn >= 0.12 rejects positional x/y, so pass them by keyword:
# scores on the horizontal axis, model names on the vertical axis.
sns.barplot(x=y_l, y=x_l, palette="rocket")
plt.xlim([0.5, 1])

In [None]:
# Persist the comparison results and the feature column names with pickle.
# The files are binary pickles even though they carry a .txt suffix.
for file, payload in (
    ("details_comparison.txt", [x_l, y_l]),
    ("columns.txt", X.columns),
):
    with open(file, "wb") as f:
        pickle.dump(payload, f)

In [None]:

# Preview the tabular feature matrix.
X.head()

In [None]:
# Final patient-details model, fitted on all tabular rows. The fixed
# random_state makes the persisted model reproducible across runs.
patient_details_model = AdaBoostClassifier(random_state=SEED)
patient_details_model.fit(X, y)

In [None]:
# Draw 100 seeded row labels per class, then select those rows from df.
# Index.union returns the combined labels of both draws.
s0 = df.loc[df.target.eq(0), 'target'].sample(n=100, random_state=SEED).index
s1 = df.loc[df.target.eq(1), 'target'].sample(n=100, random_state=SEED).index

sampled_labels = s0.union(s1)
image_data_sampled = df.loc[sampled_labels]

In [None]:
# Display the sampled rows.
image_data_sampled

In [None]:
# Build the pixel-vector matrix, one row per sampled image name.
images = pull_images(image_data_sampled['image_name'])

In [None]:
def kfold_test_image(X, y, model):
    """Return the mean accuracy of ``model`` over the folds produced by ``kf``.

    Same procedure as the tabular evaluation, except that ``X`` and ``y`` are
    indexed directly with the fold's integer index arrays (array-style
    indexing rather than ``.iloc``). ``model`` is refitted on every fold.
    """
    fold_scores = []
    for fit_idx, eval_idx in kf.split(X):
        X_fit, X_eval = X[fit_idx], X[eval_idx]
        y_fit, y_eval = y[fit_idx], y[eval_idx]
        model = model.fit(X_fit, y_fit)
        predictions = model.predict(X_eval)
        fold_scores.append(accuracy_score(y_eval, predictions))
    return mean(fold_scores)

In [None]:
# X and y are rebound here from the tabular features/labels to the image
# vectors and their targets; every cell below this point sees the image data.
X = images
y = np.array(image_data_sampled['target'])




# Mean k-fold accuracy of logistic regression on the image vectors; max_iter is
# set to 4000 here, unlike the default-configured call used for the tabular features.
image_lr = kfold_test_image(X, y, LogisticRegression(max_iter=4000))


In [None]:
# Mean k-fold accuracy of a default-configured support vector classifier on the image vectors.
image_svc = kfold_test_image(X, y, svm.SVC())

In [None]:
# A fixed random_state removes the estimator's own run-to-run randomness from this score.
image_dtc = kfold_test_image(X, y, DecisionTreeClassifier(random_state=SEED))

In [None]:
# A fixed random_state removes the forest's bootstrap/feature-sampling randomness from this score.
image_rfc = kfold_test_image(X, y, RandomForestClassifier(random_state=SEED))

In [None]:
# A fixed random_state removes the estimator's own run-to-run randomness from this score.
image_abc = kfold_test_image(X, y, AdaBoostClassifier(random_state=SEED))

In [None]:
# A fixed random_state removes the estimator's own run-to-run randomness from this score.
image_gbc = kfold_test_image(X, y, GradientBoostingClassifier(random_state=SEED))

In [None]:
# Model labels and their mean k-fold accuracies on the image vectors, in matching order.
x_l = ['Logistic Regression', 'Support Vector Machine', 'Decison Tree', 'Random Forest', 'Adaptive Boosting', 'Gradient Boosting']
y_l = [image_lr, image_svc, image_dtc, image_rfc, image_abc, image_gbc]

# seaborn >= 0.12 rejects positional x/y, so pass them by keyword:
# scores on the horizontal axis, model names on the vertical axis.
sns.barplot(x=y_l, y=x_l, palette="rocket")
plt.xlim([0.5, 1])

In [None]:
# Persist the image-model labels and scores with pickle. The file is a binary
# pickle even though it carries a .txt suffix.
file = "image_comparison.txt"
with open(file, "wb") as f:
    pickle.dump([x_l, y_l], f)

In [None]:
# Final image model, fitted on all sampled image vectors. The fixed
# random_state makes the persisted model reproducible across runs.
patient_image_model = RandomForestClassifier(random_state=SEED)
patient_image_model.fit(X,y)

In [None]:
# Serialize both fitted models to disk with pickle, one file per model.
file1 = "model_patient_details.pkl"
file2 = "model_patient_image.pkl"

for model_path, fitted_model in (
    (file1, patient_details_model),
    (file2, patient_image_model),
):
    with open(model_path, "wb") as f:
        pickle.dump(fitted_model, f)
    
    
