In [None]:
# loading packages

import pandas as pd
import numpy as np

#

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#

import seaborn as sns
import plotly.express as px

#

import os
import random
import re
import math
import time

from tqdm import tqdm
from tqdm.keras import TqdmCallback


from pandas_summary import DataFrameSummary

import warnings


# Suppress every warning so that cell outputs stay uncluttered.
warnings.filterwarnings(action='ignore')

# One shared seed for both the stdlib and the NumPy global generators.
seed_val = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed_val)


In [None]:
# Hex colour codes that make up the notebook's custom palette.
orange_black = [
    '#fdc029',
    '#df861d',
    '#FF6347',
    '#aa3d01',
    '#a30e15',
    '#800000',
    '#171820',
]

# Apply the built-in 'ggplot' style sheet to all subsequent matplotlib figures.
matplotlib.style.use('ggplot')

In [None]:
# Root folder of the competition data on Kaggle.
base_path = '/kaggle/input/siim-isic-melanoma-classification'

# JPEG image folders live under the competition root (trailing slash kept).
train_img_path = f'{base_path}/jpeg/train/'
test_img_path = f'{base_path}/jpeg/test/'

# Separate Kaggle dataset; presumably pre-computed image statistics, judging by the name.
img_stats_path = '/kaggle/input/melanoma2020imgtabular'


# Loading the Data

We'll continue by loading the metadata we're given. The train data has 8 features and 33126 observations, and the test data has 5 features and 10982 observations.
Train Dataset Consists Of:

    image_name -> the filename of the specific image in the train set
    patient_id -> identifies the unique patient
    sex -> gender of the patient
    age_approx -> approx age of the patient at time of scanning
    anatom_site_general_challenge -> location of the scan site
    diagnosis -> information about the diagnosis
    benign_malignant -> indicates whether the scan result is malignant or benign
    target -> same as above but better for modelling since it's binary

The next dataset we're going to inspect is the test set. It has the same features as the train set except for the scan results (diagnosis, benign_malignant and target), which is exactly why it's the test set!
Test Dataset Consists Of:

    image_name -> the filename of the specific image in the test set
    patient_id -> identifies the unique patient
    sex -> gender of the patient
    age_approx -> approx age of the patient at time of scanning
    anatom_site_general_challenge -> location of the scan site

[](http://)

In [None]:
# Read the three competition CSV files from the base folder, in this order:
# train metadata, test metadata, sample submission.
train, test, sample = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first five rows of the train metadata.
train.head(n=5)

In [None]:
# Report the number of columns/rows and the column names of both frames.
# Adjacent f-string literals are concatenated, so the printed text is one message.
print(
    f'Train data has {train.shape[1]} features, {train.shape[0]} observations '
    f'and Test data {test.shape[1]} features, {test.shape[0]} observations.\n'
    f'Train features are:\n{train.columns.tolist()}\n'
    f'Test features are:\n{test.columns.tolist()}'
)



In [None]:
# Shorter column names, assigned by position. The test frame carries only the
# first five columns (it has no diagnosis/label columns).
short_names = [
    'img_name', 'id', 'sex', 'age', 'location',
    'diagnosis', 'benign_malignant', 'target',
]
train.columns = short_names
test.columns = short_names[:5]

In [None]:
# Show six randomly chosen rows of the train data.
train.sample(n=6)

In [None]:
# Show six randomly chosen rows of the test data.
test.sample(n=6)

In [None]:
# Check how age and sex relate to a lesion being benign or malignant:
# age distribution per sex, split by the binary target.
sns.boxplot(data=train, x='sex', y='age', hue='target')