In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

import os

In [2]:
# Read the three competition tables in one pass: training metadata, test
# metadata, and the sample submission that is later filled with predictions.
train, test, sub = map(
    pd.read_csv,
    (
        '../input/siim-isic-melanoma-classification/train.csv',
        '../input/siim-isic-melanoma-classification/test.csv',
        '../input/siim-isic-melanoma-classification/sample_submission.csv',
    ),
)

# Display (rows, columns) for each table as a quick sanity check.
tuple(frame.shape for frame in (train, test, sub))

((33126, 8), (10982, 5), (10982, 2))

### Imputing missing values

In [3]:
# Placeholder used for each feature when its value is missing. The same fills
# are applied to both tables so that the group keys built from these columns
# line up between train and test.
fill_values = {
    'sex': 'na',
    'age_approx': 0,
    'anatom_site_general_challenge': 'na',
}

for frame in (train, test):
    for column, filler in fill_values.items():
        frame[column] = frame[column].fillna(filler)

In [4]:
# Preview the first rows of the training table after the missing-value fills.
train.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [5]:
# Preview the first rows of the test table after the missing-value fills.
test.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,na
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


### Grouped target mean by features in training set

In [6]:
# Columns that jointly define a group.
feat = ['sex', 'age_approx', 'anatom_site_general_challenge']

# Mean of `target` within every (sex, age, site) combination seen in the
# training table, flattened to one row per combination with the mean stored
# in a column named `baseline`.
target_by_group = train.groupby(feat)['target'].mean()
grp_mean_train = target_by_group.rename('baseline').reset_index()

grp_mean_train

Unnamed: 0,sex,age_approx,anatom_site_general_challenge,baseline
0,female,0.0,head/neck,0.00
1,female,0.0,torso,0.00
2,female,10.0,head/neck,0.00
3,female,10.0,lower extremity,0.00
4,female,10.0,oral/genital,0.00
...,...,...,...,...
214,male,90.0,upper extremity,0.75
215,na,0.0,head/neck,0.00
216,na,0.0,lower extremity,0.00
217,na,0.0,torso,0.00


### Join grouped mean values with test set

In [7]:
# Attach the training-set group mean to every test row.
#
# * Any `baseline` column left over from a previous run of this cell is
#   dropped first. Without that, re-running the cell would merge a second
#   copy and pandas would emit `baseline_x` / `baseline_y`, breaking later
#   accesses to `test['baseline']`.
# * `how='left'` keeps every test row, in its original order; combinations
#   that do not occur in `grp_mean_train` get NaN.
# * `validate='many_to_one'` makes pandas raise a MergeError if the lookup
#   table ever contains duplicate keys, which would otherwise silently
#   duplicate test rows.
test = (
    test.drop(columns='baseline', errors='ignore')
    .merge(grp_mean_train, on=feat, how='left', validate='many_to_one')
)
test.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,baseline
0,ISIC_0052060,IP_3579794,male,70.0,na,0.0
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity,0.008147
2,ISIC_0058510,IP_7960270,female,55.0,torso,0.014583
3,ISIC_0073313,IP_6375035,female,50.0,torso,0.008273
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity,0.007273


### Impute missing values for test set baseline
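Test rows whose combination of sex, age, and anatomical site never occurs in the training set get no group mean from the join, so they are filled with the overall mean of the target variable in the training set.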

In [8]:
# Mean of `target` over all training rows; displayed for reference.
M = train['target'].mean()
M

0.01762965646320111

In [9]:
# Rows that still lack a baseline receive the overall training mean `M`.
test.loc[test['baseline'].isna(), 'baseline'] = M

### Make submission using baseline predictions

In [10]:
# Copy the baseline predictions into the submission frame by position
# (the raw array is used, so no index alignment takes place).
sub['target'] = test['baseline'].to_numpy()
sub.head(10)

Unnamed: 0,image_name,target
0,ISIC_0052060,0.0
1,ISIC_0052349,0.008147
2,ISIC_0058510,0.014583
3,ISIC_0073313,0.008273
4,ISIC_0073502,0.007273
5,ISIC_0074618,0.013636
6,ISIC_0076801,0.022989
7,ISIC_0077586,0.013636
8,ISIC_0082004,0.015086
9,ISIC_0082785,0.018727


In [11]:
# Write the submission table to disk; index=False leaves the row index out of the file.
sub.to_csv( 'submission.csv', index=False )