## This notebook walks through some basic preprocessing steps for the tabular data provided with this competition.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os


In [None]:
# Apply seaborn's dark-grid look to every plot drawn in this notebook.
sns.set(style="darkgrid")

In [None]:
# All three competition tables live in the same Kaggle input folder.
input_dir = '../input/siim-isic-melanoma-classification'

# Read train / test / sample-submission tables in one pass.
train, test, sample = (
    pd.read_csv(f'{input_dir}/{table}.csv')
    for table in ('train', 'test', 'sample_submission')
)

In [None]:
# Preview the first five rows of the test table.
test.head(5)

Filling missing values: `'unknown'` for the categorical columns and the most frequent value for the approximate age

In [None]:
# Impute missing values in both tables.
#
# The columns are re-assigned instead of calling `fillna(..., inplace=True)` on
# a selected column: that form is chained in-place mutation, which pandas 2.x
# warns about and which does not update the parent frame under Copy-on-Write.
#
# The age fill value is the most frequent age in the TRAINING table and is used
# for both tables, so train and test are imputed with the same statistic.
age_fill_value = train['age_approx'].mode().values[0]

for frame in (train, test):
    # Categorical columns get an explicit 'unknown' category.
    frame['sex'] = frame['sex'].fillna('unknown')
    frame['anatom_site_general_challenge'] = (
        frame['anatom_site_general_challenge'].fillna('unknown')
    )
    # Numeric age gets the training mode.
    frame['age_approx'] = frame['age_approx'].fillna(age_fill_value)


Label Encoding categorical values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the nominal columns.
# Each encoder is fitted on the categories seen in train AND test so that
# `transform` cannot raise on a category that only appears in the test table.
# When test holds no extra categories the codes are the same as fitting on
# train alone.
for source_col, encoded_col in (
    ('sex', 'sex_enc'),
    ('anatom_site_general_challenge', 'anatom_enc'),
):
    enc = LabelEncoder()
    enc.fit(pd.concat([train[source_col], test[source_col]]).astype('str'))
    train[encoded_col] = enc.transform(train[source_col].astype('str'))
    test[encoded_col] = enc.transform(test[source_col].astype('str'))

# Age is already numeric, so it is kept as a number rather than label-encoded
# through its string form: string labels sort lexicographically ('5.0' falls
# between '45.0' and '50.0'), which would scramble the natural age ordering.
train['age_enc'] = train['age_approx'].astype('float')
test['age_enc'] = test['age_approx'].astype('float')


In [None]:
# Check the newly added *_enc columns on the first five training rows.
train.head()

### Scaling by the mean (each feature is divided by its mean)

In [None]:
# Divide each feature by its mean.
# The mean is taken from the TRAINING table and applied to both tables, so a
# given raw value maps to the same scaled value in train and in test. Scaling
# test by its own mean would shift its features relative to what the model
# was fitted on.
for col in ('age_enc', 'anatom_enc'):
    train_mean = train[col].mean()
    train[col] = train[col] / train_mean
    test[col] = test[col] / train_mean

train.head()

## Building our XGBoost Regression model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import roc_auc_score


In [None]:
# List every column now present in the training table (raw + engineered).
train.columns

Selecting only the relevant feature columns from the tabular data

In [None]:
# Engineered columns fed to the model.
features = ['sex_enc', 'age_enc', 'anatom_enc']

In [None]:
# Target column of the training table.
y = train['target']

# Feature matrices for fitting (X) and for the final prediction (x_test).
X, x_test = train[features], test[features]

In [None]:
from sklearn.base import clone

# XGBoost regressor whose hyperparameters are evaluated by cross-validation.
model = XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
                     importance_type='gain', interaction_constraints=None,
                     min_child_weight=1, missing=None, monotone_constraints=None,
                     n_estimators=700, n_jobs=-1, nthread=-1, num_parallel_tree=1)

# 10 stratified folds, shuffled with a fixed seed so the split is reproducible.
kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# ROC AUC is computed per fold directly from `predict()` output.
# The built-in 'roc_auc' scorer looks for `decision_function` / `predict_proba`,
# which a regressor does not provide, so whether it works depends on the
# installed scikit-learn version. Scoring explicitly avoids that dependency.
fold_scores = []
for fold_no, (fit_idx, valid_idx) in enumerate(kfold.split(X, y), start=1):
    # Fresh, unfitted copy per fold so no state leaks between folds.
    fold_model = clone(model)
    fold_model.fit(X.iloc[fit_idx], y.iloc[fit_idx])
    fold_pred = fold_model.predict(X.iloc[valid_idx])
    fold_auc = roc_auc_score(y.iloc[valid_idx], fold_pred)
    print(f'[CV] fold {fold_no}/{kfold.get_n_splits()}: roc_auc={fold_auc:.4f}')
    fold_scores.append(fold_auc)

# Same shape as the array cross_val_score would return: one score per fold.
cv_results = np.array(fold_scores)
cv_results.mean()

In [None]:
from sklearn.base import clone

# Fit the final model on the full training table and predict the test table.
# The estimator is cloned from `model` (the one cross-validated above) instead
# of repeating the hyperparameter list, so the submitted model cannot drift
# from the configuration that was validated. `clone` returns a new, unfitted
# estimator with the same parameters.
xgb = clone(model)
xgb.fit(X, y)
pred = xgb.predict(x_test)

In [None]:
# Assemble the submission table: one predicted score per test image.
submission_columns = {
    'image_name': test['image_name'].values,
    'target': pred,
}
sub = pd.DataFrame(submission_columns)

# Write without the index column.
sub.to_csv('submission.csv', index=False)

In [None]:
# Sanity-check the first five rows of the submission.
sub.head(5)

# If you like my work, do upvote :)