## veri on isleme kismi

In [None]:
%matplotlib inline
from xgboost import plot_importance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
pd.set_option('display.max_columns', 150)
#libraries

In [None]:
# Read the training and test data sets from their CSV files.
TRAIN_CSV = "/path/to/your/dataset/train.csv"
TEST_CSV = "/path/to/your/dataset/test.csv"
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [None]:
# First 10 observations of the training data.
train_df.head(10)

In [None]:
# Data type of each variable (column) in the training data.
train_df.dtypes

In [None]:
# Dimensions of the training and test data (observations x variables).
train_df.shape, test_df.shape

In [None]:
# Summary statistics of the training data.
train_df.describe()

In [None]:
# Missing-value check: number of null entries in each column.
# The vectorised isnull().sum() yields the same per-column counts as applying
# Python's builtin sum() to every column, without iterating element by element.
train_df.isnull().sum()

In [None]:
# Correlation heatmap for the continuous (float64) variables.
float_cols = train_df.select_dtypes(include=['float64'])
# NOTE(review): iloc[:, 1:] skips the first float64 column - presumably meant
# to leave out an id-like column; confirm it is not dropping a real feature.
cols = float_cols.iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(data=cols, vmax=1)

## xgboost ile egitim

In [None]:
# Column names used below: the identifier column (dropped before training)
# and the prediction target.
ID, Target = 'id', 'loss'

In [None]:
# Target values as a 1-D NumPy array. `.values` is used instead of
# Series.ravel(), which is deprecated in recent pandas releases.
y_train = train_df[Target].values

In [None]:
# Remove the id and target columns from the training frame, and the id
# column from the test frame.
train_df = train_df.drop([ID, Target], axis=1)
test_df = test_df.drop([ID], axis=1)

In [None]:
# Dimensions of the training and test frames after the columns were removed.
shapes = (train_df.shape, test_df.shape)
print("{},{}".format(*shapes))

In [None]:
# Stack the training and test frames into one frame with a fresh 0..n-1 index.
# ntrain keeps the number of training rows, i.e. where the test rows begin.
ntrain = len(train_df)
train_test = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Dimensions of the combined frame.
train_test.shape

In [None]:
# Keep the column names of the training frame in `features`.
features = train_df.columns

In [None]:
# Encode the categorical columns (those whose name contains 'cat') as integer
# codes; sort=True makes the codes follow the sorted order of the distinct values.
cats = list(filter(lambda name: 'cat' in name, features))
for feat in cats:
    codes, uniques = pd.factorize(train_test[feat], sort=True)
    train_test[feat] = codes

In [None]:
# Preview the model input after the categorical columns were encoded.
# A bare last expression uses the notebook's rich table display, which print()
# would flatten into wrapped plain text.
train_test.head()

In [None]:
# Split the combined frame back into training and test rows and convert each
# part to a NumPy array.
train_part = train_test.iloc[:ntrain]
test_part = train_test.iloc[ntrain:]
x_train = np.array(train_part)
x_test = np.array(test_part)

In [None]:
# Sanity check on the arrays that were just built: report the shapes of
# x_train / x_test themselves rather than re-printing the source frames.
print("{},{}".format(x_train.shape, x_test.shape))

In [None]:
# Wrap the arrays in xgboost DMatrix objects: the training matrix carries the
# labels, the test matrix has none.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

In [None]:
# Parameters passed to xgboost for both cross-validation and final training.
# Reference: https://xgboost.readthedocs.io/en/latest/parameter.html
# 'verbosity': 0 and 'reg:squarederror' are the current spellings of the
# deprecated 'silent': 1 and 'reg:linear' (same meaning: quiet logging,
# squared-error regression objective).
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'verbosity': 0,
    'subsample': 0.9,
    'learning_rate': 0.01,
    'objective': 'reg:squarederror',
    'max_depth': 10,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
    'gamma': 0.005,
}

In [None]:
# Run 4-fold cross-validation for at most 1000 boosting rounds, with early
# stopping set to 5 rounds and progress reported every 20 rounds.
res = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=4,
    stratified=False,
    early_stopping_rounds=5,
    verbose_eval=20,
    show_stdv=True,
    seed=0,
)

In [None]:
# Number of boosting rounds for the final model. With early stopping, xgb.cv
# returns one row per round up to and including the best round, so the row
# count is already the round count. Subtracting one would train one tree too
# few (and none at all if cross-validation returned a single row).
best_nrounds = res.shape[0]

In [None]:
# Mean and standard deviation of the cross-validated test error at the last
# (best) round. Columns are picked by name because their order in the xgb.cv
# result differs between xgboost versions, so positions 0/1 can silently refer
# to the training metric. The 'mae' part must match eval_metric in xgb_params.
cv_mean = res['test-mae-mean'].iloc[-1]
cv_std = res['test-mae-std'].iloc[-1]
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))

In [None]:
# Train the final model on the full training matrix for the number of rounds
# chosen from cross-validation.
gbdt = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=best_nrounds,
)

## Oznitelik Onemi

In [None]:
import operator 
def ceate_feature_map(features):
    """Write a feature-map file named 'xgb.fmap' in the working directory.

    One line is written per feature, in iteration order, with three
    tab-separated fields: the zero-based position, the feature name and
    the type marker 'q'. An existing file of that name is overwritten.

    Args:
        features: iterable of feature names.
    """
    # The context manager closes the file even if a write fails part-way.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    
# Write the feature map, then read the model's per-feature scores through it
# so that the scores are keyed by feature name.
ceate_feature_map(features)
fscores = gbdt.get_fscore(fmap='xgb.fmap')

# (feature, score) pairs ordered from lowest to highest score.
importance = sorted(fscores.items(), key=lambda pair: pair[1])

# Tabulate the pairs and rescale the scores so that they sum to 1.
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
total_fscore = df['fscore'].sum()
df['fscore'] = df['fscore'] / total_fscore

# Save the table as tab-separated text.
df.to_csv('/path/to/your/folder', sep='\t', encoding='utf-8', index=False)

In [None]:
# Horizontal bar chart of the relative feature importances.
# Everything is drawn on one explicit figure/axes so the cell produces exactly
# this chart: no empty extra figure and no stray line plot of the table.
fig, ax = plt.subplots(figsize=(12, 24))
df.plot(kind='barh', x='feature', y='fscore', legend=False, ax=ax)
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('relative importance')
# To save the chart as a PNG: fig.savefig('feature_importance_xgb.png')

## Gonderim

In [None]:
# Write the Kaggle submission file: use the sample submission as a template
# and put the model's test-set predictions in its second column.
submission = pd.read_csv("sample_submission.csv")
# Replace the column by name instead of assigning through .iloc: .iloc writes
# into the existing column in place, which recent pandas warns about (and will
# reject) if the template column holds integers and the predictions are floats.
prediction_col = submission.columns[1]
submission[prediction_col] = gbdt.predict(dtest)
submission.to_csv('my_submission.csv', index=False)