In [1]:
!pip install xgboost



In [80]:
import numpy as np
np.random.seed(777)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV

In [81]:
# Read the train and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [82]:
from sklearn.preprocessing import LabelEncoder

# Label columns whose values are replaced by integer codes.
features_target = ['target']

for feature in features_target:
    # A fresh encoder per column; fit and transform happen in a single call.
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])

# Quick look at the encoded labels.
print(train_df['target'].head())

0    5
1    5
2    1
3    7
4    1
Name: target, dtype: int64


In [83]:
import datetime

# Timestamp before training; paired with the one printed after fit() so the
# wall-clock cost of this cell is visible. The recorded run logged roughly
# 22-24 minutes per fold, so expect this cell to take hours.
print(datetime.datetime.now())

# Columns excluded from the feature matrix.
# NOTE(review): the reason these two features are dropped is not shown here.
missing_columns = ['id', 'feature_47', 'feature_7']

# Use the `columns=` keyword: passing the axis positionally (`drop(cols, 1)`)
# is deprecated since pandas 1.3 and raises TypeError on pandas 2.x.
X_train = train_df.drop(columns=missing_columns + ['target'])
y_train = train_df['target']

X_test = test_df.drop(columns=missing_columns)

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Every list holds a single value, so the "grid" has exactly one candidate;
# GridSearchCV is used here for its cross-validated scoring and final refit.
parameters = { 'seed': [777], 'n_estimators': [1000],
               'learning_rate': [0.05], 'max_depth': [4], 'subsample': [0.8] }

# random_state pins the fold shuffle so the splits do not depend on the state
# of the global NumPy RNG (i.e. on which cells ran before this one).
# refit=True retrains on all of X_train so `clf` can be used for prediction.
clf = GridSearchCV(xgb_model, parameters,
                   cv=StratifiedKFold(n_splits=25, shuffle=True, random_state=777),
                   n_jobs=1, verbose=4, refit=True, scoring='neg_log_loss')

clf.fit(X_train, y_train)
print(datetime.datetime.now())

2021-06-23 02:20:25.447928
Fitting 25 folds for each of 1 candidates, totalling 25 fits
[CV 1/25] END learning_rate=0.05, max_depth=4, n_estimators=1000, seed=777, subsample=0.8;, score=-1.749 total time=22.5min
[CV 2/25] END learning_rate=0.05, max_depth=4, n_estimators=1000, seed=777, subsample=0.8;, score=-1.757 total time=23.7min
[CV 3/25] END learning_rate=0.05, max_depth=4, n_estimators=1000, seed=777, subsample=0.8;, score=-1.753 total time=23.6min
[CV 4/25] END learning_rate=0.05, max_depth=4, n_estimators=1000, seed=777, subsample=0.8;, score=-1.751 total time=23.8min
[CV 5/25] END learning_rate=0.05, max_depth=4, n_estimators=1000, seed=777, subsample=0.8;, score=-1.751 total time=23.8min
[CV 6/25] END learning_rate=0.05, max_depth=4, n_estimators=1000, seed=777, subsample=0.8;, score=-1.754 total time=23.8min
[CV 7/25] END learning_rate=0.05, max_depth=4, n_estimators=1000, seed=777, subsample=0.8;, score=-1.758 total time=23.7min
[CV 8/25] END learning_rate=0.05, max_depth=

In [84]:
# Class-probability predictions for the test features from the fitted search.
# The timestamps printed before and after bracket the prediction call.
print(datetime.datetime.now())
test_preds = clf.predict_proba(X=X_test)
print(datetime.datetime.now())

2021-06-23 13:17:31.246864
2021-06-23 13:17:35.171502


In [86]:
# One probability column per class, named Class_1 .. Class_9, in the same
# order as the columns of `test_preds`.
class_columns = ['Class_%d' % class_no for class_no in range(1, 10)]
submission = pd.DataFrame(test_preds, columns=class_columns)

# Attach the test row ids (appended as the last column) and write the file.
submission['id'] = test_df['id']
submission.to_csv("submission.csv", index=False)