In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import log_loss

from xgboost import XGBClassifier

In [None]:
# Read the competition training table and preview its first rows.
train_csv_path = '../input/tabular-playground-series-jun-2021/train.csv'
train_ds = pd.read_csv(train_csv_path)
train_ds.head()

In [None]:
# Print a concise summary of the training frame (columns, dtypes, non-null counts).
train_ds.info()

In [None]:
# Count distinct values per column and show the 30 columns with the most.
distinct_counts = train_ds.nunique()
distinct_counts.sort_values(ascending=False).head(30)

In [None]:
# Inspect the 20 largest values of feature_15.
feature_15_desc = train_ds['feature_15'].sort_values(ascending=False)
feature_15_desc.head(20)

In [None]:
# Encode the target labels as integers; `le` is kept so the mapping can be
# recovered later.
le = LabelEncoder()
le.fit(train_ds["target"])
y = le.transform(train_ds["target"])

# Feature matrix: everything except the row id and the label.
x = train_ds.drop(columns=['id', 'target'])

In [None]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is
# the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sweep the number of trees in the random forest and record the log loss on
# the training and validation splits for each setting.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200, 400]
train_results = []
test_results = []

for estimator in n_estimators:
    # Fixed seed: without it the forest (and so the curve) changes on every run.
    rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1, random_state=42)
    rf.fit(X_train, y_train)
    train_results.append(log_loss(y_train, rf.predict_proba(X_train)))
    test_results.append(log_loss(y_valid, rf.predict_proba(X_valid)))

plt.plot(n_estimators, train_results, 'b', label="Train Log Loss")
plt.plot(n_estimators, test_results, 'r', label='Test Log Loss')

plt.ylabel('‘log loss’')
plt.xlabel('‘n_estimators’')
# The label= arguments above are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Sweep the maximum tree depth of the random forest and record the log loss on
# the training and validation splits for each setting.
# max_depth must be an integer: np.linspace produced floats (1.0, 2.0, ...),
# which scikit-learn's parameter validation rejects, so use an integer range.
n_depths = np.arange(1, 17)
train_results = []
test_results = []

for depth in n_depths:
    # Fixed seed: without it the forest (and so the curve) changes on every run.
    rf = RandomForestClassifier(max_depth=int(depth), n_jobs=-1, random_state=42)
    rf.fit(X_train, y_train)
    train_results.append(log_loss(y_train, rf.predict_proba(X_train)))
    test_results.append(log_loss(y_valid, rf.predict_proba(X_valid)))

plt.plot(n_depths, train_results, 'b', label="Train Log Loss")
plt.plot(n_depths, test_results, 'r', label='Test Log Loss')

plt.ylabel('‘log loss’')
plt.xlabel('‘n_depths’')
# The label= arguments above are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Select estimators and depth according to previous test.
# random_state makes the fitted forest (and its validation loss) reproducible;
# n_jobs=-1 matches the sweep cells and only affects training speed.
clf = RandomForestClassifier(n_estimators=100, max_depth=5, n_jobs=-1, random_state=42)

In [None]:
# Train the selected classifier on the training split.
clf.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted classifier on the held-out split: class probabilities first,
# then multi-class log loss against the encoded labels.
y_pred = clf.predict_proba(X_valid)
logloss = log_loss(y_true=y_valid, y_pred=y_pred)
print(f'log loss: {logloss}')

In [None]:
# Read the competition test table and build the feature matrix by dropping
# the row identifier.
test_csv_path = '../input/tabular-playground-series-jun-2021/test.csv'
test_ds = pd.read_csv(test_csv_path)
X_test = test_ds.drop(columns=['id'])

In [None]:
# Sweep the boosting-round budget of XGBoost and record the log loss on the
# training and validation splits for each setting.
n_estimators = [10, 20, 40, 80, 160, 320, 640, 1000]
train_results = []
test_results = []

for estimator in n_estimators:
    # early_stopping_rounds is an estimator setting (xgboost >= 1.6), but
    # eval_set and verbose are arguments of fit(). Passing them to the
    # constructor left early stopping without a validation set.
    rf = XGBClassifier(n_estimators=estimator, n_jobs=-1, early_stopping_rounds=5)
    rf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    train_results.append(log_loss(y_train, rf.predict_proba(X_train)))
    # NOTE: the validation split also drives early stopping, so this loss is
    # slightly optimistic as an estimate of unseen-data performance.
    test_results.append(log_loss(y_valid, rf.predict_proba(X_valid)))

plt.plot(n_estimators, train_results, 'b', label="Train Log Loss")
plt.plot(n_estimators, test_results, 'r', label='Test Log Loss')

plt.ylabel('‘log loss’')
plt.xlabel('‘n_estimators’')
# The label= arguments above are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Final XGBoost model: a large round budget with a small learning rate, with
# early stopping on the validation split deciding how many rounds are used.
# early_stopping_rounds is set on the estimator (xgboost >= 1.6); passing it
# to fit() is deprecated and no longer accepted by recent releases.
clf = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    early_stopping_rounds=5,
)
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

In [None]:
# Score the fitted classifier on the held-out split: class probabilities first,
# then multi-class log loss against the encoded labels.
y_pred = clf.predict_proba(X_valid)
logloss = log_loss(y_true=y_valid, y_pred=y_pred)
print(f'log loss: {logloss}')

In [None]:
# Read the competition test table and build the feature matrix by dropping
# the row identifier.
test_csv_path = '../input/tabular-playground-series-jun-2021/test.csv'
test_ds = pd.read_csv(test_csv_path)
X_test = test_ds.drop(columns=['id'])

In [None]:
# Creating predictions to be submitted
predictions = clf.predict_proba(X_test)

# Probability column j belongs to encoded label j, so the column names come
# straight from the fitted encoder instead of a hand-typed list.
sub = pd.DataFrame(predictions, columns=le.classes_, index=test_ds.index)
# Put the row identifier first; sharing test_ds's index keeps rows aligned.
sub.insert(0, 'id', test_ds['id'])

# Creating submission
sub.to_csv('submission.csv', index=False)

# Last expression of the cell so the preview is actually rendered.
sub.head()