# Imports
---

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

# Load
---

In [None]:
# Read the train and test tables the same way, using their `id` column as the index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-may-2022/train.csv',
        '../input/tabular-playground-series-may-2022/test.csv',
    )
)
# The sample submission is read without index_col, so it gets a default integer index.
sub = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')

train.head()

In [None]:
# Separate the label column from the predictor columns.
y = train['target']
X = train.drop(columns='target')

# Treatment
---

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Learn an integer code for every distinct f_27 string in the training features,
# then store the codes in a new column. `encoder` stays in scope for later cells.
encoder = LabelEncoder().fit(X['f_27'])
X['f_27_enc'] = encoder.transform(X['f_27'])

In [None]:
# Plotting every training row (one aggregated line per numeric column) is
# impractically slow, so draw the overview from a fixed-seed random sample.
# 2000 rows is an arbitrary size that keeps the cell fast; raise it for more detail.
sns.lineplot(data=train.sample(n=min(len(train), 2000), random_state=42),
             palette="tab10", linewidth=2.5);

In [None]:
# A pair plot draws one panel per pair of numeric columns, so feeding it the full
# training frame is impractically slow and memory-hungry. Use a fixed-seed random
# sample instead (2000 rows is an arbitrary size that keeps the cell runnable).
sns.pairplot(train.sample(n=min(len(train), 2000), random_state=42));

In [None]:
# Total number of missing values across all columns of the training frame.
train.isna().sum().sum()

In [None]:
plt.figure(figsize=(12,8))
# Correlate the numeric columns only: `train` still contains the string column
# f_27, which recent pandas versions refuse to correlate instead of silently
# skipping. select_dtypes works the same way on old and new pandas.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=False, linewidths=.5);

# Ciclo 1 - Baseline
---

In [None]:
def modelxgb_score(X, y):
    """Train a GPU XGBoost classifier on an 80% split and return its hold-out ROC AUC.

    The data is shuffled and split 80/20 with a fixed seed. The 20% hold-out
    part is passed to ``fit`` as the evaluation set (AUC metric, early-stopping
    patience of 256 rounds) and is also the data the returned score is computed
    on, so the score is measured on rows that were monitored during training.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` / ``XGBClassifier.fit``.
    y : binary target aligned with ``X``.

    Returns
    -------
    ROC AUC of the positive-class scores on the hold-out split.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=42
    )

    model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=-1, random_state=42)
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_holdout, y_holdout)],
        eval_metric=['auc'],
        early_stopping_rounds=256,
        verbose=250,
    )

    # Column 1 of predict_proba is the positive-class score.
    holdout_scores = model.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_scores)

In [None]:
# Baseline feature set: f_00 .. f_30 without the raw string column f_27,
# followed by its label-encoded version.
features1 = [f'f_{idx:02d}' for idx in range(31) if idx != 27] + ['f_27_enc']

In [None]:
# Baseline: hold-out ROC AUC using the numeric columns plus the label-encoded f_27.
modelxgb_score(X[features1], y)
# Score noted by the author for this run: 0.92570 with label encoding f_27

In [None]:
# 0.97905 logistic
# 0.97918 logitraw


# Ciclo 2 - Feature Engineering e Baseline 2
---

In [None]:
def count_sequence(df, field):
    """Add one ``<letter>_count`` column per letter A..T with its occurrences in ``df[field]``.

    The new columns are written onto ``df`` itself (the caller's frame is
    modified) in alphabetical order, and the same frame object is returned.

    Parameters
    ----------
    df : DataFrame holding a string column named ``field``.
    field : name of the string column to scan.
    """
    for letter in 'ABCDEFGHIJKLMNOPQRST':
        df[f'{letter}_count'] = df[field].str.count(letter)
    return df

In [None]:
# Add the per-letter count columns (A_count .. T_count) derived from f_27 to the training features.
X = count_sequence(X, 'f_27')

In [None]:
def n_unique(row):
    """Return how many distinct characters appear in ``row["f_27"]``."""
    return len({char for char in row["f_27"]})
# Number of distinct characters in each f_27 string. Mapping over the single
# column gives the same values as applying a function to every row of the frame,
# without building a Series object per row.
X["unique_characters"] = X["f_27"].map(lambda seq: len(set(seq)))

In [None]:
# How many training rows have each distinct-character count.
X['unique_characters'].value_counts()

In [None]:
# features2: the 30 numeric columns (f_00 .. f_30 without f_27) followed by the
# 20 per-letter count columns A_count .. T_count.
features2 = (
    [f'f_{idx:02d}' for idx in range(31) if idx != 27]
    + [f'{letter}_count' for letter in 'ABCDEFGHIJKLMNOPQRST']
)

# features3: same as features2 with the label-encoded f_27 inserted between the
# numeric columns and the letter counts.
features3 = features2[:30] + ['f_27_enc'] + features2[30:]

# features4: features3 plus the distinct-character count.
features4 = features3 + ['unique_characters']

In [None]:
# Preview the training features after the engineered columns were added.
X.head()

In [None]:
%%time
# Hold-out ROC AUC with the letter-count columns and without the encoded f_27 column.
modelxgb_score(X[features2], y)

In [None]:
%%time
# Hold-out ROC AUC with both the encoded f_27 column and the letter-count columns.
modelxgb_score(X[features3], y)

In [None]:
%%time
# Hold-out ROC AUC with encoded f_27, letter counts and the distinct-character count.
modelxgb_score(X[features4], y)

# Data Test
---

In [None]:
# Encode the test strings against the classes the encoder learned from the
# training f_27 values. Re-fitting on the test data would number the strings by
# the test vocabulary, so codes would no longer mean the same thing as the codes
# the model was trained on.
# LabelEncoder stores its classes in sorted order, so searchsorted returns the
# exact training code for every known string. For a string not seen in training
# it returns the position that string would take among the training classes,
# which keeps unseen values on the same lexicographic scale (plain
# encoder.transform would raise on them).
test['f_27_enc'] = np.searchsorted(encoder.classes_, test['f_27'].to_numpy())

In [None]:
# Add the same per-letter count columns (A_count .. T_count) to the test frame.
test = count_sequence(test, 'f_27')

In [None]:
# Number of distinct characters in each test f_27 string. Mapping over the single
# column gives the same values as applying a function to every row of the frame,
# without building a Series object per row.
test["unique_characters"] = test["f_27"].map(lambda seq: len(set(seq)))

In [None]:
# Preview the test frame after the engineered columns were added.
test.head()

# Prediction
---

In [None]:
def subger(X, y, test):
    """Fit a GPU XGBoost classifier on all of ``(X, y)`` and score ``test``.

    Only the tree method, device, eval metric and seed are set; every other
    hyper-parameter is left at the library default.

    Parameters
    ----------
    X : training features.
    y : training target.
    test : features to score; must have the same columns as ``X``.

    Returns
    -------
    The second column of ``predict_proba(test)`` (positive-class score per row).
    """
    settings = dict(tree_method='gpu_hist', gpu_id=-1, eval_metric='auc', random_state=42)
    model = xgb.XGBClassifier(**settings)
    model.fit(X, y, verbose=250)
    return model.predict_proba(test)[:, 1]

In [None]:
# One model per feature set, each trained on all training rows and scored on test.
sub1 = subger(X[features1], y, test[features1])
sub2 = subger(X[features2], y, test[features2])
sub3 = subger(X[features3], y, test[features3])
sub4 = subger(X[features4], y, test[features4])
# sub5..sub7 were trained with exactly the same features, parameters and seed as
# sub4, so retraining three more times only repeated the same work. Reuse the
# sub4 predictions, as independent copies so the names stay available and
# editing one array does not affect the others.
sub5, sub6, sub7 = (sub4.copy() for _ in range(3))

In [None]:
def subgerFinal(X, y, test):
    """Fit the tuned GPU XGBoost classifier on all of ``(X, y)`` and score ``test``.

    Compared with the default-parameter variant, this model uses 8192 trees,
    ``min_child_weight=96``, ``max_bin=512`` and the ``binary:logitraw`` objective.

    Parameters
    ----------
    X : training features.
    y : training target.
    test : features to score; must have the same columns as ``X``.

    Returns
    -------
    The second column of ``predict_proba(test)``.
    NOTE(review): with ``binary:logitraw`` these values are presumably raw
    margins rather than probabilities in [0, 1] -- confirm before using them
    as probabilities.
    """
    settings = {
        'tree_method': 'gpu_hist',
        'gpu_id': -1,
        'eval_metric': 'auc',
        'random_state': 42,
        'n_estimators': 8192,
        'min_child_weight': 96,
        'max_bin': 512,
        'objective': 'binary:logitraw',
    }
    model = xgb.XGBClassifier(**settings)
    model.fit(X, y, verbose=250)
    return model.predict_proba(test)[:, 1]

# Submission
---

In [None]:
# Start from a fresh copy of the sample submission so re-running this cell is safe.
sub = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv')
# Write the scores at full precision. The notebook evaluates with ROC AUC, which
# depends only on how rows are ranked; rounding to two decimals would collapse
# many distinct scores into ties and throw away ranking information.
# The scores are assigned by position, so this relies on `test` rows being in the
# same order as the sample submission rows.
sub['target'] = subgerFinal(X[features4], y, test[features4])
sub.to_csv('sub8.csv', index=False)
sub

In [None]:
# Re-attach the target column to the engineered training features (joined on the shared index).
m = X.join(y)

In [None]:
# Save the combined features + target table to CSV (the index is written as the first column).
m.to_csv('tps_may_22.csv')