In [None]:
import numpy as np 
import pandas as pd
import os

from xgboost import XGBRegressor
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold, train_test_split 
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Lasso

from sklearn import metrics

import optuna
from optuna import Trial, visualization

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the three competition CSV files: the training rows, the rows to be
# scored, and the sample submission file.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

In [None]:
# Remove the `id` column from both frames.
# A non-mutating drop with errors='ignore' keeps this cell idempotent: the
# original in-place drop raised KeyError when the cell was executed a second
# time, because `id` was already gone.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')
train.head()

In [None]:
# The feature names are exactly the columns of `test`.
feature_cols = test.columns.tolist()

In [None]:
# model 0: binary

# Keep only the training rows whose loss is below 6 and renumber them from 0.
train0 = train[train['loss'].lt(6)].reset_index(drop=True)

# NOTE(review): `target` holds the `loss` values of the kept rows as-is; it is
# not converted to a 0/1 label here despite the "binary" heading - confirm.
target = pd.Series(train0['loss'])

In [None]:
import statsmodels.api as sm

# Stage-0 model: probability that a row has a positive loss.
# Probit only accepts a response in [0, 1], whereas `target` carries the raw
# loss values, so the response is binarised (loss > 0) before fitting.
# NOTE(review): no constant column is added to the regressors, so the model is
# fitted without an intercept - confirm that this is intended.
target_is_positive = (target > 0).astype(int)

# Bind the result to `model0`, the name the final prediction cell uses; `model`
# is kept as an alias because it was the name originally assigned here (it is
# re-bound by a later cell).
model0 = sm.Probit(target_is_positive, train0[feature_cols]).fit()
model = model0
print(model0.summary())

In [None]:
#from sklearn.svm import SVR
from sklearn.svm import LinearSVR

In [None]:
from sklearn import metrics

# NOTE(review): `clf` is only assigned in a later cell of this notebook, so on
# a fresh kernel this cell must be executed after that one - consider moving it.

# predict() returns one value per row, so the conversion to integers has to be
# element-wise (int() on the whole array raises TypeError).  The rows scored
# are those of `train0`, the frame `target` was taken from, so that predictions
# and labels line up one-to-one.
y_pred = clf.predict(train0[feature_cols]).astype(int)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(target, y_pred))

In [None]:
# Sweep the Lasso penalty over a fixed grid and record, for every value, the
# fitted coefficients and the R^2 on a training part and on a held-out part.
lambdas = (0.001, 0.01, 0.1, 0.5, 1, 2, 10)
l_num = len(lambdas)  # derived from the grid instead of a hard-coded 7
pred_num = train0[feature_cols].shape[1]

# `target` was taken from `train0`, so the features must come from `train0` as
# well (`train` may contain more rows).  A held-out split is created so that the
# "test" score is measured on rows the model has not seen; previously it was a
# second copy of the training score.
X_lasso_train, X_lasso_test, y_lasso_train, y_lasso_test = train_test_split(
    train0[feature_cols], target, test_size=0.2, random_state=4042
)

# prepare data for enumerate
coeff_a = np.zeros((l_num, pred_num))
train_r_squared = np.zeros(l_num)
test_r_squared = np.zeros(l_num)

# enumerate through lambdas with index and i
for ind, i in enumerate(lambdas):
    reg = Lasso(alpha=i)
    reg.fit(X_lasso_train, y_lasso_train)

    coeff_a[ind, :] = reg.coef_
    train_r_squared[ind] = reg.score(X_lasso_train, y_lasso_train)
    test_r_squared[ind] = reg.score(X_lasso_test, y_lasso_test)

In [None]:
# Plot the training and held-out R^2 recorded for every penalty in the grid.
plt.figure(figsize=(18, 8))
# The format string now carries only marker and line style ('o-'): the former
# 'bo-' also named a colour, which clashed with the explicit `color=` keyword
# and made matplotlib warn about a redundantly defined colour.
plt.plot(train_r_squared, 'o-', label=r'$R^2$ Training set', color="darkblue", alpha=0.6, linewidth=3)
plt.plot(test_r_squared, 'o-', label=r'$R^2$ Test set', color="darkred", alpha=0.6, linewidth=3)
plt.xlabel('Lamda index'); plt.ylabel(r'$R^2$')
# Span exactly the recorded scores instead of assuming seven grid points.
plt.xlim(0, len(train_r_squared) - 1)
plt.title(r'Evaluate lasso regression with lamdas: 0 = 0.001, 1= 0.01, 2 = 0.1, 3 = 0.5, 4= 1, 5= 2, 6 = 10')
plt.legend(loc='best')
plt.grid()

In [None]:
# Fit a linear support-vector regressor on an 80/20 split of `train0` and score
# its rounded predictions as if they were class labels.
X_train, X_test, y_train, y_test = train_test_split(
    train0[feature_cols], target, test_size=0.2, random_state=4042
)

model = LinearSVR(random_state=0, tol=1e-5, verbose=1)
model.fit(X_train, y_train)

# Round the continuous predictions to the nearest whole number.
y_pred = pd.Series(model.predict(X_test)).round()

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# The report helpers are reached through the already imported `metrics` module.
# The previous version imported `confusion_matrix` in the middle of the cell and
# then re-bound that very name to its own result, shadowing the function.
svr_confusion = metrics.confusion_matrix(y_test, y_pred)
print(svr_confusion)

print(metrics.classification_report(y_test, y_pred))

In [None]:
# model 1 for loss > 0
# Select the rows of `train0` with a strictly positive loss once, then derive
# both the features and the target from that same selection.  Previously the
# filtered frame was overwritten and both objects were rebuilt from the
# unfiltered `train0`, so the `loss > 0` restriction had no effect.
positive_rows = train0.loc[train0['loss'] > 0].reset_index(drop=True)
target1 = positive_rows['loss']
train1 = positive_rows[feature_cols]

In [None]:
# Tag every row of `train` with the number (0-4) of the fold in which it is
# held out, using a shuffled 5-fold split with a fixed seed.
# NOTE(review): the column is written to `train` only; frames derived from
# `train` before this cell runs do not carry it.
kf = KFold(n_splits=5, shuffle=True, random_state=4022)

for fold_no, (_, holdout_rows) in enumerate(kf.split(train[feature_cols])):
    train.loc[holdout_rows, 'kfold'] = fold_no

# The column is created with missing values first, so cast it back to integers.
train['kfold'] = train['kfold'].astype(int)

In [None]:
def fit_xgb(trial, xtr, ytr, xval, yval):
    """Fit one XGBoost regressor with hyper-parameters suggested by `trial`.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` and ``suggest_float`` (an Optuna trial).
    xtr, ytr
        Training features and training labels.
    xval, yval
        Validation features and validation labels.

    Returns
    -------
    tuple
        ``(model, log)`` where ``model`` is the fitted regressor and ``log`` is
        a dict with the keys ``"train rmse"`` and ``"val rmse"``.  Both scores
        are computed on predictions clipped from below at 0.1.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform`; the sampled
    # ranges are unchanged.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 20, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.9),
        "eta": trial.suggest_float("eta", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "reg_alpha": trial.suggest_int("reg_alpha", 1, 50),
        # NOTE(review): 'gpu_hist' / 'gpu_predictor' presumably require a
        # GPU-enabled XGBoost build - confirm for the target environment.
        'tree_method': 'gpu_hist'
    }

    model = xgb.XGBRegressor(**params,
                             objective='reg:tweedie',
                             predictor='gpu_predictor',
                             booster='gbtree',
                             n_jobs=4,
                             random_state=4022, eval_metric="rmse")

    # np.ravel flattens the labels to 1-D for any array-like input, not only
    # for NumPy arrays as `ytr.reshape` did.
    model.fit(xtr, np.ravel(ytr))

    # Clip predictions from below at 0.1 before scoring.
    train_pred = np.clip(model.predict(xtr), 0.1, None)
    val_pred = np.clip(model.predict(xval), 0.1, None)

    log = {
        "train rmse": np.sqrt(mean_squared_error(ytr, train_pred)),
        "val rmse": np.sqrt(mean_squared_error(yval, val_pred))
    }

    return model, log

In [None]:
def objective(trial):
    """Return the mean validation RMSE of `fit_xgb` over a 5-fold split.

    The folds are built here, directly on the rows of `train1`, with a shuffled
    `KFold` and a fixed seed.  The previous version looked up a `kfold` column
    on `train1`, but `train1` only holds the feature columns, so the lookup
    raised KeyError.

    Parameters
    ----------
    trial
        Optuna trial, forwarded to `fit_xgb` for every fold.

    Returns
    -------
    float
        Average of the per-fold ``"val rmse"`` values.
    """
    n_folds = 5
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=4022)

    # Work on plain arrays so that fold membership is purely positional.
    features = train1[feature_cols].values
    labels = np.asarray(target1)

    fold_scores = []
    for trn_rows, val_rows in splitter.split(features):
        _, log = fit_xgb(
            trial,
            features[trn_rows], labels[trn_rows],
            features[val_rows], labels[val_rows],
        )
        fold_scores.append(log['val rmse'])

    return float(np.mean(fold_scores))

In [None]:
# Create a minimisation study and run the search.
# The sampler is seeded explicitly so that a Restart & Run All proposes the same
# hyper-parameters again; TPE is Optuna's default sampler, so only the seed is
# new here.
study = optuna.create_study(
    direction="minimize",
    study_name='censored model',
    sampler=optuna.samplers.TPESampler(seed=4022),
)
study.optimize(objective, n_trials=2)

In [None]:
# Display the hyper-parameters of the best trial found by the study.
study.best_params

In [None]:
# Fixed hyper-parameter set, presumably copied from an earlier tuning run
# (TODO confirm); it is not referenced by the other visible cells.
params_opt = dict(
    n_estimators=53,
    subsample=0.6463607479295777,
    colsample_bytree=0.862577336784147,
    eta=0.1120013893377593,
    max_depth=11,
    reg_alpha=29,
)

In [None]:
# Train the final stage-1 regressor on all of `train1`.
# `study.best_params` only contains the values that were suggested during the
# search, so the fixed settings used while tuning (tweedie objective, GPU tree
# method, seed, ...) are repeated here; otherwise the final model would be
# trained under a different configuration than the one that was evaluated.
# `eval_metric` is passed to the constructor, as in `fit_xgb`, rather than to
# fit(), where newer XGBoost releases no longer accept it.
clf = xgb.XGBRegressor(
    **study.best_params,
    objective='reg:tweedie',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    booster='gbtree',
    n_jobs=4,
    random_state=4022,
    eval_metric="rmse",
)
clf.fit(train1[feature_cols], target1)

In [None]:
#Distribution of loss is censored
# Bar chart of how often each rounded prediction of `clf` occurs on `train0`.
fig = plt.figure(figsize=(14, 8))
prediccion = pd.Series(clf.predict(train0[feature_cols])).round()
target_cnt = prediccion.value_counts().sort_index()
sns.barplot(x=target_cnt.index, y=target_cnt)

In [None]:
# Score the test rows with both stages and combine them element-wise.
# NOTE(review): `model0` must already be bound by an earlier cell - confirm
# that it exists after a fresh Restart & Run All.
predictions1 = None  # placeholder so both stage outputs are declared together
predictions0 = model0.predict(test[feature_cols])
predictions1 = clf.predict(test[feature_cols])

# Final prediction = stage-0 output multiplied by stage-1 output.
predictions = predictions0 * predictions1

In [None]:
# Build the submission frame: ids from the sample submission, predictions cast
# to integers (the cast truncates towards zero), then write it to disk.
submission_columns = {
    'id': sample_submission['id'].to_numpy(),
    'loss': predictions.astype(int),
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('my_submission.csv', index=False)