In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import optuna
from optuna import visualization,Trial
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna
# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

In [None]:
# Load the competition CSVs (train, test, sample submission) from the input
# directory, in that order.
input_dir='../input/tabular-playground-series-aug-2021'
train_df,test_df,submission_file=(
    pd.read_csv(os.path.join(input_dir,csv_name))
    for csv_name in ('train.csv','test.csv','sample_submission.csv')
)

In [None]:
# Summary statistics of the training frame, one row per column.
train_df.describe().transpose().style

In [None]:
# Losses range from 0 to 42 with no negative values, so it would be better to
# use a ReLU in the output layer to get non-negative regression values.
# The bar chart shows how often each distinct loss value occurs.
ax=plt.figure(figsize=(10,5)).gca()
m=train_df['loss'].value_counts().sort_values()

sns.barplot(x=m.index,y=m.to_numpy(),ax=ax)
plt.xticks(rotation=90);

In [None]:
# Heatmap of pairwise column correlations; only the lower triangle is drawn.
plt.figure(figsize=(20,20))
corr=train_df.corr()
# Non-zero entries (the diagonal and everything above it) are hidden by heatmap.
mask=np.triu(np.ones_like(corr))
sns.heatmap(corr,mask=mask,cmap='Blues')

In [None]:
# Print the number of distinct values per column to spot categorical features.
columns=train_df.columns
for col,n_unique in train_df.nunique().items():
    print(f'column : {col} \t nunique : {n_unique}')

In [None]:
# Distribution of feature f1.
plt.figure(figsize=(10,10))
sns.histplot(data=train_df,x='f1',color='r')

In [None]:
# List the distinct values taken by feature f1.
train_df['f1'].unique()

In [None]:
# For every entry of `columns` except the last, overlay the train histogram
# (teal) with the test histogram (translucent azure) on a 34x3 subplot grid.
plt.figure(figsize=(10,100))
for i,col in enumerate(columns[:-1]):
    ax=plt.subplot(34,3,i+1)
    ax.hist(train_df[col],bins=25,color='teal')
    ax.hist(test_df[col],bins=25,color='azure',alpha=0.5)
    ax.set_title(f'{col}')
plt.tight_layout()

In [None]:
# scaling the data
#sr=StandardScaler()
#train_data=sr.fit_transform(train_df.iloc[:,:-1])
#test_data=sr.transform(test_df.iloc[:,1:])

In [None]:
# Remove the row identifier so it is not used as a feature.
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError because 'id' is already gone.
train_df=train_df.drop(columns=['id'],errors='ignore')

In [None]:
# Bucket the target into three quantile bins (labels 1-3); the bins are used
# later to stratify the hold-out split and the CV folds.
# 'loss' is addressed by name rather than by position so that re-running the
# cell does not try to bin the freshly appended 'split' column instead.
train_df['split']=pd.qcut(train_df['loss'],q=3,labels=[1,2,3])

In [None]:
# Hold out 4% of the rows for a final evaluation, stratified on the 'split'
# bins. The feature matrix still carries 'split' as one of its columns here.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns='loss').to_numpy(),
    train_df['loss'].to_numpy(),
    test_size=0.04,
    random_state=42,
    stratify=train_df['split'],
)

In [None]:
# Drop the trailing 'split' label column from the hold-out features.
# X_train keeps that extra column, so its width minus one is the true feature
# count; slicing to it makes the cell idempotent (a plain [:,:-1] would
# silently remove a real feature every time the cell is re-run).
X_test=X_test[:,:X_train.shape[1]-1]

In [None]:
def model_train_classic(model2,X_train,n_splits=5,random_state=42):
    """Fit ``model2`` with stratified K-fold CV and collect predictions.

    Parameters
    ----------
    model2 : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is refitted on every fold, so on return it holds the last fold's fit.
    X_train : 2-D array
        Feature matrix whose LAST column is the stratification label; that
        column is used for fold assignment and removed before fitting.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed for the fold shuffling so repeated runs give the same folds.
        Pass ``None`` for unseeded shuffling.

    Returns
    -------
    loss : list of arrays
        One prediction for the module-level ``test_df`` per fold.
    pred : numpy.ndarray
        Out-of-fold predictions aligned with the rows of ``X_train``.

    Notes
    -----
    Targets are read from the module-level ``y_train``. After the last fold
    ``eval_`` is called on the final fitted model.
    """
    fold=StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=random_state)

    label=X_train[:,-1]
    X_train=X_train[:,:-1]
    # Size the out-of-fold buffer from the data instead of a hard-coded 240000.
    pred=np.zeros(len(X_train))
    loss=[]
    for f,(train_id,test_id) in enumerate(fold.split(X_train,label)):
        print('---------------------------validation fold : ',f,'-----------------------------------')

        model2.fit(X_train[train_id],y_train[train_id])

        loss.append(model2.predict(test_df))
        pred[test_id]=model2.predict(X_train[test_id])
    eval_(model2)
    return loss,pred
def eval_(model):
    """Print hold-out shapes and RMSE for ``model``.

    Predicts on the module-level ``X_test`` and compares with ``y_test``.
    Prints the prediction and target shapes, then the root mean squared error.
    Returns nothing.
    """
    # Predict once and reuse the result for both the shape check and the metric.
    holdout_pred=model.predict(X_test)
    print(holdout_pred.shape,y_test.shape)
    # Take the square root explicitly: the `squared=False` keyword is no longer
    # accepted by recent scikit-learn releases.
    rmse=np.sqrt(mean_squared_error(y_test,holdout_pred))
    print(rmse)
def to_csv(loss,filename):
    """Average per-fold predictions and write them as a submission CSV.

    ``loss`` is a sequence of equally shaped prediction arrays; they are
    averaged element-wise, cast to int, stored in the ``loss`` column of the
    module-level ``submission_file`` and saved to ``filename`` without the index.
    """
    fold_predictions=np.array(loss)
    # NOTE(review): astype(int) truncates toward zero instead of rounding —
    # confirm this is intended for the submitted values.
    submission_file['loss']=np.mean(fold_predictions,axis=0).astype(int)
    submission_file.to_csv(filename,index=False)

In [None]:
# Remove the row identifier from the test features so they match the columns
# the model is trained on.
# errors='ignore' keeps the cell re-runnable: a second execution is a no-op
# instead of raising KeyError because 'id' is already gone.
test_df=test_df.drop(columns='id',errors='ignore')

In [None]:
#optuna xgboost tuning
def optuna_tune(trial,x=X_train,y=y_train):
    """Optuna objective: fit an XGBRegressor for one trial and return its RMSE.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    x, y : currently unused.
        NOTE(review): the data actually split for tuning is the module-level
        hold-out ``X_test`` / ``y_test``. The default ``x`` (``X_train``) still
        carries the target-derived 'split' column, so it must not be fed to the
        model as-is — confirm which data tuning is meant to use.

    Returns
    -------
    float
        RMSE on the 25% validation part of the tuning split.
    """
    param={
        'tree_method':'gpu_hist',
        'n_estimators':trial.suggest_categorical('n_estimators',[600,1000]),
        'learning_rate':trial.suggest_categorical('learning_rate',[0.1,0.2,0.3,0.4,0.5]),
        'colsample_bytree':trial.suggest_categorical('colsample_bytree',[0.5,0.75,1]),
        # Registered under its own name; it previously reused 'colsample_bytree',
        # which tied both values together and kept 'subsample' out of the
        # study's parameters.
        'subsample':trial.suggest_categorical('subsample',[0.5,0.75,1]),
        "booster":"gbtree",
        'n_jobs':-1,
        'max_depth':trial.suggest_categorical('max_depth',[6,8,10,12,14,16]),
        'eval_metric':'rmse',
    }

    xgb=XGBRegressor(**param)
    trainx, testx, trainy, testy = train_test_split(X_test, y_test, test_size=0.25, random_state=42)

    # Reports the validation RMSE of eval_set[0] to Optuna after each boosting
    # round so that unpromising trials can be pruned.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-rmse")

    # The callback was previously created but never handed to fit().
    xgb.fit(
        trainx,trainy,
        eval_set=[(testx,testy)],
        early_stopping_rounds=20,
        callbacks=[pruning_callback],
    )
    pred=xgb.predict(testx)
    # Explicit square root: the `squared=False` keyword is no longer accepted
    # by recent scikit-learn releases.
    error=np.sqrt(mean_squared_error(testy,pred))
    # NOTE(review): eval_ scores on all of X_test, which includes the rows this
    # model was just fitted on, so the printed RMSE is optimistic.
    eval_(xgb)
    return error

In [None]:
# Scratch arithmetic (evaluates to 1500); presumably 25 minutes expressed in
# seconds for a tuning time budget — TODO confirm or delete this cell.
25*60

In [None]:
#study=optuna.create_study(direction='minimize')
#study.optimize(optuna_tune,n_jobs=-1,timeout=1000)
#print('Number of finished trials:', len(study.trials))
#print('Best trial:', study.best_trial.params)

In [None]:
#study.best_params

In [None]:
# Fixed XGBoost hyperparameters for the final model — presumably taken from an
# earlier Optuna run; TODO confirm.
params=dict(
    n_estimators=1000,
    learning_rate=0.1,
    colsample_bytree=0.75,
    booster='gbtree',
    tree_method='gpu_hist',
    n_jobs=-1,
    max_depth=6,
)
model=XGBRegressor(**params)

In [None]:
# Cross-validate the model: `loss` receives one test-set prediction per fold,
# `pred` the out-of-fold predictions for the training rows.
loss,pred=model_train_classic(model,X_train)

In [None]:
# Sanity check: dimensions of the hold-out feature matrix.
X_test.shape

In [None]:
# Compare how often each (rounded) out-of-fold prediction occurs with how often
# each true loss value occurs.
m=pd.Series(np.round(pred).astype(int)).value_counts()
n=train_df.loss.value_counts()
# Put both count series on one shared integer axis. seaborn places bars by
# category position, so two barplots with different category sets would be
# drawn misaligned. Assumes losses are integer-valued — TODO confirm.
levels=np.arange(min(m.index.min(),n.index.min()),max(m.index.max(),n.index.max())+1)
m=m.reindex(levels,fill_value=0)
n=n.reindex(levels,fill_value=0)
# The legend labels were previously swapped: `m` holds the predictions and
# `n` the true losses.
sns.barplot(x=levels,y=m.values,color='teal',label='predicted')
sns.barplot(x=levels,y=n.values,alpha=0.9,color='gray',label='true')
plt.xticks(rotation=90);
plt.legend()

In [None]:
# Average the per-fold test predictions and write the submission to sub11.csv.
to_csv(loss,'sub11.csv')