In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.stats

from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier

**This notebook builds on the following discussion by AmbrosM:**
https://www.kaggle.com/competitions/tabular-playground-series-apr-2022/discussion/318527

and on his notebook: https://www.kaggle.com/code/ambrosm/tpsapr22-best-model-without-nn/notebook



In [None]:
# Read the sequence labels and the raw train/test sensor readings from the
# Kaggle competition input directory.
labels = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
train_df = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')

In [None]:
# Stack the train and test rows into one frame (train rows first).
full = pd.concat((train_df, test_df), axis=0)

In [None]:
# Map each subject to the number of non-null `sequence` entries it has in the
# combined train + test frame. A single groupby pass replaces the previous
# per-subject `query`, which rescanned the whole frame once per subject.
dic = full.groupby('subject')['sequence'].count().to_dict()


In [None]:
# Attach the per-subject row count (from `dic`) to both frames.
for frame in (train_df, test_df):
    frame['count'] = frame['subject'].map(dic)

In [None]:
# Join the labels onto every training row of the matching sequence.
train_df = train_df.merge(labels, on='sequence')

In [None]:
# Row-wise sum and standard deviation across all sensor columns.
sensors = [col for col in train_df.columns if 'sensor_' in col]
for frame in (train_df, test_df):
    readings = frame[sensors]
    frame['sensor_total'] = readings.sum(axis=1)
    frame['sensor_std'] = readings.std(axis=1)

In [None]:
# sensor_02 lag feature: difference to the previous row, with the value at
# step 0 forced to 0.
for frame in (train_df, test_df):
    frame['sensor_02_diff'] = frame['sensor_02'].diff(1)
    frame.loc[frame['step'] == 0, 'sensor_02_diff'] = 0



In [None]:
#count up, down, remain movement from sensor_02_diff
def count_df(df):
    """Add per-(subject, sequence) counts of sensor_02 movements.

    For every (subject, sequence) group, counts how many rows have a
    positive, negative, or zero ``sensor_02_diff`` and attaches these counts
    to every row of the group as ``sensor_02_up``, ``sensor_02_down`` and
    ``sensor_02_stay``. NaN differences are not counted in any bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``subject``, ``sequence`` and
        ``sensor_02_diff``.

    Returns
    -------
    pandas.DataFrame
        A new frame (inner merge on the group keys) with the three count
        columns appended.
    """
    keys = ['subject', 'sequence']
    diff = df['sensor_02_diff']

    # Vectorised comparisons cast to integers so a plain grouped sum yields
    # the counts; this avoids calling a Python lambda once per group.
    counts = (
        df[keys]
        .assign(
            sensor_02_up=diff.gt(0).astype('int64'),
            sensor_02_down=diff.lt(0).astype('int64'),
            sensor_02_stay=diff.eq(0).astype('int64'),
        )
        .groupby(keys)
        .sum()
    )

    # `counts` is indexed by the group keys, so one merge attaches all three
    # columns in the order up, down, stay.
    return df.merge(counts, on=keys)

# Add the sensor_02 up/down/stay counts to both frames.
train_df, test_df = count_df(train_df), count_df(test_df)

In [None]:
import seaborn as sns
# One scatter plot per sensor channel (sensor_00 .. sensor_12) for subject 47,
# coloured by state.
subject_47 = train_df.query('subject==47')
for i in range(13):
    plt.figure(figsize=(10, 10))
    sensor_col = f'sensor_{i:02d}'
    sns.scatterplot(x='step', y=sensor_col, data=subject_47, hue='state')
    plt.title(f'Sensor {i}')

In [None]:
# Number of distinct sequences recorded for subject 47.
subject_47['sequence'].nunique()

# Sensor 2

In [None]:
# One plot colour per sequence of subject 47: blue when the sequence's
# minimum state is 0, red otherwise.
state_per_sequence = subject_47.groupby('sequence')['state'].min()
colors = ['b' if state == 0 else 'r' for state in state_per_sequence]

In [None]:
# sensor_02 is unique. Let's have a look into it in detail:
# one subplot per sequence of subject 47, steps on the x-axis.
sensor_02_by_sequence = pd.pivot_table(subject_47, index='step', columns='sequence', values='sensor_02')
sensor_02_by_sequence.plot(subplots=True, layout=(10, 11), figsize=(50, 70), color=colors)
# The colour key applies to every subplot, so attach it to the figure;
# plt.title would only label the current (last) axes.
plt.suptitle('Blue(status 0) Red(status 1)')
plt.show()



# Sensor 11

In [None]:
# Same per-sequence view for sensor_11: one subplot per sequence of subject 47.
sensor_11_by_sequence = pd.pivot_table(subject_47, index='step', columns='sequence', values='sensor_11')
sensor_11_by_sequence.plot(subplots=True, layout=(10, 11), figsize=(50, 70), color=colors)
# The colour key applies to every subplot, so attach it to the figure;
# plt.title would only label the current (last) axes.
plt.suptitle('Blue(status 0) Red(status 1)')
plt.show()


# Extra Feature Engineering

In [None]:
def first_last_difference(df):
    """Return the last row (or element) of ``df`` minus its first one.

    Selection is positional (``iloc``), so the index labels are irrelevant.
    """
    first = df.iloc[0]
    last = df.iloc[-1]
    return last - first

# Per (subject, sequence): last-row minus first-row value of every column
# whose name contains 'sensor_'.
sensors = [col for col in train_df.columns if 'sensor_' in col]
group_keys = ['subject', 'sequence']

first_last_tr = train_df.groupby(group_keys)[sensors].apply(first_last_difference)
first_last_ts = test_df.groupby(group_keys)[sensors].apply(first_last_difference)



In [None]:
# Rename the first/last difference columns from 'sensor...' to 'fl...' so
# they do not collide with the original columns when merged back.
first_last_tr.columns = [i.replace('sensor','fl') for i in sensors]
first_last_ts.columns = [i.replace('sensor','fl') for i in sensors]

train_df = train_df.merge(first_last_tr,how='left',on=['sequence','subject'])
# The test rows must receive the differences computed from the test set
# (first_last_ts); merging the train-derived frame here gave the test set
# features that do not belong to its own sequences.
test_df = test_df.merge(first_last_ts,how='left',on=['sequence','subject'])

In [None]:
# Display the per-(subject, sequence) first/last differences of the training set.
first_last_tr

In [None]:
# Collapse the training rows to one row per sequence with five statistics per
# column; the anonymous kurtosis aggregate is renamed from '<lambda_0>' to 'kurt'.
train_stat_funcs = ['mean', 'min', 'max', 'std', lambda x: scipy.stats.kurtosis(x)]
train_df = train_df.groupby('sequence').agg(train_stat_funcs)
train_df = train_df.rename(columns={'<lambda_0>': 'kurt'})

In [None]:
# Collapse the test rows to one row per sequence with five statistics per
# column; the anonymous kurtosis aggregate is renamed from '<lambda_0>' to 'kurt'.
test_stat_funcs = ['mean', 'min', 'max', 'std', lambda x: scipy.stats.kurtosis(x)]
test_df = test_df.groupby('sequence').agg(test_stat_funcs)
test_df = test_df.rename(columns={'<lambda_0>': 'kurt'})

In [None]:
# Lift the column display limit so every aggregated column is shown.
pd.options.display.max_columns = None
train_df

# Feature Selection

In [None]:
# (column, aggregate) pairs used as model inputs, in a fixed order:
#   1. the subject id,
#   2. five statistics for each raw sensor sensor_00 .. sensor_12,
#   3. four statistics of the sensor_02 lag feature,
#   4. the sensor_02 up/down/stay counts,
#   5. min/max of the row-wise sensor total and std, plus the subject row count,
#   6. first/last differences fl_01 .. fl_12 (fl_00 is not included).
FEATURES = (
    [('subject', 'min')]
    + [
        (f'sensor_{idx:02d}', stat)
        for idx in range(13)
        for stat in ('mean', 'min', 'max', 'std', 'kurt')
    ]
    + [('sensor_02_diff', stat) for stat in ('min', 'max', 'std', 'kurt')]
    + [(f'sensor_02_{move}', 'min') for move in ('up', 'down', 'stay')]
    + [
        ('sensor_total', 'min'),
        ('sensor_total', 'max'),
        ('sensor_std', 'min'),
        ('sensor_std', 'max'),
        ('count', 'min'),
    ]
    + [(f'fl_{idx:02d}', 'min') for idx in range(1, 13)]
)

# Per-sequence mean of the state label.
TARGET = [('state', 'mean')]

In [None]:
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score,accuracy_score,log_loss
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
# Hold out 20% of the sequences for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    train_df[FEATURES],
    train_df[TARGET],
    test_size=0.2,
    random_state=4,
)

# Hyperparameter Tuning

In [None]:
from functools import partial
import optuna
import warnings
warnings.filterwarnings('ignore')

def objective(trial, X, y, name='xgb'):
    """Optuna objective: fit an XGBClassifier and return its hold-out ROC AUC.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters below.
    X, y :
        Training features and target the model is fitted on. These were
        previously ignored in favour of the globals ``X_train``/``y_train``;
        callers that pass ``X=X_train, y=y_train`` see no change.
    name : str
        Unused; kept for backward compatibility.

    Returns
    -------
    float
        ROC AUC on the module-level hold-out set ``X_test``/``y_test``,
        rounded to 5 decimals. The same hold-out set is used for early
        stopping.
    """
    params = {
        'objective': 'binary:logistic',
        'tree_method': 'gpu_hist',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical(
            'colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        ),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical(
            'learning_rate', [0.2, 0.1, 0.05, 0.02, 0.01]
        ),
        'n_estimators': trial.suggest_categorical(
            'n_estimators', [500, 800, 1000, 1500, 1800]
        ),
        'max_depth': trial.suggest_categorical(
            'max_depth', [4, 5, 7, 9, 11, 13, 15, 17]
        ),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'eval_metric': 'auc',
    }

    model = XGBClassifier(**params)
    # Fit on the data handed to the objective; validation and early stopping
    # use the global hold-out split.
    model.fit(X, y, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False)

    train_score = np.round(roc_auc_score(y, model.predict_proba(X)[:, 1]), 5)
    test_score = np.round(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]), 5)

    print(f'TRAIN ROC : {train_score} || TEST ROC : {test_score}')

    return test_score

In [None]:
#%%time
#optimize = partial(objective,X=X_train,y=y_train)

#study_xgb = optuna.create_study(direction ='maximize')
#study_xgb.optimize(optimize,n_trials=150)


#Trial 114 finished with value: 0.94771 and parameters: {'lambda': 0.018573716089808828, 'alpha': 0.003257281674982165, 'colsample_bytree': 0.5, 'subsample': 1.0, 'learning_rate': 0.02, 'n_estimators': 1500, 'max_depth': 13, 'min_child_weight': 10}.
#Best is trial 114 with value: 0.94771.

In [None]:
# Hyperparameters reported for the best Optuna trial (trial 114, AUC 0.94771).
parameters = {
    'tree_method': 'gpu_hist',
    'lambda': 0.018573716089808828,
    'alpha': 0.003257281674982165,
    'colsample_bytree': 0.5,
    'subsample': 1.0,
    'learning_rate': 0.02,
    'n_estimators': 1500,
    'max_depth': 13,
    'min_child_weight': 10,
}

In [None]:
# Fit with the tuned parameters and score the hold-out split by ROC AUC.
model = XGBClassifier(**parameters)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_test)
positive_proba = y_pred[:, 1]

roc_auc_score(y_test, positive_proba)

In [None]:
from sklearn.model_selection import GroupKFold

# 7-fold cross-validation grouped by subject. Each fold's model also
# predicts the test set; those predictions are collected per fold.
scores = []
predictions = []

groups = train_df[('subject', 'min')].values
group_kfold = GroupKFold(n_splits=7)
for fold, (train_index, test_index) in enumerate(group_kfold.split(train_df, groups=groups)):
    fold_train = train_df.iloc[train_index]
    fold_valid = train_df.iloc[test_index]
    X_train, y_train = fold_train[FEATURES], fold_train[TARGET]
    X_test, y_test = fold_valid[FEATURES], fold_valid[TARGET]

    model = XGBClassifier(**parameters)
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_test)

    score = roc_auc_score(y_test, y_pred[:, 1])
    scores.append(score)
    print(fold," FOLD ROC_AUC :",score)

    # Probability of the positive class for every test sequence.
    predictions.append(model.predict_proba(test_df[FEATURES])[:, 1])

print("Average Score :", np.mean(scores))

In [None]:
# Keep only the test predictions from folds 1, 2, 5 and 6.
ls = [predictions[fold_id] for fold_id in (1, 2, 5, 6)]

In [None]:
# Element-wise mean of the selected folds' test predictions.
predictions = np.mean(ls, axis=0)

# Making a Submission File

In [None]:
# Start from the sample submission (indexed by sequence) and fill in the
# averaged predictions positionally.
sub = pd.read_csv(
    '../input/tabular-playground-series-apr-2022/sample_submission.csv',
    index_col='sequence',
)
sub['state'] = predictions

In [None]:
# Write the submission file, including the sequence index.
sub.to_csv(path_or_buf='submission.csv')