In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition CSVs from the Kaggle input directory.
# The code further down expects `train`/`test` to have `sequence`, `subject`,
# `step` and sensor_00..sensor_12 columns, and `labels` to have `sequence` and
# `state` columns. `sample_sub` is the submission template whose `state` column
# is overwritten with predictions at the end of the notebook.
train = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
labels = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
sample_sub = pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv')

# Feature Engineering

In [None]:
def transform_train_data(train,labels):
    """Collapse the per-step training readings into one feature row per sequence.

    Parameters
    ----------
    train : pandas.DataFrame
        Long-format readings with the columns ``sequence``, ``subject`` and
        ``sensor_00`` ... ``sensor_12``. Other columns (e.g. ``step``) are
        ignored.
    labels : pandas.DataFrame
        Must contain the columns ``sequence`` and ``state``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sequence`` (ascending). Columns, in this order:

        * 13 per-sequence means: ``sensor_00_mean`` ... ``sensor_12_mean``
          (the third one is spelled ``sensor02_mean``),
        * 13 per-sequence maxima: ``sensor_0_max`` ... ``sensor_12_max``,
        * 13 per-sequence minima: ``sensor_0_min`` ... ``sensor_12_min``,
        * ``state``: the per-sequence minimum of the merged label,
        * ``subject_freqent``: 1 if the sequence's subject owns at least 50
          sequences in this frame, else 0,
        * ``constant_sensor``: 1 if any of sensors 0..11 has min == max
          within the sequence, else 0.

    Raises
    ------
    KeyError
        If a required column is missing from ``train`` or ``labels``.
    """
    sensor_ids = range(13)
    raw_cols = [f'sensor_{k:02d}' for k in sensor_ids]

    # Join on the `sequence` column itself. Matching `sequence` against the
    # positional index of `labels` only works while row i of `labels` happens
    # to describe sequence i.
    merged = train.merge(labels[['sequence', 'state']], on='sequence')
    per_sequence = merged.groupby('sequence')

    means = per_sequence[raw_cols].mean()
    maxs = per_sequence[raw_cols].max()
    mins = per_sequence[raw_cols].min()

    # Column names (including the `sensor02_mean` and `subject_freqent`
    # spellings) are part of the contract with transform_test_data and the
    # fitted model, so they are reproduced exactly.
    means.columns = [f'{col}_mean' for col in raw_cols]
    means = means.rename(columns={'sensor_02_mean': 'sensor02_mean'})
    maxs.columns = [f'sensor_{k}_max' for k in sensor_ids]
    mins.columns = [f'sensor_{k}_min' for k in sensor_ids]

    df_train = pd.concat([means, maxs, mins], axis=1)
    df_train['state'] = per_sequence['state'].min()

    # Number of sequences owned by each sequence's subject.
    subject = per_sequence['subject'].min()
    sequences_per_subject = subject.map(subject.value_counts())
    df_train['subject_freqent'] = (sequences_per_subject >= 50).astype('int64')

    # NOTE(review): only sensors 0..11 are inspected, sensor 12 is not. This
    # looks like an off-by-one, but it must stay in step with
    # transform_test_data, so the window is kept as-is here.
    flat = mins.to_numpy()[:, :12] == maxs.to_numpy()[:, :12]
    df_train['constant_sensor'] = flat.any(axis=1).astype('int64')

    return df_train

In [None]:
def transform_test_data(test):
    """Collapse the per-step test readings into one feature row per sequence.

    Parameters
    ----------
    test : pandas.DataFrame
        Long-format readings with the columns ``sequence``, ``subject`` and
        ``sensor_00`` ... ``sensor_12``. Other columns (e.g. ``step``) are
        ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sequence`` (ascending). Columns, in this order:

        * 13 per-sequence means: ``sensor_00_mean`` ... ``sensor_12_mean``
          (the third one is spelled ``sensor02_mean``),
        * 13 per-sequence maxima: ``sensor_0_max`` ... ``sensor_12_max``,
        * 13 per-sequence minima: ``sensor_0_min`` ... ``sensor_12_min``,
        * ``subject_freqent``: 1 if the sequence's subject owns at least 50
          sequences in this frame, else 0,
        * ``constant_sensor``: 1 if any of sensors 0..11 has min == max
          within the sequence, else 0.

    Raises
    ------
    KeyError
        If a required column is missing from ``test``.
    """
    sensor_ids = range(13)
    raw_cols = [f'sensor_{k:02d}' for k in sensor_ids]

    per_sequence = test.groupby('sequence')
    means = per_sequence[raw_cols].mean()
    maxs = per_sequence[raw_cols].max()
    mins = per_sequence[raw_cols].min()

    # Column names (including the `sensor02_mean` and `subject_freqent`
    # spellings) must match what transform_train_data produces, so they are
    # reproduced exactly.
    means.columns = [f'{col}_mean' for col in raw_cols]
    means = means.rename(columns={'sensor_02_mean': 'sensor02_mean'})
    maxs.columns = [f'sensor_{k}_max' for k in sensor_ids]
    mins.columns = [f'sensor_{k}_min' for k in sensor_ids]

    df_test = pd.concat([means, maxs, mins], axis=1)

    # Number of sequences owned by each sequence's subject.
    subject = per_sequence['subject'].min()
    sequences_per_subject = subject.map(subject.value_counts())
    df_test['subject_freqent'] = (sequences_per_subject >= 50).astype('int64')

    # Computed for whatever sequence ids are present instead of looping over
    # the hard-coded id range 25968..38185, which only fits one specific file.
    # NOTE(review): only sensors 0..11 are inspected, sensor 12 is not. This
    # looks like an off-by-one, but it must stay in step with
    # transform_train_data, so the window is kept as-is here.
    flat = mins.to_numpy()[:, :12] == maxs.to_numpy()[:, :12]
    df_test['constant_sensor'] = flat.any(axis=1).astype('int64')

    return df_test

In [None]:
# Build the per-sequence feature tables used for modelling.
train_df = transform_train_data(train,labels)
test_df = transform_test_data(test)

In [None]:
# Preview the first rows of the engineered training features.
train_df.head()

In [None]:
# Preview the first rows of the engineered test features.
test_df.head()

In [None]:
# Split the target off the feature matrix.
# Guarded so the cell can be re-run: once `state` has been removed from
# train_df, a second run is a no-op instead of raising KeyError.
if 'state' in train_df.columns:
    y_train = train_df['state']
    train_df = train_df.drop(columns='state')

# Boosted Trees and Random Forest Models

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
def get_best_model(train,y_train):
    """Cross-validate four classifiers and report their mean ROC AUC.

    Each of RandomForest (5000 trees), CatBoost (silent logging), AdaBoost
    (5000 estimators) and XGBoost (defaults) is scored with
    ``cross_val_score(..., cv=5, scoring='roc_auc')`` on ``train``/``y_train``.

    Returns
    -------
    str
        One line of the form
        ``'Random Forest:<auc>, CatBoost:<auc>, AdaBoost:<auc>, XGBoost:<auc>'``
        where each ``<auc>`` is the mean over the 5 folds.
    """
    # Evaluation order matches the order of the names in the returned string.
    estimators = (
        RandomForestClassifier(n_estimators=5000),
        CatBoostClassifier(logging_level='Silent'),
        AdaBoostClassifier(n_estimators=5000),
        XGBClassifier(),
    )

    rf_auc, cb_auc, ab_auc, xb_auc = (
        np.mean(cross_val_score(estimator, train, y_train, cv=5, scoring='roc_auc'))
        for estimator in estimators
    )

    return f'Random Forest:{rf_auc}, CatBoost:{cb_auc}, AdaBoost:{ab_auc}, XGBoost:{xb_auc}'

In [None]:
#get_best_model(train_df,y_train)

In [None]:
def get_params(df):
    """Return the ``params`` entry of the best-scoring row of a CV results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a numeric ``mean_test_score`` column and a ``params``
        column, e.g. ``pd.DataFrame(grid_search.cv_results_)``.

    Returns
    -------
    object
        The ``params`` value of the row with the highest ``mean_test_score``.
        The search in this notebook is scored with ROC AUC, where larger is
        better, so the maximum (not the minimum) is the row to keep. NaN
        scores are ignored; on ties the first such row wins.

    Raises
    ------
    ValueError
        If ``df`` has no rows or every score is NaN.
    KeyError
        If either required column is missing.
    """
    # Work positionally so the lookup does not depend on df's index labels.
    scores = df['mean_test_score'].reset_index(drop=True)
    if scores.isna().all():
        raise ValueError('no valid mean_test_score to select parameters from')
    return df['params'].iloc[scores.idxmax()]

In [None]:
#xb = GridSearchCV(XGBClassifier(n_estimators=1000,verbosity=0),{
#    'learning_rate' : [1,1.5,2],
#    'gamma' : [0.5,1,1.5],
#    'reg_lambda' : [0.5,1,1.5]
    
#}, scoring ='roc_auc',return_train_score=False)

#xb.fit(train_df,y_train,verbose=None)

#new_df_xb = pd.DataFrame(xb.cv_results_)

In [None]:
# Exhaustive search over 4 x 4 x 4 x 4 = 256 CatBoost settings, scored by
# ROC AUC with GridSearchCV's default cross-validation splitting.
cat_param_grid = {
    'depth': [2, 3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'random_strength': [1.0, 2.0, 3.0, 4.0],
    'min_data_in_leaf': [2, 3, 4, 5],
}

cat = GridSearchCV(
    estimator=CatBoostClassifier(iterations=1000, verbose=0),
    param_grid=cat_param_grid,
    scoring='roc_auc',
    return_train_score=False,
)

# `verbose=None` is forwarded to the estimator's fit().
cat.fit(train_df, y_train, verbose=None)

# One row per parameter combination, including `params` and `mean_test_score`.
new_df_cat = pd.DataFrame(cat.cv_results_)

In [None]:
# Pick one parameter combination from the grid-search results via get_params
# and display it.
params = get_params(new_df_cat)
params

In [None]:
# Unpack the selected hyper-parameters.
learning_rate = params['learning_rate']
depth = params['depth']
random_strength = params['random_strength']
min_data_in_leaf = params['min_data_in_leaf']

# Refit on the full training set with the selected settings, using 8000
# boosting iterations instead of the 1000 used during the grid search.
model = CatBoostClassifier(
    iterations=8000,
    verbose=0,
    depth=depth,
    learning_rate=learning_rate,
    random_strength=random_strength,
    min_data_in_leaf=min_data_in_leaf,
)

model.fit(train_df, y_train)

# Model selection in this notebook is scored with ROC AUC, which depends on
# how the scores rank the rows. Hard 0/1 labels from predict() throw that
# ranking away, so emit the probability of the second class in
# model.classes_ (state == 1 for 0/1 labels) instead.
# NOTE(review): assumes the submission is scored the same way as the grid
# search (ROC AUC) - confirm against the competition rules.
predictions = model.predict_proba(test_df)[:, 1]

In [None]:
# Overwrite the template's `state` column with the model output.
# NOTE(review): this relies on sample_sub's rows being in the same order as
# test_df's (one row per sequence, sorted by sequence id) - confirm.
sample_sub['state'] = predictions

In [None]:
# Write the submission file to the working directory, without the index column.
sample_sub.to_csv('submission.csv',index=False)