# Tabular Playground Series - April 2022

![](https://storage.googleapis.com/kaggle-competitions/kaggle/33104/logos/header.png)

## **Steps:**
* Objectives
* Data Understanding
* Data Preprocessing
* Model Generation
* Optimization
* Predictions and Submission

# **Objectives**
* The objective is to determine which state a participant was in from the sensor data; this is a time series classification problem.

# **Data Understanding**

* Sixty-second sequences of biological sensor data recorded from several hundred participants who could have been in either of two possible activity states

**Files and Field Descriptions**

* train.csv - the training set, comprising ~26,000 60-second recordings of thirteen biological sensors for almost one thousand experimental participants
    1. sequence - a unique id for each sequence
    2. subject - a unique id for the subject in the experiment
    3. step - time step of the recording, in one second intervals
    4. sensor_00 - sensor_12 - the value for each of the thirteen sensors at that time step


* train_labels.csv - the class label for each sequence.
    1. sequence - the unique id for each sequence.
    2. state - the state associated to each sequence. This is the target which you are trying to predict.
    

* test.csv - the test set. For each of the ~12,000 sequences, you should predict a value for that sequence's state.
* sample_submission.csv - a sample submission file in the correct format.

In [None]:
# Importing necessary libraries

import numpy as np
import pandas as pd
import random
import os

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score,f1_score,roc_auc_score

import optuna

from warnings import simplefilter
simplefilter("ignore")

print("Imported Necessary Libraries")

In [None]:
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the three competition CSV files into data frames
INPUT_DIR = '/kaggle/input/tabular-playground-series-apr-2022'
train_labels_data, train_data, test_data = (
    pd.read_csv(f'{INPUT_DIR}/{csv_name}')
    for csv_name in ('train_labels.csv', 'train.csv', 'test.csv')
)

In [None]:
# Display five randomly chosen rows of the training labels
train_labels_data.sample(5)

In [None]:
# Display five randomly chosen rows of the training data
train_data.sample(5)

In [None]:
# Display five randomly chosen rows of the test data
test_data.sample(5)

In [None]:
# Report the (rows, columns) shape of each loaded data frame
for label, frame in (
    ('Train labels data', train_labels_data),
    ('Train data', train_data),
    ('Test data', test_data),
):
    print(f'{label} shape: {frame.shape}')

In [None]:
# Column dtypes, non-null counts and memory usage of the training data
train_data.info()

In [None]:
# Downcast every float64 column to float32 to reduce memory usage,
# then print the info again to see the effect
float64_cols = train_data.select_dtypes(np.float64).columns
train_data[float64_cols] = train_data[float64_cols].astype(np.float32)
train_data.info()

In [None]:
# Basic statistics of the train data, transposed so each column becomes a row
train_data.describe().T

In [None]:
# Bar chart of how many sequences carry each target state, with the count on each bar
fig, ax = plt.subplots(figsize=(6,5))
sns.countplot(data=train_labels_data, x='state', ax=ax)
ax.bar_label(ax.containers[0])
plt.show()

The target classes are nearly balanced: there is only a minor difference between their counts.

In [None]:
SEED = 5 # Seed value for reproducing the same random sequence selection

In [None]:
# Box plots of every sensor for one randomly picked sequence

SENSOR_COUNT = 13 # Thirteen sensors used for measurements

random.seed(SEED) # Fix the seed so the same sequence is drawn on every run
seq_lo, seq_hi = train_data['sequence'].min(), train_data['sequence'].max()
random_sequence = random.randint(seq_lo, seq_hi)

# Rows of the chosen sequence only
df = train_data[train_data['sequence']==random_sequence]
# A sequence appears to belong to a single subject, so the first value is used
subject_number = df['subject'].unique()[0]

plt.figure(figsize=(16,14))
sensor_names = (f'sensor_{n:02d}' for n in range(SENSOR_COUNT))
for position, sensor in enumerate(sensor_names, start=1):
    plt.subplot(6,3,position)
    sns.boxplot(data=df,y=sensor)
plt.suptitle(f'Box Plots of Sensors:{random_sequence} and Subject: {subject_number}')
plt.show()

In [None]:
# Kernel density plots of every sensor for one randomly picked sequence

SENSOR_COUNT = 13 # Thirteen sensors used for measurements

random.seed(SEED) # Fix the seed so the same sequence is drawn on every run
seq_lo, seq_hi = train_data['sequence'].min(), train_data['sequence'].max()
random_sequence = random.randint(seq_lo, seq_hi)

# Rows of the chosen sequence only
df = train_data[train_data['sequence']==random_sequence]
# A sequence appears to belong to a single subject, so the first value is used
subject_number = df['subject'].unique()[0]

plt.figure(figsize=(16,24))
sensor_names = (f'sensor_{n:02d}' for n in range(SENSOR_COUNT))
for position, sensor in enumerate(sensor_names, start=1):
    plt.subplot(6,3,position,aspect='auto')
    sns.kdeplot(data=df,x=sensor)
plt.suptitle(f'Kde Plots of Sensors:{random_sequence} and Subject: {subject_number}')
plt.show()

In [None]:
# Line charts of every sensor over the time steps of one randomly picked sequence

SENSOR_COUNT = 13 # Thirteen sensors used for measurements

random.seed(SEED) # Fix the seed so the same sequence is drawn on every run
seq_lo, seq_hi = train_data['sequence'].min(), train_data['sequence'].max()
random_sequence = random.randint(seq_lo, seq_hi)

# Rows of the chosen sequence only
df = train_data[train_data['sequence']==random_sequence]
# A sequence appears to belong to a single subject, so the first value is used
subject_number = df['subject'].unique()[0]

plt.figure(figsize=(16,12))
sensor_names = (f'sensor_{n:02d}' for n in range(SENSOR_COUNT))
for position, sensor in enumerate(sensor_names, start=1):
    plt.subplot(6,3,position)
    sns.lineplot(data=df,x='step',y=sensor)
plt.suptitle(f'Time Series Chart of Sensors for Sequence:{random_sequence} and Subject: {subject_number}')
plt.show()

In [None]:
# Line charts of every sensor for 5 randomly picked sequences:
# one grid row per sensor, one grid column per sequence

SENSOR_COUNT = 13 # Thirteen sensors used for measurements

random.seed(SEED) # Fix the seed so the same sequences are drawn on every run
random_sequence = [
    random.randint(train_data['sequence'].min(), train_data['sequence'].max())
    for _ in range(5)
]

# Rows belonging to any of the chosen sequences
df = train_data[train_data['sequence'].isin(random_sequence)]

plt.figure(figsize=(26,24))

for row, sensor in enumerate(f'sensor_{n:02d}' for n in range(SENSOR_COUNT)):
    for column, sequence in enumerate(random_sequence, start=1):
        # Subplot positions are numbered row by row, five per row
        plt.subplot(13,5,row*5+column)
        sns.lineplot(data=df[df['sequence']==sequence],x='step',y=sensor)

plt.suptitle(f'Time Series Chart of Sensors for Sequence:{random_sequence}')
plt.show()

My understanding from the charts is that no particular relationship exists across the sensors; they seem to be independent of one another.

In [None]:
# Scatter plot between Subject and Sensor values, coloured by the state of each row's sequence

# train_labels_data has one row per sequence, whereas train_data has one row per
# time step. Passing train_labels_data.state directly as hue therefore does not
# line up with the plotted rows, so the state is looked up per row via 'sequence'.
state_by_sequence = train_labels_data.set_index('sequence')['state']
row_state = train_data['sequence'].map(state_by_sequence).rename('state')

plt.figure(figsize=(16,12))
for i in range(SENSOR_COUNT):
    sensor = 'sensor_'+str(i).zfill(2)
    plt.subplot(6,3,i+1)
    sns.scatterplot(x=train_data.subject,y=train_data[sensor],hue=row_state,palette='Dark2')
plt.suptitle('Scatter Plot between Subject and Sensor Values')
plt.show()

In [None]:
# Scatter plot between Sequence and Sensor values, coloured by the state of each row's sequence

# train_labels_data has one row per sequence, whereas train_data has one row per
# time step. Passing train_labels_data.state directly as hue therefore does not
# line up with the plotted rows, so the state is looked up per row via 'sequence'.
state_by_sequence = train_labels_data.set_index('sequence')['state']
row_state = train_data['sequence'].map(state_by_sequence).rename('state')

plt.figure(figsize=(16,12))
for i in range(SENSOR_COUNT):
    sensor = 'sensor_'+str(i).zfill(2)
    plt.subplot(6,3,i+1)
    sns.scatterplot(x=train_data.sequence,y=train_data[sensor],hue=row_state,palette='Dark2')
plt.suptitle('Scatter Plot between Sequence and Sensor Values')
plt.show()

* The values of Sensor_04, Sensor_05, Sensor_10 and Sensor_12 seem to be more scattered than those of the other sensors.

# **Data Preprocessing**

In [None]:
# Count the missing (NaN) values in each column of the training data
train_data.isna().sum()

In [None]:
# Statistics computed for every column when the rows are grouped and aggregated
agg_strategy = ['mean','median','sum','max','var']

In [None]:
# Collapse the training data to one row per ('sequence', 'subject') pair,
# computing every statistic in agg_strategy for each remaining column
per_sequence_train = train_data.groupby(['sequence','subject']).agg(agg_strategy)
# Flatten the two-level (column, statistic) header into names such as 'sensor_00_mean'
per_sequence_train.columns = ["_".join(pair) for pair in per_sequence_train.columns]
# Turn the group keys back into ordinary columns
group_df_train = per_sequence_train.reset_index()
group_df_train.head()

In [None]:
# Columns excluded from the model features: the two group keys plus
# every aggregated column whose name contains 'step'
cols_to_drop = ['sequence','subject'] + [
    name for name in group_df_train.columns if 'step' in name
]
cols_to_drop

In [None]:
# Final features and target
X = group_df_train.drop(cols_to_drop,axis=1)

# Attach each label by its sequence id rather than by row position, so the target
# stays correct even if train_labels_data is ordered differently from the
# grouped features.
y = group_df_train['sequence'].map(train_labels_data.set_index('sequence')['state']).rename('state')
if y.isna().any():
    raise ValueError('Some training sequences have no entry in train_labels_data')

In [None]:
# Apply the same per-('sequence', 'subject') aggregation to the test data
per_sequence_test = test_data.groupby(['sequence','subject']).agg(agg_strategy)
# Flatten the two-level (column, statistic) header into single names
per_sequence_test.columns = ["_".join(pair) for pair in per_sequence_test.columns]
group_df_test = per_sequence_test.reset_index()

# Keep the sequence ids for the submission file before they are dropped from the features
X_test_sequence = group_df_test['sequence']
X_test = group_df_test.drop(columns=cols_to_drop)

In [None]:
# Split the features and target into training and validation parts (20% validation).
# shuffle=False keeps the rows in their existing order instead of sampling at random.
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm against the scikit-learn docs.
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2,random_state=42,shuffle=False)

In [None]:
# Report the shape of each train / validation / test part
for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_valid', X_valid),
    ('y_valid', y_valid),
    ('X_test', X_test),
):
    print(f'{label} shape: {part.shape}')

# **Model Generation**

In [None]:
# Fit a classifier and print its train, validation and cross-validation scores
def model_gen(model,model_name,X_train,X_valid,y_train,y_valid):
    """Fit ``model`` on the training split and print a score report.

    The report contains accuracy, F1 and ROC AUC on both the training and
    validation splits, followed by the mean and standard deviation of a
    5-fold cross-validated ROC AUC computed on the training split.

    Args:
        model: Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
            It is fitted in place.
        model_name: Label printed at the top of the report.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.

    Returns:
        None. Everything is printed.
    """
    model.fit(X_train,y_train)

    # (section header, label used in the score lines, true labels,
    #  predicted classes, predicted probability of the second class)
    sections = [
        (header, label, truth, model.predict(features), model.predict_proba(features)[:,1])
        for header, label, features, truth in (
            ('Training Data Scores:', 'Train', X_train, y_train),
            ('Validation Data Scores:', 'Validation', X_valid, y_valid),
        )
    ]

    cv_scores = cross_val_score(model,X_train,y_train,cv=5,scoring='roc_auc')

    print(f'Model: {model_name}')
    for header, label, truth, predicted, second_class_proba in sections:
        print(header)
        print(f"{label} data accuracy score: {round(accuracy_score(truth,predicted),4)}")
        print(f"{label} data f1 score: {round(f1_score(truth,predicted),4)}")
        print(f"Area under the ROC curve for {label} data Probability Predictions: {round(roc_auc_score(truth,second_class_proba),4)}")
        print('\n')

    print('Cross Validation Scores on metric roc_auc:')
    print(f'Mean value of scores: {round(np.mean(cv_scores),4)}')
    print(f'Standard Deviation of scores: {round(np.std(cv_scores),4)}')
    print('\n********************************************************************************\n')

In [None]:
# Getting model scores of Random Forest, XGB and LGBM
models = {
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'XGB Classifier': XGBClassifier(random_state=42,verbosity=0),
    'LGBM Classifier': LGBMClassifier(random_state=42),
}

# model_gen only prints its report, so its result is not assigned
# (assigning it would overwrite the loop variable model_name).
for model_name,model in models.items():
    model_gen(model=model,model_name=model_name,X_train=X_train,X_valid=X_valid,y_train=y_train,y_valid=y_valid)

The LGBM classifier is used for further tuning, as it does not seem to be overfitting.

# **Optimization**

In [None]:
# Function for training and scoring one candidate model inside the study
def train_model_for_study(model,X,y):
    """Fit ``model`` with early stopping and return its validation ROC AUC.

    The data is split 80/20 without shuffling. The model is trained on the
    first part and evaluated on the second, which also serves as the
    early-stopping evaluation set.

    Early stopping and log silencing go through LightGBM callbacks rather
    than the ``early_stopping_rounds`` / ``verbose`` arguments of ``fit``,
    because LightGBM 4.0 no longer accepts those arguments.

    Args:
        model: LightGBM scikit-learn style classifier to fit.
        X: Feature table.
        y: Target labels matching the rows of ``X``.

    Returns:
        ROC AUC of the predicted probability of the second class on the
        validation part.
    """
    # Local import: the notebook only imports LGBMClassifier at the top level
    from lightgbm import early_stopping, log_evaluation

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                          random_state=42,shuffle=False)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            # Stop when the validation metric has not improved for 300 rounds
            early_stopping(stopping_rounds=300, verbose=False),
            # period=0 disables the per-iteration evaluation log
            log_evaluation(period=0),
        ],
    )

    y_pred_prob = model.predict_proba(X_valid)
    return roc_auc_score(y_valid,y_pred_prob[:,1])

In [None]:
# Objective function
def objective_lgbc(trial):
    """Optuna objective: build an LGBM classifier from the trial's suggestions and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation ROC AUC returned by ``train_model_for_study``, which the
        study maximises.
    """
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 100, 10000),
        'num_leaves':trial.suggest_int("num_leaves",25,100),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
        'max_depth': trial.suggest_int("max_depth", 2, 15),
        'min_child_samples':trial.suggest_int("min_child_samples",10,50),
        # NOTE(review): n_jobs only sets the thread count; it is kept in the
        # search space to leave the sampled parameter names unchanged.
        'n_jobs':trial.suggest_int("n_jobs",1,10)
    }

    # The sampled values must be passed to the model; otherwise every trial
    # would train the same default classifier.
    model = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        random_state=42,
        **params,
    )
    return train_model_for_study(model,X,y)

In [None]:
# Run the hyperparameter search: a seeded TPE sampler drives 10 trials of
# objective_lgbc, and the study keeps the parameters with the highest score
sampler = optuna.samplers.TPESampler(seed=1)
study_lgbc = optuna.create_study(
    study_name='LGBC Optimizer',
    direction="maximize",
    sampler=sampler,
)
study_lgbc.optimize(objective_lgbc, n_trials=10)
# Display the best parameter set found
study_lgbc.best_params

# **Predictions and Submission**

In [None]:
# Final model with optimized parameters (LGBMC)
# NOTE(review): the hyperparameter values are hard-coded, presumably copied from an
# earlier study run — confirm they still match study_lgbc.best_params.
# Log output is silenced with verbose=-1 on the constructor, because LightGBM 4.0
# no longer accepts a `verbose` argument in fit().
final_model_lgbc = LGBMClassifier(boosting_type='gbdt',objective='binary',n_estimators=4228,
                                  num_leaves=79,learning_rate=0.010005268542378308,
                                  max_depth=6,min_child_samples=16,n_jobs=1,verbose=-1)

# Train on the full training set (no validation hold-out)
final_model_lgbc.fit(X,y)

#Final prediction probabilities
y_pred_test_p_lgbc = final_model_lgbc.predict_proba(X_test)

In [None]:
# Horizontal bar chart of the final model's importance value for each feature
fig, ax = plt.subplots(figsize=(16,20))
sns.barplot(y=X_train.columns, x=final_model_lgbc.feature_importances_, ax=ax)
ax.set_title('Feature Importance Chart')
plt.show()

In [None]:
# Build the submission file: one row per test sequence, with the predicted
# probability from the second column of predict_proba as its 'state'
second_class_proba = y_pred_test_p_lgbc[:,1]
submission = pd.DataFrame({'sequence':X_test_sequence,'state':second_class_proba})
submission.to_csv('submission.csv',index=False)

# **Thank You**