# Import libraries

In [None]:
import numpy as np 
import pandas as pd

# Import files

In [None]:
# Sample submission file, indexed by its "sequence" column.
sample_sub = pd.read_csv(
    "../input/tabular-playground-series-apr-2022/sample_submission.csv",
    index_col="sequence",
)

**In this competition, we predict the state of each sequence.**

In [None]:
# Test data, indexed by the ("sequence", "step") column pair.
test = pd.read_csv(
    "../input/tabular-playground-series-apr-2022/test.csv",
    index_col=["sequence", "step"],
)

In [None]:
# Training data, indexed by the ("sequence", "step") column pair.
train = pd.read_csv(
    "../input/tabular-playground-series-apr-2022/train.csv",
    index_col=["sequence", "step"],
)

In [None]:
# Training labels; the first column of the file becomes the index
# (presumably the sequence id, since the frame is later merged on "sequence").
labels = pd.read_csv(
    "../input/tabular-playground-series-apr-2022/train_labels.csv",
    index_col=0,
)

# Step1: delete "subject"

As you see above, "subject" does not matter, so **delete** it.

In [None]:
# Remove the "subject" column from both frames; each call returns a new frame.
train = train.drop("subject", axis=1)
test = test.drop("subject", axis=1)

# Step2: check the missing values.

In [None]:
# Number of missing values per column in the training frame.
train.isna().sum()

In [None]:
# Number of missing values per column in the test frame.
test.isna().sum()

**There are no missing values in these datasets.**

# Step3: merge training data.

In [None]:
# Attach the per-sequence label to every training row (default inner join on
# the "sequence" key).
# NOTE(review): merging a MultiIndex frame on one level appears to drop the
# other level ("step") from the result - confirm if "step" is needed later.
train = train.merge(labels, on='sequence')

In [None]:
# Join the sample-submission columns onto every test row (default inner join
# on the "sequence" key), giving the test frame the same merged layout as train.
test = test.merge(sample_sub, on='sequence')

# Step4: create features.

**I referred to https://www.kaggle.com/code/hasanbasriakcay/tpsapr22-fe-pseudo-labels-bi-lstm/notebook?scriptVersionId=92471730.**

In [None]:
def create_new_features(df):
    """Add threshold, sum, lag and first-difference sensor features to ``df``.

    The frame is modified in place and is also returned. ``df`` must expose
    ``sequence`` as a column or index level name (it is the group key) and
    contain the columns ``sensor_00`` ... ``sensor_12``.

    Columns added:
      * ``sensor_02_num``: 1 where ``sensor_02 > -15``, otherwise 0.
      * ``sensor_sum1`` ... ``sensor_sum4``: sums of fixed subsets of sensors.
      * ``<name>_lag1`` and ``<name>_diff1`` for each of the 13 raw sensors and
        the 5 columns above: the previous row's value within the same
        sequence (0 for the first row of each sequence), and the current value
        minus that lag.

    Side effect: every NaN already present in ``df`` is replaced by 0.

    Returns:
        The same ``df`` object, with the new columns appended.
    """
    df['sensor_02_num'] = (df['sensor_02'] > -15).astype(int)
    df['sensor_sum1'] = (df['sensor_00'] + df['sensor_09'] + df['sensor_06'] + df['sensor_01'])
    df['sensor_sum2'] = (df['sensor_01'] + df['sensor_11'] + df['sensor_09'] + df['sensor_06'] + df['sensor_00'])
    df['sensor_sum3'] = (df['sensor_03'] + df['sensor_11'] + df['sensor_07'])
    df['sensor_sum4'] = (df['sensor_04'] + df['sensor_10'])

    sensors = ['sensor_%02d' % i for i in range(0, 13)]
    sensors.extend(['sensor_02_num', 'sensor_sum1', 'sensor_sum2', 'sensor_sum3', 'sensor_sum4'])

    # Fill the whole frame once up front instead of rescanning every column on
    # each loop iteration; inside the loop only the new lag column can hold NaN.
    df.fillna(0, inplace=True)

    # One groupby object is reused for every sensor so the group keys are not
    # recomputed per column.
    by_sequence = df.groupby('sequence')

    for sensor in sensors:
        # shift(1) leaves NaN on the first row of each sequence; use 0 there.
        lagged = by_sequence[sensor].shift(1).fillna(0)
        df[sensor + '_lag1'] = lagged
        df[sensor + '_diff1'] = df[sensor] - lagged

    return df

In [None]:
# Run the same feature builder over both splits so they end up with matching
# engineered columns (train first, then test).
train, test = create_new_features(train), create_new_features(test)

**In order to save memory, I use only the first 100000 rows.**

In [None]:
# Target vector: the "state" value of the first 100000 training rows.
y_train = train['state'].iloc[:100000]

In [None]:
# Feature matrix: the first 100000 training rows without the "state" target.
# Slice first, then drop: dropping first copies the entire training frame just
# to keep 100000 rows of it, which defeats the purpose of subsampling for memory.
X_train = train.iloc[:100000].drop(columns=['state'])

In [None]:
# Test feature matrix: every test row without the "state" column.
X_test = test.drop(columns=["state"])

# Step5: Modeling

**In this notebook, I use random forest for modeling.**

**I gave up on optuna because it uses too much memory, but if you uncomment the cells below, you can use it.**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#from sklearn.model_selection import cross_val_score
#import optuna

In [None]:
#def objective(trial):
    #max_depth = trial.suggest_int('max_depth', 1, 1000)
    
    #clf = RandomForestClassifier(max_depth = max_depth, n_jobs=2)
    #score = cross_val_score(clf, X_train, y_train, cv=5, scoring="r2")
    #r2_mean = score.mean()
    #return r2_mean

In [None]:
#study = optuna.create_study(direction='maximize')
#study.optimize(objective, n_jobs = 20)

In [None]:
#model = RandomForestClassifier(max_depth = study.best_params['max_depth'], n_jobs=2)

In [None]:
# random_state is fixed so that re-running the notebook rebuilds the same
# forest and therefore the same submission; without it every run differs.
model = RandomForestClassifier(max_depth = 150, n_jobs=2, random_state=42) #if you use optuna, please comment out this cell.

In [None]:
# Train the forest on the subsampled features and labels; fit() returns the
# estimator, so the cell displays it.
model.fit(X=X_train, y=y_train)

# Step6: Predict and submit

In [None]:
# One predicted class label per test row.
y_pred = model.predict(X=X_test)

In [None]:
# Sanity check: number of row-level predictions produced.
y_pred.shape[0]

In [None]:
# Store the row-level predictions as a new "pred" column on the test frame
# (mutates `test` in place) so they can be aggregated per sequence afterwards.
test["pred"] = y_pred

In [None]:
# Per-sequence mean of the row-level predictions. Only "pred" is aggregated:
# averaging every sensor and feature column as well is wasted work, because
# "pred" is the only column read from this frame afterwards.
test2 = test.groupby("sequence")[["pred"]].mean()

In [None]:
# Series of averaged predictions, indexed by sequence id.
pred_mean = test2.loc[:, "pred"]

In [None]:
# Fresh copy of the sample submission with "sequence" kept as a regular
# column, used as the template for the output file.
sample_sub2 = pd.read_csv(
    "../input/tabular-playground-series-apr-2022/sample_submission.csv"
)

In [None]:
# Look up each submission row's score by its sequence id instead of assigning
# positionally: a positional list silently depends on the file's row order
# matching the sorted group order of `pred_mean`. Values stay numeric; to_csv
# writes them as text anyway, so no string conversion is needed.
sample_sub2['state'] = sample_sub2['sequence'].map(pred_mean)

In [None]:
# Write the submission file without the DataFrame's positional index.
sample_sub2.to_csv(path_or_buf="submission.csv", index=False)

**Thank you for reading this notebook. Your comments and upvotes will motivate me to write another notebook. I will appreciate it!!**