Let's start by importing the necessary modules

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Read the three competition files in one pass:
#   train_x - sensor readings (the notebook later uses its sequence / subject /
#             step / sensor_XX columns)
#   train_y - labels, merged onto the features by "sequence" further down
#   test    - readings to predict on
train_x, train_y, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-apr-2022/train.csv",
        "../input/tabular-playground-series-apr-2022/train_labels.csv",
        "../input/tabular-playground-series-apr-2022/test.csv",
    )
)


In [None]:
# Report how many distinct sequences and distinct subjects the training data holds.
print(
    f'Unique no of sequences {train_x["sequence"].nunique()}\n'
    f'Unique no of subject {train_x["subject"].nunique()}'
)

No sequence is repeated, but there are only 672 unique subjects, so subjects must recur across sequences: the same subject has probably been recorded in several sessions.

Let's create a function that applies the Fast Fourier Transform (FFT) to the 60-second time series from each sensor. From the FFT we extract the dominant frequency of each series (the one with the highest power) and use those frequencies as the features representing the sensors.

In [None]:
def fft_sensor_freq(series):
    """Return the dominant non-zero frequency of a 1-D signal.

    The signal is transformed with the FFT, its power spectrum is computed,
    and the frequency of the strongest bin among bins ``1 .. n//2 - 1`` is
    returned. Bin 0 (the DC / mean component) is deliberately excluded.

    Parameters
    ----------
    series : array-like of length n
        The samples of one signal (e.g. a pandas Series or a NumPy array).

    Returns
    -------
    float
        Frequency of the strongest bin in cycles per sample, i.e. ``k / n``
        for peak bin ``k`` (unit sample spacing is assumed).

    Raises
    ------
    ValueError
        If the series is too short (n < 4) to contain any searchable bin.
    """
    n = len(series)
    # Candidate bins: positive frequencies only, skipping the DC term at bin 0.
    candidate_bins = np.arange(1, n // 2)
    if candidate_bins.size == 0:
        raise ValueError(
            f"series of length {n} is too short to have a non-zero frequency bin"
        )

    spectrum = np.fft.fft(series, n)
    power = (spectrum * np.conj(spectrum)).real / n
    freq = np.arange(n) / n

    # argmax is a position *within* candidate_bins, so map it back to the real
    # FFT bin before looking up the frequency; indexing `freq` with the raw
    # position would report a value one bin too low.
    peak_bin = candidate_bins[np.argmax(power[candidate_bins])]
    return freq[peak_bin]
    

In [None]:
# Build one feature row per training sequence: the dominant FFT frequency of
# each of the 13 sensors.
cols = [f"sensor_{k:02d}" for k in range(13)]  # sensor_00 ... sensor_12

# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, making the loop quadratic.
rows = []
# groupby splits the frame in a single pass instead of re-scanning it with
# query() for every sequence; sort=False keeps first-appearance order, the
# same order .unique() would give.
for i, df in train_x.groupby("sequence", sort=False):
    df2 = {"sequence": i}
    for j in cols:
        df2[j] = fft_sensor_freq(df[j])
    rows.append(df2)

df1 = pd.DataFrame(rows, columns=["sequence", *cols])
    

In [None]:
# Attach the labels to the per-sequence features; the inner join keeps only
# sequences present in both frames.
final_train = pd.merge(df1, train_y, how="inner", on="sequence")

Check the correlation matrix to see whether any of the sensor frequencies are correlated with each other.

In [None]:
# Heatmap of the pairwise correlations between all columns of final_train.
sns.heatmap(data=final_train.corr())

In [None]:
# Kernel-density plot of the sensor_12 feature, one curve per value of "state".
sns.displot(final_train, x="sensor_12", kind="kde", hue="state")

In [None]:
# Baseline: logistic regression on the per-sensor frequency features.
model = linear_model.LogisticRegression()

# Hold out 35% of the sequences. random_state is fixed so that the split, and
# therefore the accuracies printed below, are reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    final_train.drop(columns=["sequence", "state"]),
    final_train.state,
    test_size=0.35,
    random_state=42,
)
model.fit(xtrain, ytrain)

proba = model.predict_proba(xtest)
# Second column of predict_proba = probability of model.classes_[1]
# (presumably state == 1; confirm against the label encoding).
proba_1 = proba[:, 1]

# Sweep the cut-offs 0.0, 0.1, ..., 0.9 and print the hold-out accuracy of each.
for i in range(10):
    some1 = proba_1 > i / 10
    print(i, np.mean(some1 == ytest.to_numpy()))

In [None]:
# Fit a logistic GLM on all sensor features to inspect coefficient p-values.
# The family must be given explicitly: smf.glm defaults to Gaussian, which
# would fit an ordinary linear model whose predictions are not probabilities.
# The response is added as a real column so the formula does not depend on a
# variable being looked up in the notebook namespace.
glm_train = xtrain.assign(state=ytrain)
formula = "state ~ " + " + ".join(xtrain.columns)
model = smf.glm(formula, data=glm_train, family=sm.families.Binomial()).fit()
model.summary()

As we can see, the p-values of sensors 3, 6 and 7 are greater than 0.05, so let's refit the model without these variables.

In [None]:
# Refit the GLM without sensor_03, sensor_06 and sensor_07.
# The family must be given explicitly: smf.glm defaults to Gaussian, which
# would fit an ordinary linear model whose predictions are not probabilities.
reduced_cols = xtrain.columns.drop(["sensor_03", "sensor_06", "sensor_07"])
# The response is added as a real column so the formula does not depend on a
# variable being looked up in the notebook namespace.
glm_train = xtrain[reduced_cols].assign(state=ytrain)
formula = "state ~ " + " + ".join(reduced_cols)
model = smf.glm(formula, data=glm_train, family=sm.families.Binomial()).fit()
model.summary()

There isn't much change in the model statistics even after omitting a few of the sensors. Let's try predicting the results and see what the accuracy is.

In [None]:
# model.predict returns the GLM's fitted mean response for each hold-out row.
# It is used below as a score to threshold; it is a probability only if the
# GLM was fitted with a binomial family (check the fitting cell).
proba = model.predict(xtest)

# Try the cut-offs 0.40, 0.41, ..., 0.59 and print the hold-out accuracy of each.
n_test = len(ytest)
for i in range(40, 60):
    hits = ytest == (proba > i / 100)
    print(i, sum(hits) / n_test)

A probability cut-off of 0.48 gives an accuracy of 56.85%, but this still isn't a significant improvement. As we saw in the distribution plots above, the frequencies follow almost the same distribution for both states. Let's apply LDA for dimensionality reduction and see how the results come out.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Project the training features onto a single linear discriminant.
# NOTE: this rebinds `model` to the LDA estimator, replacing the earlier model.
model = LDA(n_components=1)
xtrain_lda = model.fit(xtrain, ytrain).transform(xtrain)


In [None]:
# Histogram of the LDA-projected training data.
sns.histplot(data=xtrain_lda)

LDA as a dimensionality-reduction step isn't helpful, since the transformed data has a single peak and doesn't separate the two classes.

Let's use logistic regression with a cut-off of 0.48 for the predictions for now.

In [None]:
# Build one feature row per test sequence: the dominant FFT frequency of each
# of the 13 sensors. NOTE: this rebinds `df1`, which previously held the
# training features.
cols = [f"sensor_{k:02d}" for k in range(13)]  # sensor_00 ... sensor_12

# Rows are collected in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, making the loop quadratic.
rows = []
# groupby splits the frame in a single pass instead of re-scanning it with
# query() for every sequence; sort=False keeps first-appearance order, the
# same order .unique() would give.
for i, df in test.groupby("sequence", sort=False):
    df2 = {"sequence": i}
    for j in cols:
        df2[j] = fft_sensor_freq(df[j])
    rows.append(df2)

df1 = pd.DataFrame(rows, columns=["sequence", *cols])
    

In [None]:
# Rebind `test` to an independent copy of the per-sequence test features.
# NOTE: this replaces the raw test readings, so the feature-building cell
# above cannot be re-run after this one without reloading test.csv.
test = df1.copy(deep=True)

In [None]:
# Refit logistic regression on the full labelled training set.
model1 = linear_model.LogisticRegression()
model1.fit(final_train.drop(columns=["sequence", "state"]), final_train.state)

# Predict with model1, the estimator fitted just above. `model` is a different
# object, fitted in an earlier cell on the training split only.
proba = model1.predict_proba(test.drop(columns="sequence"))
# Second column of predict_proba = probability of model1.classes_[1].
proba_test1 = proba[:, 1]

In [None]:
# Label a sequence as state 1 when its predicted probability exceeds the 0.48 cut-off.
preds = np.array(proba_test1) > 0.48
sub = pd.DataFrame(
    {
        "sequence": test.sequence.to_numpy(),
        # Cast the booleans directly (False -> 0, True -> 1). Category codes
        # would map True to 0 whenever every prediction is True.
        "state": preds.astype(int),
    }
)
# index=False keeps the file to the two columns sequence,state instead of
# adding an unnamed row-index column.
sub.to_csv("Sub.csv", index=False)
# An accuracy of 0.51 was recorded for an earlier run of this notebook.