# Multi-class LDA

In [14]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal as m_norm
from IPython.display import display, clear_output #for HTML printing
import timeit

In [2]:
x_test_df = pd.read_csv("X_test.csv")
x_train_df = pd.read_csv("X_train.csv")
y_train_df = pd.read_csv("Y_train.csv")

#display(x_train_df.head())
#display(x_train_df.info())
#display(x_train_df.describe())
#display(y_train_df.head())

# creating a map {surface : numerical class}
class_namelist = y_train_df.surface.unique()
class_dict = {x:y for x,y in zip(range(0, len(class_namelist)), class_namelist)}
inv_class_dict = {v: k for k, v in class_dict.items()}

display(class_dict)
#display(inv_class_dict)

# adding 'surface' (response variable) to x_train dataframe and lets call it train_df
x_train_df = x_train_df.merge(y_train_df, on="series_id", how="left")
train_df = x_train_df    # don't worry about memory, its just a pointer


# adding 'class' (numerical class) to train_df
train_df['class'] = train_df['surface'].apply(lambda x: inv_class_dict[x]) 
display(train_df.tail())

{0: 'fine_concrete',
 1: 'concrete',
 2: 'soft_tiles',
 3: 'tiled',
 4: 'soft_pvc',
 5: 'hard_tiles_large_space',
 6: 'carpet',
 7: 'hard_tiles',
 8: 'wood'}

Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,group_id,surface,class
487675,3809_123,3809,123,0.62871,-0.76878,-0.084391,0.081093,0.003167,0.09376,-0.14274,3.2718,2.0115,-9.0063,56,soft_pvc,4
487676,3809_124,3809,124,0.62884,-0.76868,-0.084365,0.081099,0.014994,0.032637,-0.13238,4.4275,3.0696,-8.1257,56,soft_pvc,4
487677,3809_125,3809,125,0.62891,-0.76861,-0.084345,0.081178,-0.031184,-0.003961,-0.13894,2.7048,4.2622,-8.1443,56,soft_pvc,4
487678,3809_126,3809,126,0.62903,-0.7685,-0.084414,0.081231,-0.069153,0.013229,-0.13021,2.541,4.713,-9.4435,56,soft_pvc,4
487679,3809_127,3809,127,0.62915,-0.76839,-0.084441,0.081284,-0.042769,0.034049,-0.1258,0.82391,4.2751,-10.498,56,soft_pvc,4


In [3]:
# all features
feature_list = train_df.loc[:, 'orientation_X':'linear_acceleration_Z'].columns.values.tolist()

# x_train_df.mean(axis=0) # this is slower because of object column
# x_train_df._get_numeric_data().mean(axis=0) # either use this or better

# calculating class-wise sample means
sample_mean = [0]*len(feature_list)   # for each feature
sample_size = [0]*len(class_dict)  # size for each class
sample_cov_dfs = [0]*len(class_dict)  # covariance for each class ( each item will be a df )

sample_means = [sample_mean for i in range(0, len(class_dict))] # a list of arrays (each array is for each class)

# for each class in class_dict
for idx, surface_i in enumerate(class_dict.values()):
    temp_df = train_df.loc[train_df.surface == surface_i]
    sample_size_i = len(temp_df.index) # this gets the row count of the dataframe
    
    temp_df2 = temp_df.loc[:, 'orientation_X':'linear_acceleration_Z']
    
    sample_means[idx] = temp_df2.mean(axis=0).values # store mean into respective array
    sample_cov_dfs[idx] = temp_df2.cov()
    sample_size[idx] = sample_size_i # store sample size into a list
    

In [4]:
#display(sample_means)
sample_mean_dict = dict(zip(class_dict.values(), sample_means))
display(sample_mean_dict)
display(sample_size)

# compute prevalences
# P(Y = class) = sample_size_class/sample_size_training

sample_size_all = len(train_df.index)

prevalence = [class_size/sample_size_all for class_size in sample_size]
print(prevalence[0])
#display(prevalence) #should sum to 1.0

#display(sample_cov_dfs[0])
#display(sample_cov_dfs[1])
#df = pd.DataFrame()
#df = pd.concat([sample_cov_dfs[0], sample_cov_dfs[1]])
#display(df)

assert sum(sample_size) == len(train_df.index)

{'fine_concrete': array([-1.43290155e-01,  1.88151457e-01,  2.85006496e-02, -2.16831563e-02,
        -9.91504906e-05,  3.16736093e-03, -2.44271486e-03,  1.36297485e-01,
         2.92210215e+00, -9.35607003e+00]),
 'concrete': array([-2.07606983e-01,  7.89323534e-02,  1.03509340e-02, -3.21066444e-02,
         9.25171794e-04,  2.18164319e-02, -6.27181845e-02,  1.11593092e-01,
         2.90773270e+00, -9.35640755e+00]),
 'soft_tiles': array([ 2.76689163e-01,  1.77857122e-02,  4.52426477e-03,  4.20254501e-02,
         6.40250325e-04,  1.75021390e-02, -4.89125478e-02,  8.41187577e-02,
         2.91412359e+00, -9.35947358e+00]),
 'tiled': array([-2.75754295e-01,  7.50029355e-02,  1.08687175e-02, -4.20898577e-02,
         1.01947216e-04,  6.40349476e-03, -1.26683436e-02,  1.35559725e-01,
         2.91380343e+00, -9.35704482e+00]),
 'soft_pvc': array([ 3.09570136e-01, -1.14566104e-02,  5.43322975e-03,  4.33312302e-02,
         6.92125374e-04,  1.48694477e-02, -4.03870933e-02,  1.28838871e-01,


[46464, 99712, 38016, 65792, 93696, 39424, 24192, 2688, 77696]

0.09527559055118111


P(Y=class|x) = P(x|Y=class).P(Y=class)

P(x|Y=class) = normal dist. pdf(x|class parameters)

P(Y=class) = prevalence

The classifier is given by:

classifier(x) = argmax {P(Y=i|x)} (the arg being 'i', the class)

In [None]:
train_df["prediction"] = np.nan
display(train_df.shape)

l = len(list(train_df.itertuples()))
print(l)
start = timeit.default_timer()
for row in train_df.itertuples():
    clear_output(wait=True)
    #display(type(row[4:14]))
    #print(type(row))
    #print(type(train_df))
    x = np.array(row[4:14])
    #print(type(x))
    #display(x)
    #print(row[0])
    max_posterior_prob = -1
    arg_max = -1
    for class_num, class_name in enumerate(class_dict.values()):
        #print(type(class_name))
        #print(type(class_num))
        class_mean = sample_mean_dict[class_name]
        class_cov = sample_cov_dfs[class_num]
        pdf = m_norm.pdf(x, mean=class_mean, cov=class_cov)
        preval = prevalence[class_num]
        posterior_prob = pdf*preval
        arg = class_num
        if posterior_prob > max_posterior_prob: 
            max_posterior_prob = posterior_prob
            arg_max = arg
        
    #print(train_df.iloc[5,16])
    #print(row[0])
    #print(class_dict[arg_max])
    train_df.iloc[row[0], 16] = class_dict[arg_max]
    
    stop = timeit.default_timer()
    
    if (row[0]/l*100) < .00000000005 :
        expected_time = "Calculating..."
        
    else:
        time_perc = timeit.default_timer()
        expected_time = np.round( ( (time_perc-start) / (row[0]/l) )/60, 2)

    print("Current progress:", np.round((row[0]/l)*100, 2), "%")
    print("Current run time:", np.round((stop - start)/60, 2), "minutes")
    print("Expected run time:", expected_time, "minutes")

Current progress: 3.29 %
Current run time: 5.83 minutes
Expected run time: 176.86 minutes


In [7]:
display(train_df)

Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,group_id,surface,class,prediction
0,0_0,0,0,-0.75853,-0.63435,-0.104880,-0.105970,0.107650,0.017561,0.000767,-0.748570,2.103000,-9.7532,13,fine_concrete,0,fine_concrete
1,0_1,0,1,-0.75853,-0.63434,-0.104900,-0.106000,0.067851,0.029939,0.003385,0.339950,1.506400,-9.4128,13,fine_concrete,0,fine_concrete
2,0_2,0,2,-0.75853,-0.63435,-0.104920,-0.105970,0.007275,0.028934,-0.005978,-0.264290,1.592200,-8.7267,13,fine_concrete,0,fine_concrete
3,0_3,0,3,-0.75852,-0.63436,-0.104950,-0.105970,-0.013053,0.019448,-0.008974,0.426840,1.099300,-10.0960,13,fine_concrete,0,fine_concrete
4,0_4,0,4,-0.75852,-0.63435,-0.104950,-0.105960,0.005135,0.007652,0.005245,-0.509690,1.468900,-10.4410,13,fine_concrete,0,fine_concrete
5,0_5,0,5,-0.75853,-0.63439,-0.104830,-0.105800,0.059664,0.013043,-0.013231,-0.447450,0.992810,-10.4020,13,fine_concrete,0,fine_concrete
6,0_6,0,6,-0.75853,-0.63441,-0.104810,-0.105690,0.082140,0.044356,-0.002696,-0.141630,0.734970,-9.4296,13,fine_concrete,0,fine_concrete
7,0_7,0,7,-0.75852,-0.63444,-0.104800,-0.105610,0.056218,0.038162,-0.022931,-0.121600,0.075417,-8.6088,13,fine_concrete,0,fine_concrete
8,0_8,0,8,-0.75851,-0.63445,-0.104850,-0.105590,-0.012846,0.039004,-0.007831,1.600000,0.816110,-7.6426,13,fine_concrete,0,fine_concrete
9,0_9,0,9,-0.75851,-0.63443,-0.104890,-0.105670,-0.090082,0.027299,-0.009970,0.474960,0.909600,-8.8120,13,fine_concrete,0,fine_concrete


In [9]:
train_df.to_csv("out.csv")