In [159]:
import pandas as pd
import os
import glob
from pathlib import Path
import numpy as np
import scipy.stats as stats

In [233]:
path = r'C:/Users/hp/Documents/Homework 3 Data/AReM/' # Path has to be set


files = [os.path.join(dir,file) for dir, dir_name, file_list in os.walk(path) for file in file_list if file.endswith('.csv') ]

In [172]:
df_activity = pd.DataFrame() # A data frame storing time domains features of various time series activities at different instances

In [173]:
df_activity

In [193]:
test_data_files = [f for f in files if 'test data' in f] # grouping test data together

In [195]:
train_data_files = [f for f in files if 'train data' in f] # grouping train data together

# Extracting Time Domain Features for Training Set

In [196]:
train_data_files

['C:/Users/hp/Documents/Homework 3 Data/AReM/bending1\\train data\\dataset3.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending1\\train data\\dataset4.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending1\\train data\\dataset5.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending1\\train data\\dataset6.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending1\\train data\\dataset7.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending2\\train data\\dataset3.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending2\\train data\\dataset4.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending2\\train data\\dataset5.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/bending2\\train data\\dataset6.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/cycling\\train data\\dataset10.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/cycling\\train data\\dataset11.csv',
 'C:/Users/hp/Documents/Homework 3 Data/AReM/cycling\\train data\\dataset12.csv',
 'C:/Users/hp/Do

# Extracting time domain features

In [292]:
def time_domain_feature(instances):
    '''A function that returns time domain features for a time series in the form of a dataframe'''
    df_activity = pd.DataFrame()
    
    for i in range (len(instances)):
        data = pd.read_csv(instances[i],skiprows=4, usecols = list(range(1,8))) # reading csv files individually

        for j,col in enumerate(data.columns[1:7]):
            df_activity.loc[i,'min'+'_'+str(j+1)] = data.iloc[:,j].min() # extracting minimum value
            df_activity.loc[i,'max'+'_'+str(j+1)]  = data.iloc[:,j].max() # maximum value
            df_activity.loc[i,'mean'+'_'+str(j+1)]  = data.iloc[:,j].mean() # mean
            df_activity.loc[i,'median'+'_'+str(j+1)]  = data.iloc[:,j].median() # median
            df_activity.loc[i,'std' +'_'+str(j+1)]  = data.iloc[:,j].std() # standard deviation
            df_activity.loc[i,'1st quartile'+'_'+str(j+1)] = data.iloc[:,j].quantile(0.25) #q1
            df_activity.loc[i,'3rd quartile'+'_'+str(j+1)] = data.iloc[:,j].quantile(0.75) #q3
        df_activity.loc[i,'Activity'] = data['Activity'][0]
           
    return df_activity






        
       

In [302]:
df_train = time_domain_feature(train_data_files)

In [303]:
df_train.shape

(69, 43)

In [304]:
df_test = time_domain_feature(test_data_files)

In [305]:
df_test.shape

(19, 43)

In [306]:
df_test.head()

Unnamed: 0,min_1,max_1,mean_1,median_1,std_1,1st quartile_1,3rd quartile_1,min_2,max_2,mean_2,...,1st quartile_5,3rd quartile_5,min_6,max_6,mean_6,median_6,std_6,1st quartile_6,3rd quartile_6,Activity
0,37.25,45.0,40.624792,40.5,1.476967,39.25,42.0,0.0,1.3,0.358604,...,33.0,36.0,0.0,1.92,0.570583,0.43,0.582915,0.0,1.3,Bending1
1,38.0,45.67,42.812812,42.5,1.43555,42.0,43.67,0.0,1.22,0.372438,...,32.0,34.5,0.0,3.11,0.571083,0.43,0.60101,0.0,1.3,Bending1
2,12.75,51.0,24.562958,24.25,3.737514,23.1875,26.5,0.0,6.87,0.590833,...,20.5,27.0,0.0,4.97,0.700188,0.5,0.69372,0.43,0.87,Bending2
3,0.0,42.75,27.464604,28.0,3.583582,25.5,30.0,0.0,7.76,0.449708,...,15.0,20.75,0.0,6.76,1.122125,0.83,1.012342,0.47,1.3,Bending2
4,24.25,45.0,37.177042,36.25,3.581301,34.5,40.25,0.0,8.58,2.374208,...,17.95,21.75,0.0,9.34,2.921729,2.5,1.8526,1.5,3.9,Cycling


In [314]:
df_all = pd.concat([df_train,df_test],axis =0)

In [315]:
df_all.shape

(88, 43)

In [316]:
df_all.index+=1 # Making sure indexes start from 1

In [317]:
df_all.reset_index(inplace =True)


In [318]:
df_all.rename(columns ={"index":"Instance"},inplace=True) # Creating Instance column as specified

In [319]:
df_all.head()

Unnamed: 0,Instance,min_1,max_1,mean_1,median_1,std_1,1st quartile_1,3rd quartile_1,min_2,max_2,...,1st quartile_5,3rd quartile_5,min_6,max_6,mean_6,median_6,std_6,1st quartile_6,3rd quartile_6,Activity
0,1,35.0,47.4,43.9545,44.33,1.558835,43.0,45.0,0.0,1.7,...,35.3625,36.5,0.0,1.79,0.493292,0.43,0.513506,0.0,0.94,Bending1
1,2,33.0,47.75,42.179813,43.5,3.670666,39.15,45.0,0.0,3.0,...,30.4575,36.33,0.0,2.18,0.613521,0.5,0.524317,0.0,1.0,Bending1
2,3,33.0,45.75,41.678063,41.75,2.24349,41.33,42.75,0.0,2.83,...,28.4575,31.25,0.0,1.79,0.383292,0.43,0.389164,0.0,0.5,Bending1
3,4,37.0,48.0,43.454958,43.25,1.386098,42.5,45.0,0.0,1.58,...,22.25,24.0,0.0,5.26,0.679646,0.5,0.622534,0.43,0.87,Bending1
4,5,36.25,48.0,43.969125,44.5,1.618364,43.31,44.67,0.0,1.5,...,20.5,23.75,0.0,2.96,0.555313,0.49,0.487826,0.0,0.83,Bending1


# ESTIMATING STANDARD DEVIATION FOR TIME DOMAIN FEATURES AND ITS 90% CONFIDENCE INTERVAL

In [322]:

def estimated_standard_deviation(feature):
    ''' A function which bootstraps a time domain feature in the dataset, estimates it's standard deviation and returns the 90%
     confidence interval of the estimated value
    '''
    est_std_deviation =[] # list to store estimated standard deviation for the time domain feature 1000 times
    
    for i in range(1000):
        np.random.seed(42) # setting seed
        sample = np.random.choice(df_all[feature],size =len(df_all[feature])) # resampling in order to create a bootstrap of sample size 1000
        standard_deviation = np.std(sample) # calculating it's standard deviation
        est_std_deviation.append (standard_deviation) # appending the estimated value to a list 
    std_deviation = np.mean(est_std_deviation) # in order to calculate the estimated standard deviation of the population we take the average of all values in the list
    bootstraped_ci =stats.scoreatpercentile(est_std_deviation,[5,95]) # calculating 90% confidence interval of estimated SD
    return bootstraped_ci,std_deviation



In [328]:
standard_deviation = [] # list to store estimated standard deviation for different time domain features in the dataset
confidence_interval = [] # list to store 90% CI of each estimated value
for features in df_all.columns[1:-1]: # iterating through the required features
    (bootstraped_ci,std_deviation) = estimated_standard_deviation(features) # invoking the function to calculate estimated SD and CI
    standard_deviation.append((features,std_deviation)) 
    confidence_interval.append(bootstraped_ci)

In [329]:
standard_deviation 

[('min_1', 9.027495991929188),
 ('max_1', 3.6628076197216406),
 ('mean_1', 5.220156454429949),
 ('median_1', 5.506726983249914),
 ('std_1', 1.6335086072282619),
 ('1st quartile_1', 6.1187536531616065),
 ('3rd quartile_1', 4.941631168749297),
 ('min_2', 0.0),
 ('max_2', 4.982491606721853),
 ('mean_2', 1.638574125349916),
 ('median_2', 1.4721746648416596),
 ('std_2', 0.8915932008987454),
 ('1st quartile_2', 1.0078868830260572),
 ('3rd quartile_2', 2.203743570296858),
 ('min_3', 2.9097059275312924),
 ('max_3', 4.119629837740277),
 ('mean_3', 3.385150222513438),
 ('median_3', 3.5088126099421526),
 ('std_3', 0.8952053375646738),
 ('1st quartile_3', 3.5526399871221366),
 ('3rd quartile_3', 3.5384235671944086),
 ('min_4', 0.0),
 ('max_4', 2.0274376684582616),
 ('mean_4', 1.1677848414812682),
 ('median_4', 1.1619389030679756),
 ('std_4', 0.41209332828370226),
 ('1st quartile_4', 0.8616641862169304),
 ('3rd quartile_4', 1.5594099517439732),
 ('min_5', 5.967262676531273),
 ('max_5', 5.9368259904

In [330]:
standard_deviation_values = [x[1] for x in standard_deviation] # storing the values of estimated standard deviation in a list
standard_deviation_index = [x[0] for x in standard_deviation]  # stroring the feature names

# Creating a DataFrame to display the estimated SD and corresponding confidence interval

In [331]:
df_estimated_deviation = pd.DataFrame(list(zip(standard_deviation_values, confidence_interval)),index = standard_deviation_index , columns =['Standard Deviation','90%CI'])

In [332]:
df_estimated_deviation

Unnamed: 0,Standard Deviation,90%CI
min_1,9.027496,"[9.027495991929188, 9.027495991929188]"
max_1,3.662808,"[3.66280761972164, 3.66280761972164]"
mean_1,5.220156,"[5.220156454429948, 5.220156454429948]"
median_1,5.506727,"[5.506726983249913, 5.506726983249913]"
std_1,1.633509,"[1.6335086072282623, 1.6335086072282623]"
1st quartile_1,6.118754,"[6.1187536531616065, 6.1187536531616065]"
3rd quartile_1,4.941631,"[4.941631168749297, 4.941631168749297]"
min_2,0.0,"[0.0, 0.0]"
max_2,4.982492,"[4.982491606721854, 4.982491606721854]"
mean_2,1.638574,"[1.638574125349916, 1.638574125349916]"


# Selecting the 3 most important features from DataFrame

# Selecting Min, Mean and Max

In [333]:
imp_features = ['min','mean','max']

# Selecting the important features from train data

In [334]:
df_train.head()

Unnamed: 0,min_1,max_1,mean_1,median_1,std_1,1st quartile_1,3rd quartile_1,min_2,max_2,mean_2,...,1st quartile_5,3rd quartile_5,min_6,max_6,mean_6,median_6,std_6,1st quartile_6,3rd quartile_6,Activity
0,35.0,47.4,43.9545,44.33,1.558835,43.0,45.0,0.0,1.7,0.42625,...,35.3625,36.5,0.0,1.79,0.493292,0.43,0.513506,0.0,0.94,Bending1
1,33.0,47.75,42.179813,43.5,3.670666,39.15,45.0,0.0,3.0,0.696042,...,30.4575,36.33,0.0,2.18,0.613521,0.5,0.524317,0.0,1.0,Bending1
2,33.0,45.75,41.678063,41.75,2.24349,41.33,42.75,0.0,2.83,0.535979,...,28.4575,31.25,0.0,1.79,0.383292,0.43,0.389164,0.0,0.5,Bending1
3,37.0,48.0,43.454958,43.25,1.386098,42.5,45.0,0.0,1.58,0.378083,...,22.25,24.0,0.0,5.26,0.679646,0.5,0.622534,0.43,0.87,Bending1
4,36.25,48.0,43.969125,44.5,1.618364,43.31,44.67,0.0,1.5,0.413125,...,20.5,23.75,0.0,2.96,0.555313,0.49,0.487826,0.0,0.83,Bending1


In [368]:
imp_cols= [cols for cols in df_train.columns.values if any (imp in cols for imp in imp_features)]

In [369]:
imp_cols

['min_1',
 'max_1',
 'mean_1',
 'min_2',
 'max_2',
 'mean_2',
 'min_3',
 'max_3',
 'mean_3',
 'min_4',
 'max_4',
 'mean_4',
 'min_5',
 'max_5',
 'mean_5',
 'min_6',
 'max_6',
 'mean_6']

In [370]:
imp_cols.append('Activity')

In [371]:
df_train_imp = df_train[imp_cols]

In [372]:
df_train_imp.head()

Unnamed: 0,min_1,max_1,mean_1,min_2,max_2,mean_2,min_3,max_3,mean_3,min_4,max_4,mean_4,min_5,max_5,mean_5,min_6,max_6,mean_6,Activity
0,35.0,47.4,43.9545,0.0,1.7,0.42625,6.5,29.75,22.122354,0.0,4.44,0.497313,29.0,38.5,35.588458,0.0,1.79,0.493292,Bending1
1,33.0,47.75,42.179813,0.0,3.0,0.696042,8.5,30.0,22.183625,0.0,5.15,0.989917,20.0,38.67,33.493917,0.0,2.18,0.613521,Bending1
2,33.0,45.75,41.678063,0.0,2.83,0.535979,3.0,28.25,19.006562,0.0,6.42,0.841875,23.67,37.5,29.857083,0.0,1.79,0.383292,Bending1
3,37.0,48.0,43.454958,0.0,1.58,0.378083,5.75,27.0,15.793333,0.0,10.03,0.849354,8.0,33.5,23.034792,0.0,5.26,0.679646,Bending1
4,36.25,48.0,43.969125,0.0,1.5,0.413125,1.5,26.33,15.868021,0.0,5.17,0.666354,11.33,30.75,22.10375,0.0,2.96,0.555313,Bending1


In [375]:
df_train_imp['Activity'] =  df_train_imp['Activity'].apply(lambda x : 1  if x=='Bending1' or x =='Bending2' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [376]:
df_train_imp.head()

Unnamed: 0,min_1,max_1,mean_1,min_2,max_2,mean_2,min_3,max_3,mean_3,min_4,max_4,mean_4,min_5,max_5,mean_5,min_6,max_6,mean_6,Activity
0,35.0,47.4,43.9545,0.0,1.7,0.42625,6.5,29.75,22.122354,0.0,4.44,0.497313,29.0,38.5,35.588458,0.0,1.79,0.493292,1
1,33.0,47.75,42.179813,0.0,3.0,0.696042,8.5,30.0,22.183625,0.0,5.15,0.989917,20.0,38.67,33.493917,0.0,2.18,0.613521,1
2,33.0,45.75,41.678063,0.0,2.83,0.535979,3.0,28.25,19.006562,0.0,6.42,0.841875,23.67,37.5,29.857083,0.0,1.79,0.383292,1
3,37.0,48.0,43.454958,0.0,1.58,0.378083,5.75,27.0,15.793333,0.0,10.03,0.849354,8.0,33.5,23.034792,0.0,5.26,0.679646,1
4,36.25,48.0,43.969125,0.0,1.5,0.413125,1.5,26.33,15.868021,0.0,5.17,0.666354,11.33,30.75,22.10375,0.0,2.96,0.555313,1


# 