# Importing data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
 
%matplotlib inline

In [None]:
# Read the raw mHealth recordings from the relative ../input directory.
df = pd.read_csv('../input/mobile-health/mhealth_raw_data.csv')
# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
df.info()

In [None]:
# Number of rows recorded for each activity id.
df['Activity'].value_counts()

The data is highly imbalanced (activity 0 dominates), so the majority class is downsampled.

In [None]:
from sklearn.utils import resample

# Activity 0 is the dominant class; every other activity is kept in full.
df_majority = df[df.Activity==0]
df_minorities = df[df.Activity!=0]

# replace=False is required for a true downsample: resample() defaults to
# replace=True (bootstrap), which would draw some rows more than once.
# n_samples is capped at the class size because sampling without replacement
# cannot return more rows than exist (this also keeps the cell re-runnable).
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=min(30000, len(df_majority)),
    random_state=42,
)
df = pd.concat([df_majority_downsampled, df_minorities])
df.Activity.value_counts()

In [None]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

No null values.

In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# drop_duplicates() works row by row. Dropping by index label instead would
# also delete the first copy whenever duplicate rows share an index label.
df = df.drop_duplicates(keep='first')

In [None]:
# Class balance after resampling and de-duplication.
df['Activity'].value_counts()

In [None]:
# Human-readable activity names; the position in the tuple is the activity id
# used in the `Activity` column (0 .. 12).
_ACTIVITY_NAMES = (
    'Nothing',
    'Standing still',
    'Sitting and relaxing',
    'Lying down',
    'Walking',
    'Climbing stairs',
    'Waist bends forward',
    'Frontal elevation of arms',
    'Knees bending (crouching)',
    'Cycling',
    'Jogging',
    'Running',
    'Jump front & back',
)

# Mapping from activity id to its display name.
label_map = dict(enumerate(_ACTIVITY_NAMES))

In [None]:
#Defining functions to visualize comparisons.
def plot_comparison(data, metric = 'acceleration'):
  """Plot per-activity x/y/z sensor traces for both sensor placements.

  One figure is drawn for every activity id in ``label_map``. Each figure has
  two side-by-side panels: the left-ankle columns (``<m>lx``, ``<m>ly``,
  ``<m>lz``) and the right-lower-arm columns (``<m>rx``, ``<m>ry``,
  ``<m>rz``), where ``<m>`` is the lower-cased first letter of ``metric``.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain an ``Activity`` column and the six sensor columns selected
      by ``metric``.
  metric : str, default 'acceleration'
      Only its first letter is used, as the column-name prefix
      (e.g. 'acceleration' -> 'a', 'gyroscope' -> 'g').

  Returns
  -------
  None
      The figures are shown as a side effect.
  """
  prefix = metric[0].lower()
  # (axis suffix in the column name, line colour). 'C0' is the first colour of
  # the default cycle, i.e. what an un-coloured first line would get.
  axis_styles = (('x', 'C0'), ('y', 'red'), ('z', 'green'))
  # (side letter in the column name, panel title suffix)
  panels = (('l', 'left-ankle'), ('r', 'right-lower-arm'))

  for activity, activity_name in label_map.items():
    # Filter and re-index once per activity instead of once per plotted line.
    samples = data[data['Activity'] == activity].reset_index(drop=True)

    fig, axes = plt.subplots(1, 2, figsize=(16, 4))
    for ax, (side, placement) in zip(axes, panels):
      for axis_name, color in axis_styles:
        column = prefix + side + axis_name
        ax.plot(samples[column], color=color, alpha=.7, label=column)
      ax.set_title(f'{activity_name} - {placement}')
      ax.legend()

    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)
    print()
 
def plot_category(data,cat):
  """Draw a horizontal bar chart of the percentage share of each value of ``cat``.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing the column ``cat``.
  cat : str
      Name of the categorical column to summarise.

  Returns
  -------
  None
      The chart is shown as a side effect.
  """
  shares = (data[cat].value_counts().sort_values(ascending=False)/len(data))*100

  # Draw the bars at ordinal positions and label the ticks with the category
  # values. Passing numeric category values (e.g. activity ids) straight to
  # barh would place each bar at y == value, while the percentage texts are
  # placed by rank, so the annotations would sit next to the wrong bars.
  positions = list(range(len(shares)))
  plt.barh(positions, width=shares.values)
  plt.yticks(positions, [str(value) for value in shares.index])
  for position, value in zip(positions, shares.values):
    plt.text(value + .5, position, s='{:.1f}%'.format(value), va='center')
  plt.xlabel('share of rows (%)')
  plt.ylabel(cat)
  plt.show()

In [None]:
# Percentage of rows per activity; plot_category() already calls plt.show(),
# so no second show is needed here.
plot_category(df, 'Activity')

Activity 12 has far fewer data points than any of the other activities.

In [None]:
# Percentage of rows contributed by each subject.
plot_category(df, cat='subject')

All subjects contribute almost equally.

# EDA of 1 subject data
Checking the data for a single subject only.

In [None]:
# Keep only the rows whose `subject` column equals 'subject1', then count
# how many rows that subject has per activity.
is_subject1 = df['subject'] == 'subject1'
subject1 = df[is_subject1]
subject1['Activity'].value_counts()

In [None]:
# Activity share within subject1's rows.
plot_category(subject1, cat='Activity')

Activity 12 has far fewer data points than the other activities, as already seen in the full dataset.

In [None]:
# Accelerometer traces (columns prefixed 'a') for subject1, one figure per activity.
plot_comparison(subject1, metric='acceleration')

It is clearly visible that static activities such as sitting and lying down can be separated from dynamic ones such as cycling and jumping.

In [None]:
# Gyroscope traces (columns prefixed 'g') for subject1, one figure per activity.
plot_comparison(subject1, metric='gyroscope')

The gyroscope data is much clearer and more stable, and it follows a distinct periodic cycle.

In [None]:
# First 500 samples of subject1's left-ankle acceleration for activity 8.
# The subset is computed once and reused for all three axes.
crouching_head = subject1[subject1.Activity == 8].reset_index(drop=True).head(500)

# 'C0' is the default first colour of the cycle (what an un-coloured line gets).
for column, color in (('alx', 'C0'), ('aly', 'red'), ('alz', 'green')):
    plt.plot(crouching_head[column], color=color, label=column)
plt.title(f'{label_map[8]} - left-ankle (first 500 samples)')
plt.legend()
# Show explicitly so the cell output is the figure rather than a Line2D repr.
plt.show()

# EDA of Full Dataset

In [None]:
# Accelerometer traces for the full dataset ('acceleration' is the default metric).
plot_comparison(df, metric='acceleration')

Again, static activities are more stable and can easily be separated from dynamic activities.

In [None]:
# Gyroscope traces for the full dataset, one figure per activity.
plot_comparison(df, metric='gyroscope')

In [None]:
# Kernel-density estimate of the `gly` column for subject1, one curve per activity.
# FacetGrid creates its own figure (sized by height/aspect), so a preceding
# plt.figure() call would only produce a stray empty figure.
facetgrid = sns.FacetGrid(subject1, hue='Activity', height=6, aspect=2)
# sns.kdeplot draws the same density curve as the deprecated
# sns.distplot(..., hist=False).
facetgrid.map(sns.kdeplot, 'gly').add_legend()
plt.show()

Some activities are clearly separated from the others.

In [None]:
# One box per column of `df`, drawn on an explicitly created 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, ax=ax)
plt.show()

There are lots of outliers.

# Data Cleaning

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [None]:
# Work on a copy so that the outlier filtering applied to `df1` leaves `df`
# (which supplies the quantile bounds) unchanged.
df1 = df.copy()

In [None]:
#Dropping feature have data outside 98% confidence interval
for feature in df1.columns[:-2]:
  lower_range = np.quantile(df[feature],0.01)
  upper_range = np.quantile(df[feature],0.99)
  print(feature,'range:',lower_range,'to',upper_range)

  df1 = df1.drop(df1[(df1[feature]>upper_range) | (df1[feature]<lower_range)].index, axis=0)
  print('shape',df1.shape)

In [None]:
# One box per column of the filtered frame `df1`, on an explicit 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df1, ax=ax)
plt.show()

Follow this **[notebook](https://www.kaggle.com/gaurav2022/cnn-lstm-95)** for Deep Learning modeling and predictions.

If you have learned something new, kindly upvote to help the community :)

You can follow this **[github repo](https://github.com/G0rav/Human_Activity_Recognition)** for future advancements.