In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test CSV files into DataFrames.
train, test = (
    pd.read_csv('/kaggle/input/human-activity-recognition-with-smartphones/train.csv'),
    pd.read_csv('/kaggle/input/human-activity-recognition-with-smartphones/test.csv'),
)

# `data` is just a second name for the `train` DataFrame (no copy is made),
# so anything done to one is visible through the other.
data = train
data.head()

**Data Preprocessing**

1.1 Check for missing values
1.2 Check for duplicates
1.3 Check for unique values

In [None]:
# Count missing entries per column, then print only the columns that have any.
missingValues = data.isna().sum()
has_missing = missingValues.gt(0)
print(missingValues[has_missing])

In [None]:
# Count fully duplicated rows in each split.
n_duplicates_train = train.duplicated().sum()
n_duplicates_test = test.duplicated().sum()
print('Number of duplicates in train:', n_duplicates_train)
print('Number of duplicates in test:', n_duplicates_test)

In [None]:
# Print pandas' summary of the training frame (index, column dtypes, memory usage).
data.info()

In [None]:
# Show descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Check for unique values: number of distinct values in each column of the training frame.
data.nunique()

In [None]:
# List the distinct labels that appear in the Activity column.
data['Activity'].unique()

In [None]:
# Check for class imbalance: bar chart of the number of rows per Activity label.
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12,7))
# The column must be passed as `x=`. Recent seaborn releases accept only `data`
# positionally, so countplot('Activity', data=data) fails with a TypeError there.
sns.countplot(x='Activity', data=data)

The activities have almost the same number of samples, so the classes are not imbalanced.

Exploratory Data Analysis
Based on the common nature of activities we can broadly put them in two categories.

Static and dynamic activities :
* SITTING, STANDING, LAYING can be considered as static activities with no motion involved
* WALKING, WALKING_DOWNSTAIRS, WALKING_UPSTAIRS can be considered as dynamic activities with a significant amount of motion involved

In [None]:
# One wide facet with the distribution of 'tBodyAccMag-mean()' drawn once per
# Activity label (colour = activity), so the per-activity shapes can be compared.
g = sns.FacetGrid(train, hue='Activity', height=5, aspect=3)
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# the replacement for distplot's default "histogram + KDE" drawing.
g.map(sns.histplot, 'tBodyAccMag-mean()', kde=True, stat='density').add_legend()

In [None]:
# Histogram (counts only, no KDE) of the `subject` column.
# sns.distplot(..., kde=False) is deprecated; histplot is its replacement.
sns.histplot(x=data['subject'])

In [None]:
# Scatter of 'tBodyAccMag-mean()' against subject, coloured by activity.
sns.scatterplot(
    data=train,
    x='subject',
    y='tBodyAccMag-mean()',
    hue='Activity',
)

In [None]:
# Violin plot of 'angle(tBodyAccMean,gravity)' for each activity.
plt.figure(figsize=(12,7))
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.violinplot(x='Activity', y='angle(tBodyAccMean,gravity)', data=train, split=True)

In [None]:
# Swarm plot of 'angle(tBodyAccMean,gravity)' for each activity.
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.swarmplot(x='Activity', y='angle(tBodyAccMean,gravity)', data=train)

In [None]:
# Rule of thumb read off the distribution plot: rows whose 'tBodyAccMag-mean()'
# is at or below -0.5 are labelled "static", all other rows "dynamic".
# The original cell wrote this rule as pseudo-code (`tBodyAccMag-mean()`), which
# raises a NameError when run. Here it is applied to the actual column and
# cross-tabulated against the true activity to show how well it separates them.
motion_type = pd.Series(
    np.where(train['tBodyAccMag-mean()'] <= -0.5, 'static', 'dynamic'),
    index=train.index,
    name='motion_type',
)
pd.crosstab(train['Activity'], motion_type)

In [None]:
# Side-by-side distributions of 'tBodyAccMag-mean()' per activity:
# left panel = static activities (histogram + KDE), right panel = dynamic
# activities (KDE curves only, as in the original "closer view").
feature = 'tBodyAccMag-mean()'

# (panel title, draw histogram bars?, {Activity value: legend label})
panels = [
    ('Static Activities', True,
     {'SITTING': 'Sitting', 'STANDING': 'Standing', 'LAYING': 'Laying'}),
    ('Dynamic Activities(closer view)', False,
     {'WALKING': 'Walking', 'WALKING_DOWNSTAIRS': 'Downstairs', 'WALKING_UPSTAIRS': 'Upstairs'}),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 8))
for ax, (title, with_hist, activities) in zip(axes, panels):
    for activity, label in activities.items():
        values = train.loc[train['Activity'] == activity, feature]
        # sns.distplot is deprecated; histplot/kdeplot are its replacements.
        if with_hist:
            sns.histplot(x=values, kde=True, stat='density', label=label, ax=ax)
        else:
            sns.kdeplot(x=values, label=label, ax=ax)
    ax.set_title(title)
    # Without an explicit legend the per-activity labels are never displayed.
    ax.legend()

In [None]:

# Box plot of 'tBodyAccMag-mean()' for each activity, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=train, x='Activity', y='tBodyAccMag-mean()', ax=ax)

In [None]:
# Refined rule read off the box plot: 'tBodyAccMag-mean()' <= -0.8 -> "static",
# >= -0.6 -> "dynamic". Rows that fall between the two thresholds match neither
# condition and are reported as "undecided" instead of being silently skipped.
# The original cell wrote this rule as pseudo-code (`tBodyAccMag-mean()`), which
# raises a NameError when run. Here it is applied to the actual column.
feature_values = train['tBodyAccMag-mean()']
motion_type = pd.Series(
    np.select(
        [feature_values <= -0.8, feature_values >= -0.6],
        ['static', 'dynamic'],
        default='undecided',
    ),
    index=train.index,
    name='motion_type',
)
pd.crosstab(train['Activity'], motion_type)


Visualising Data using t-SNE

Using t-SNE, data can be projected from an extremely high-dimensional space into a low-dimensional space while still retaining much of the original information. The training data has 561 unique features, so let's use t-SNE to visualize it in a 2D space.

In [None]:
from sklearn.manifold import TSNE

# Embed the feature columns only: the label and the subject column are dropped.
X_tsne = train.drop(['Activity', 'subject'], axis=1)

# The iteration count is left at the library default (1000, the value the
# original passed explicitly). The keyword was renamed from `n_iter` to
# `max_iter` in newer scikit-learn, so omitting it runs on both old and new versions.
# `tsne` holds the resulting 2-column embedding array, not the estimator.
tsne = TSNE(random_state=42, n_components=2, verbose=1, perplexity=50).fit_transform(X_tsne)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Scatter of the two t-SNE components, coloured by the activity label of each row.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x=tsne[:, 0],
    y=tsne[:, 1],
    hue=train['Activity'],
    ax=ax,
)

In [None]:
# Split each frame into predictors (everything except 'subject' and 'Activity')
# and the target column 'Activity'.
non_feature_columns = ['subject', 'Activity']
X_train, y_train = train.drop(columns=non_feature_columns), train['Activity']
X_test, y_test = test.drop(columns=non_feature_columns), test['Activity']
print('Training data size : ', X_train.shape)
print('Test data size : ', X_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline classifier: fit on the training features, score on the held-out test set.
# NOTE(review): max_iter is raised from the default of 100 on the assumption that
# the default solver does not converge within 100 iterations on this many
# features, which would show up as a ConvergenceWarning. Confirm against the run
# output and adjust if needed.
model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)
yhat = model.predict(X_test)

lr_accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print("Accuracy using Logistic Regression : ", lr_accuracy)