In [None]:
# Importing few libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [None]:
# Load the dataset; header=None because no header row is read from the file

csv_path = "../input/posture-reconstruction/ConfLongDemo_JSI.csv"
data = pd.read_csv(csv_path, header=None)
data.shape  # (rows, columns) of the freshly loaded frame

In [None]:
data.head() # Preview the first rows to see what the raw data looks like

In [None]:
#Changing the names of columns according to given data description

data.columns = ['Sequence_Name', 'Tag_identificator', 'timestamp', 'date_FORMAT', 
                'x_coordinate', 'y_coordinate', 'z_coordinate', 'activity']

In [None]:
data.head() # Confirm that the new column names were applied

In [None]:
data.info() # Check each column's dtype and non-null count (i.e. whether any values are missing)

In [None]:
data.describe(include='all').T # Summary of every column (mean, percentiles, unique counts, ...), transposed so each column is a row

So we have 25 unique sequence names,

4 unique tag identifiers (`Tag_identificator`), and

11 unique activities.

In [None]:
# Categorical (object-dtype) columns, leaving out the raw date string.
# The date column is filtered out rather than removed with list.remove(), so the
# cell does not raise ValueError when 'date_FORMAT' is not among the object
# columns (for example when it is re-run after the date has been parsed/dropped).
cat = [name for name in data.select_dtypes(include='object').columns
       if name != "date_FORMAT"]

for name in cat:
    print("Name of {} col".format(name)) # Name of the column
    print("No. of NUnique", data[name].nunique()) # Number of distinct values
    print("Unique Values", data[name].unique()) # The distinct values themselves
    print('*'*30)
    print()
    print()

In [None]:
def encoding(df):
    '''
    Label-encode the categorical columns of ``df`` in place.

    'Tag_identificator', 'Sequence_Name' and 'activity' are each replaced by the
    integer codes produced by their own freshly fitted LabelEncoder. The encoders
    are local to this call and are not returned, so the code -> label mapping is
    not kept.

    Returns the string "Successful".
    '''
    from sklearn.preprocessing import LabelEncoder

    # Same column order as before; every column gets an independent encoder.
    for column in ('Tag_identificator', 'Sequence_Name', 'activity'):
        df[column] = LabelEncoder().fit_transform(df[column])

    return "Successful"

In [None]:
# Box plot of every numeric column to check spread and outliers.
# The series is passed explicitly as x=: a bare positional argument is read as `x`
# by older seaborn releases (with a FutureWarning) but as `data` by newer ones, so
# the positional form does not give the same plot across versions.
for column in data.select_dtypes(include=['int64', 'float64']):
    sns.boxplot(x=data[column])
    plt.show()

In [None]:
# Distribution (histogram + KDE) of every numeric column.
# sns.distplot is deprecated; histplot with kde=True and a density-normalised
# histogram is its documented replacement.
for column in data.select_dtypes(include=['int64', 'float64']):
    sns.histplot(x=data[column], kde=True, stat="density")
    plt.show()

In [None]:
# 3D scatter of the recorded coordinates, coloured by z_coordinate
fig = plt.figure()
ax = plt.axes(projection='3d') # 3D plot
ax.scatter3D(data['x_coordinate'],
             data['y_coordinate'],
             data['z_coordinate'],
             c=data['z_coordinate'], cmap='Greens')

# Label the axes so the figure can be read on its own
ax.set_xlabel('x_coordinate')
ax.set_ylabel('y_coordinate')
ax.set_zlabel('z_coordinate')
ax.set_title('Recorded positions (colour = z_coordinate)')

plt.show() # render the figure instead of echoing the scatter artist's repr

In [None]:
def new_col(df):
    '''
    Split the 'date_FORMAT' column of ``df`` into separate date/time columns, in place.

    The column is parsed with the format "%d.%m.%Y %H:%M:%S:%f" and the integer
    columns 'day', 'month', 'year', 'hour', 'minute', 'second' and 'microsecond'
    are appended to ``df``. 'date_FORMAT' itself is then deleted, so calling the
    function a second time on the same frame fails because the column is gone.

    Returns the string 'Successfull' (spelling kept, since the value is part of
    the function's existing behaviour).
    '''
    # Parse once into a local series; the parsed column is deleted below anyway,
    # so there is no need to write it back into the frame first.
    parsed = pd.to_datetime(df['date_FORMAT'], format="%d.%m.%Y %H:%M:%S:%f")

    # Append one column per date/time component, in the same order as before
    for part in ('day', 'month', 'year', 'hour', 'minute', 'second', 'microsecond'):
        df[part] = getattr(parsed.dt, part)

    del df['date_FORMAT'] # The raw date string is no longer needed
    return 'Successfull'

In [None]:
data.head() # Snapshot of the frame before encoding() and new_col() are applied in the next cell

In [None]:
#Trasnforming our data

encoding(data)
new_col(data)

In [None]:
data.head() # Inspect the frame after encoding() and new_col() have modified it

In [None]:
#Correlation Graph

plt.figure(figsize=(20,12))
sns.heatmap(data.corr(), annot=True)
plt.show()

In [None]:
data.describe(include='all').T # Summary statistics again, now on the encoded frame with the date split into parts

In [None]:
# Look at the distinct values of every date/time part created by new_col()
col = ['day', 'month', 'year', 'hour', 'minute', 'second', 'microsecond']

separator = '*' * 30
for name in col:
    values = data[name]
    print("Name of {} col".format(name))          # column being summarised
    print("No. of NUnique", values.nunique())     # how many distinct values it holds
    print("Unique Values", values.unique())       # the distinct values themselves
    print(separator)
    print()
    print()

# Finding from the output above: the data covers 5 hours, from 11 to 15, and was collected on 27th May 2009

Since all records share the same date, the `day`, `month` and `year` columns carry no information and can be dropped.

In [None]:
# Remove the date parts that were found to be identical for every record.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second execution is a no-op rather than a KeyError for
# the columns that are already gone.
data = data.drop(columns=['day', 'month', 'year'], errors='ignore')

In [None]:
# Distinct values of the remaining time-of-day columns
col = ['hour', 'minute', 'second']

separator = '*' * 30
for name in col:
    values = data[name]
    print("Name of {} col".format(name))          # column being summarised
    print("No. of NUnique", values.nunique())     # how many distinct values it holds
    print("Unique Values", values.unique())       # the distinct values themselves
    print(separator)
    print()
    print()

In [None]:
# Distribution (histogram + KDE) of each column listed in `col`.
# sns.distplot is deprecated; histplot with kde=True and a density-normalised
# histogram is its documented replacement.
for name in col:
    sns.histplot(x=data[name], kde=True, stat="density")
    plt.show()

In [None]:
data.shape # (rows, columns) of the frame after the day/month/year columns were dropped

In [None]:
data.info() # Re-check dtypes and non-null counts on the transformed frame before modelling

In [None]:
#for Spliting Data and Hyperparameter Tuning 
from sklearn.model_selection import train_test_split, GridSearchCV

#Importing Machine Learning Model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from catboost import CatBoostClassifier
    
#Tree-based and boosting algorithms
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

from sklearn.neural_network import MLPClassifier

#To transform data
from sklearn import preprocessing

#statistical Tools
from sklearn.metrics import roc_auc_score,accuracy_score,precision_score,recall_score,f1_score

from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

In [None]:
data['activity'].value_counts() # Number of rows per (encoded) activity label, to see how balanced the classes are

In [None]:
# Separate the model inputs from the label to predict
target_column = 'activity'

y = data[target_column]               # Target variable
X = data.drop(columns=[target_column])  # Input variables: every other column

In [None]:
from imblearn.over_sampling import SMOTE # Library to Balance Dataset
smote = SMOTE()

X_tf,y_tf = smote.fit_resample(X,y) # Balancing our Data
X_tf.shape, y_tf.shape # Checking our new shape after Over_Sampling

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x = scaler.fit_transform(X_tf)

In [None]:
# Split the data into training and testing sets 
x_train,x_test,y_train,y_test = train_test_split(x, y_tf, test_size=.1)

print(x_train.shape[0], x_test.shape[0])

In [None]:
accuracy = {}  # model name -> accuracy in percent, filled in by train_model()

def train_model(model, model_name):
    """Fit ``model`` on the training split and report how it does on the test split.

    Uses the module-level ``x_train``/``y_train`` for fitting and
    ``x_test``/``y_test`` for evaluation. Prints the model name, the accuracy
    (in percent) and the classification report, and stores the accuracy in the
    module-level ``accuracy`` dict under ``model_name``. Returns nothing.
    """
    print(model_name)

    # Train, then predict on the held-out rows
    fitted = model.fit(x_train, y_train)
    predictions = fitted.predict(x_test)

    # Accuracy, expressed as a percentage
    score = accuracy_score(y_test, predictions)*100
    accuracy[model_name] = score
    print('accuracy_score', score)
    print()

    # Per-class precision / recall / F1
    print('Classification Report')
    report = classification_report(y_test, predictions)
    print(report)

In [None]:
# LightGBM hyper-parameters, gathered in one place
lgbm_params = dict(
    n_estimators=720,
    n_jobs=-1,
    max_depth=15,
    min_child_weight=5,
    min_child_samples=5,
    num_leaves=10,
    learning_rate=0.15,
)
lgbm = LGBMClassifier(**lgbm_params)

train_model(lgbm, 'LGBMClassifier')

In [None]:
# CatBoost with 1000 trees; verbose=0 as in the settings below
catboost_params = {"verbose": 0, "n_estimators": 1000}
cat = CatBoostClassifier(**catboost_params)

train_model(cat, "Cat Boost")

In [None]:
# XGBoost hyper-parameters, gathered in one place
xgb_params = dict(
    n_estimators=1500,
    nthread=4,
    max_depth=15,
    min_child_weight=5,
    learning_rate=0.01,
)
xgb = XGBClassifier(**xgb_params)

train_model(xgb, 'XGBClassifier')

In [None]:
# Random forest. random_state is fixed because the bootstrap samples and feature
# subsets are drawn at random; without a seed the reported score changes on
# every run.
rfc = RandomForestClassifier(n_estimators = 1500, n_jobs=-1, max_depth=15,
                             min_samples_split=5, min_samples_leaf=3,
                             random_state=42)

train_model(rfc, 'Random Forest Classifier')

In [None]:
# Single decision tree. splitter='random' picks split thresholds at random, so
# random_state is fixed to make the reported score reproducible.
dtc = DecisionTreeClassifier(criterion='gini', splitter='random', max_depth=25, min_samples_split=4,
                            min_samples_leaf=2, random_state=42)

train_model(dtc, 'Decision Tree Classifier')

## The XGB classifier gives the best result