In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as error_metric
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load the training and test sets from the shared input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Preview the first five rows of the test data.
test.head(n=5)

In [None]:
# True if at least one cell of the training data is missing.
train.isna().to_numpy().any()

In [None]:
# True if at least one cell of the test data is missing.
test.isna().to_numpy().any()

In [None]:
# Remove the 'subject' column from both frames.
# The frames are rebound instead of mutated in place, and errors="ignore" keeps
# the cell re-runnable once the column is already gone (the in-place version
# raised KeyError on a second execution).
train = train.drop(columns="subject", errors="ignore")
test = test.drop(columns="subject", errors="ignore")

In [None]:
# Preview the training data again after dropping 'subject'.
train.head(n=5)

In [None]:
# Keep the test set's column names as a plain Python list.
rem_cols2 = list(test.columns)

In [None]:
# Count how many columns of the training data have each dtype.
train.dtypes.value_counts()

In [None]:
# Count how many columns of the test data have each dtype.
test.dtypes.value_counts()

In [None]:
# Summary statistics of the numeric training columns.
# Author's observation from this output: min = -1 and max = +1, so no scaling is needed.
train.describe()

In [None]:
# Show the dtypes of the last five columns.
train.dtypes.tail(5)

Both frames have the same data types: mostly floats plus one object feature. Let's see what the object feature is and extract it from the rest.

In [None]:
# Names of the columns stored with the generic object dtype.
# The builtin `object` is compared directly: the `np.object` alias was removed
# from NumPy and raises AttributeError on current releases.
object_feature = train.columns[train.dtypes == object]
object_feature

In [None]:
# Number of training rows per activity class.
train["Activity"].value_counts()

In [None]:
# Encode the Activity labels as integers.
# The encoder is fitted on the training labels only and then reused for the test
# labels, so both frames are guaranteed to share one label -> integer mapping.
# Re-fitting on the test frame would silently build a second, independent mapping
# (and leave `le` describing only the test labels).
le = LabelEncoder()
train['Activity'] = le.fit_transform(train['Activity'])
# transform() raises ValueError if test contains a label never seen in train.
test['Activity'] = le.transform(test['Activity'])
    

In [None]:
# Spot-check five random encoded training labels.
train["Activity"].sample(n=5)

In [None]:
# Spot-check five random encoded test labels.
test["Activity"].sample(n=5)

In [None]:
# Every column except the last one is treated as a feature
# (the last column is expected to be Activity).
feature_cols = train.columns[:-1]

# Pairwise feature correlations, reshaped from a square matrix into a long
# table with one row per (Feature_1, Feature_2) pair.
pair_column_names = {'level_0': 'Feature_1', 'level_1': 'Feature_2', 0: 'Correlations'}
correlated_values = train[feature_cols].corr().stack().to_frame().reset_index()
correlated_values = correlated_values.rename(columns=pair_column_names)
correlated_values.head()

In [None]:
# Store the magnitude of each correlation alongside its signed value.
correlated_values['abs_correlation'] = correlated_values['Correlations'].abs()
correlated_values.head(n=5)

In [None]:
# Keep the strongly correlated feature pairs (|r| > 0.8), ordered by signed correlation.
# Self-pairs are excluded: a feature's correlation with itself is always 1.0, so
# those rows would pass the threshold without carrying any information.
train_fields = (
    correlated_values
    .sort_values('Correlations', ascending=False)
    .query('abs_correlation > 0.8 and Feature_1 != Feature_2')
)
# Cap the sample size so the preview cannot fail when fewer than five pairs qualify.
train_fields.sample(min(5, len(train_fields)))

## 4 - Splitting the data into train and validation 

In [None]:
# One stratified 70/30 split of the training data, seeded for reproducibility.
split_data = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, val_idx = next(split_data.split(train[feature_cols], train.Activity))

# split() yields positional row numbers, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while the frame keeps its default
# 0..n-1 index, and would pick wrong rows (or fail) after any filtering/re-indexing.
x_train = train[feature_cols].iloc[train_idx]
y_train = train['Activity'].iloc[train_idx]

x_val = train[feature_cols].iloc[val_idx]
y_val = train['Activity'].iloc[val_idx]

In [None]:
# Share of each class in the training split.
y_train.value_counts(normalize = True)

In [None]:
# Share of each class in the validation split.
y_val.value_counts(normalize = True)

In [None]:
#Same ratio of classes in both the train and validation data thanks to StratifiedShuffleSplit

## 5 - Predictive Modelling

In [None]:
# Three classifiers are compared: plain logistic regression, logistic regression
# with a cross-validated L2 penalty strength, and a small random forest.
lr = LogisticRegression()
lr_l2 = LogisticRegressionCV(Cs=10, cv=4, penalty='l2')
# Seed the forest so a fresh run reproduces the same trees and the same metrics.
rf = RandomForestClassifier(n_estimators=10, random_state=42)

# fit() returns the estimator itself, so each name on the left is only an alias
# of the model on the right. NOTE: the alias names are misleading; none of these
# models is an SVM or a k-NN classifier (svm -> lr, knn -> rf, hybrid -> lr_l2).
svm = lr.fit(x_train, y_train)

knn = rf.fit(x_train, y_train)

hybrid = lr_l2.fit(x_train, y_train)


In [None]:
# Predict the class and the highest class probability of every model on the
# validation set; each result frame gets one column per label.
labels = ['svm', 'knn', 'hybrid']
# The order must match the aliases bound when the models were fitted
# (svm -> lr, knn -> rf, hybrid -> lr_l2); otherwise the 'knn' and 'hybrid'
# columns report each other's results.
models = [lr, rf, lr_l2]

y_predict = pd.DataFrame(
    {lab: mod.predict(x_val) for lab, mod in zip(labels, models)}
)
# predict_proba returns one column per class; the row-wise max is the highest
# class probability for that sample.
y_proba = pd.DataFrame(
    {lab: mod.predict_proba(x_val).max(axis=1) for lab, mod in zip(labels, models)}
)

y_predict.head()

In [None]:
# Preview the first ten rows of the per-model confidence values.
y_proba.head(n=10)

## 6 - Calculating the Error Metrics

In [None]:
# Score every model on the validation set: weighted precision / recall / F-score,
# plain accuracy, and a confusion matrix kept per label for plotting.
confusion_m = {}
metric_columns = []

for lab in labels:
    predicted = y_predict[lab]

    precision, recall, f_score, _ = error_metric(y_val, predicted, average='weighted')
    accuracy = accuracy_score(y_val, predicted)
    confusion_m[lab] = confusion_matrix(y_val, predicted)

    scores = {'Precision': precision, 'Recall': recall,
              'F_score': f_score, 'Accuracy': accuracy}
    metric_columns.append(pd.Series(scores, name=lab))

# One column per model, one row per metric.
metrics = pd.concat(metric_columns, axis=1)

In [None]:
# Display the metric table (rows: metrics, columns: model labels).
metrics

In [None]:
# Draw one confusion-matrix heatmap per model on a 2x2 grid; the last panel is
# not needed and is hidden.
fig, axList = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(12, 10)
axList = axList.flatten()

*model_axes, spare_ax = axList
spare_ax.axis('off')

for ax, lab in zip(model_axes, labels):
    sns.heatmap(confusion_m[lab], ax=ax, annot=True, fmt='d')
    ax.set_title(lab)

plt.tight_layout()

In [None]:
# Reminder: the highly correlated feature pairs selected earlier.
train_fields.sample(n=5)