In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

from warnings import filterwarnings
filterwarnings("ignore")

In [23]:
tracks_dataset = pd.read_csv("Group_18_data_cleaned.csv") #read the excel file 
tracks_dataset.head()

Unnamed: 0,oid,timestamp,x,y,body_roll,body_pitch,body_yaw,head_roll,head_pitch,head_yaw,other_oid,other_class,other_x,other_y
0,50187,1842.4,495854.6403,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[47646, 50181, 50184, 50187]","[0, 4, 4, 4]","[495923.373133135, 495899.069769386, 495899.05...","[5405744.32136751, 5405738.47595118, 5405739.1..."
1,50187,1842.5,495854.7921,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[50181, 50187, 50184, 47646]","[4, 4, 4, 0]","[495899.234566716, 495854.792078353, 495899.22...","[5405738.39126416, 5405750.93930797, 5405739.2..."
2,50187,1842.6,495854.9438,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[47646, 50187, 50184, 50181]","[0, 4, 4, 4]","[495921.779445452, 495854.943847121, 495899.35...","[5405744.51929698, 5405750.96626812, 5405739.1..."
3,50187,1842.7,495855.0956,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[50187, 47646, 50184, 50181]","[4, 0, 4, 4]","[495855.09561589, 495920.943052671, 495899.490...","[5405750.99322827, 5405744.63008031, 5405739.1..."
4,50187,1842.8,495855.2569,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[50187, 50184, 50181, 47646]","[4, 4, 4, 0]","[495855.256935427, 495899.585908147, 495899.72...","[5405751.02150176, 5405739.0332702, 5405738.08..."


In [24]:
tracks_dataset.shape

(4759, 14)

In [25]:
X = tracks_dataset[['body_roll','body_pitch','body_yaw','head_roll','head_pitch','head_yaw']]
X

Unnamed: 0,body_roll,body_pitch,body_yaw,head_roll,head_pitch,head_yaw
0,0.3,-0.71,190.56,-0.25,-1.09,186.37
1,0.3,-0.71,190.56,-0.25,-1.09,186.37
2,0.3,-0.71,190.56,-0.25,-1.09,186.37
3,0.3,-0.71,190.56,-0.25,-1.09,186.37
4,0.3,-0.71,190.56,-0.25,-1.09,186.37
...,...,...,...,...,...,...
4754,0.3,-0.71,190.56,-0.25,-1.09,186.37
4755,0.3,-0.71,190.56,-0.25,-1.09,186.37
4756,0.3,-0.71,190.56,-0.25,-1.09,186.37
4757,0.3,-0.71,190.56,-0.25,-1.09,186.37


In [26]:
# MinMaxScaler
min_max_scaler = MinMaxScaler()

X_scaled = pd.DataFrame(min_max_scaler.fit_transform(X))

# While scaling the columns names are removed, so resigining them back
X_scaled.columns = X.columns
X_scaled

Unnamed: 0,body_roll,body_pitch,body_yaw,head_roll,head_pitch,head_yaw
0,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
1,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
2,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
3,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
4,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
...,...,...,...,...,...,...
4754,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
4755,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
4756,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761
4757,0.410112,0.690168,0.532291,0.593075,0.540117,0.518761


In [27]:
y_scaled = X_scaled['head_yaw']
y_scaled

0       0.518761
1       0.518761
2       0.518761
3       0.518761
4       0.518761
          ...   
4754    0.518761
4755    0.518761
4756    0.518761
4757    0.518761
4758    0.518761
Name: head_yaw, Length: 4759, dtype: float64

In [28]:
X_scaled = X_scaled.drop(['head_yaw'],axis=1)

In [29]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X_scaled, y_scaled, test_size=0.3, random_state=38)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=38)

In [30]:
print(X_train.shape, X_valid.shape, X_test.shape)

(2331, 5) (1000, 5) (1428, 5)


### Linear Regression Model

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict

lm = LinearRegression()

print('Linear Regression with CV')
p = cross_val_predict(lm ,X_valid, y_valid, cv = 3)

print('MAE on the validation data:', np.round(mean_absolute_error(y_valid, p),4))
print('MSE on the validation data:', np.round(mean_squared_error(y_valid,  p),4))
print('RMSE on the validation data:', np.round(np.sqrt(mean_squared_error(y_valid,p)),4))

Linear Regression with CV
MAE on the validation data: 0.0403
MSE on the validation data: 0.0085
RMSE on the validation data: 0.0924


### KNeighborsRegressor Model

In [32]:
import pickle
from sklearn.neighbors import KNeighborsRegressor

pickle_in = open("knn_model.pkl","rb")

knn= pickle.load(pickle_in)
knn = KNeighborsRegressor(n_neighbors= 5, p= 2, weights= 'distance')
knn.fit(X_train,y_train)

pred=knn.predict(X_valid)

print('MAE on the validation data:', np.round(mean_absolute_error(y_valid, pred),4))
print('MSE on the validation data:', np.round(mean_squared_error(y_valid,  pred),4))
print('RMSE on the validation data:', np.round(np.sqrt(mean_squared_error(y_valid,pred)),4))

MAE on the validation data: 0.0314
MSE on the validation data: 0.0064
RMSE on the validation data: 0.08


###  Random Forest Regressor Model

In [33]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(bootstrap= True,max_depth= 90,max_features= 3,min_samples_leaf= 3,min_samples_split= 8,
 n_estimators= 200)

rf.fit(X_train, y_train)

pred=rf.predict(X_valid)

print('MAE on the validation data:', np.round(mean_absolute_error(y_valid,pred),4))
print('MSE on the validation data:', np.round(mean_squared_error(y_valid,pred),4))
print('RMSE on the validation data:', np.round(np.sqrt(mean_squared_error(y_valid,pred)),4))

MAE on the validation data: 0.0301
MSE on the validation data: 0.0062
RMSE on the validation data: 0.0786


### SVM Linear Model

In [34]:
from sklearn.svm import SVR

svr_linear = SVR(kernel='linear', C=1.0, epsilon=0.1)
svr_linear.fit(X_train, y_train) 

pred = svr_linear.predict(X_valid) 

print('MAE on the validation data:', np.round(mean_absolute_error(y_valid,pred),4))
print('MSE on the validation data:', np.round(mean_squared_error(y_valid,pred),4))
print('RMSE on the validation data:', np.round(np.sqrt(mean_squared_error(y_valid,pred)),4))

MAE on the validation data: 0.0384
MSE on the validation data: 0.0085
RMSE on the validation data: 0.0922


### SVM Non Linear Model

In [35]:
svr_tuned = SVR(kernel="rbf",
                C=0.2).fit(X_train,y_train)

pred = svr_tuned.predict(X_valid) 

print('MAE on the validation data:', np.round(mean_absolute_error(y_valid,pred),4))
print('MSE on the validation data:', np.round(mean_squared_error(y_valid,pred),4))
print('RMSE on the validation data:', np.round(np.sqrt(mean_squared_error(y_valid,pred)),4))

MAE on the validation data: 0.036
MSE on the validation data: 0.0066
RMSE on the validation data: 0.0813


### Using LASSO for feature selection

In [37]:
pickle_in = open("lasso_feature_model.pkl","rb")

lasso_mse= pickle.load(pickle_in)

print('MSE on the validation data: using Random Forest ', lasso_mse[0])
print('MSE on the validation data: using SVM non linear ', lasso_mse[1])

MSE on the validation data: using Random Forest  0.0057
MSE on the validation data: using SVM non linear  0.013


### Using Bidirectional for feature selection

In [41]:
pickle_in = open("bidirectional_feature_model.pkl","rb")

bidirectional_mse= pickle.load(pickle_in)

print('MSE on the validation data: using Random Forest ', bidirectional_mse[0])
print('MSE on the validation data: using SVM non linear ', bidirectional_mse[1])

MSE on the validation data: using Random Forest  0.0058
MSE on the validation data: using SVM non linear  0.0058


On comparing all the MSE with the models got the best results for Random Forest Model with LASSO feature technique which is 0.0056. Now performing the predictions on the test data.

In [17]:
pickle_in = open("best_features_LAASO.pkl","rb")

best_features= pickle.load(pickle_in)
best_features = best_features.to_numpy()
best_features

array(['body_pitch', 'body_yaw', 'head_roll', 'head_pitch'], dtype=object)

In [18]:
a= best_features[0]
b= best_features[1]
c= best_features[2]
d= best_features[3]
X_selected_train = X_train[[a,b,c,d]]
X_selected_test = X_test[[a,b,c,d]]
X_selected_test.shape

(1428, 4)

In [19]:
rf.fit(X_selected_train,y_train)

RandomForestRegressor(max_depth=90, max_features=3, min_samples_leaf=3,
                      min_samples_split=8, n_estimators=200)

In [20]:
pred=rf.predict(X_selected_test)
pred

array([0.64890939, 0.51876363, 0.51876363, ..., 0.51876363, 0.30700558,
       0.51876363])

In [21]:
print('MAE on the test data:', np.round(mean_absolute_error(y_test,pred),4))
print('MSE on the test data:', np.round(mean_squared_error(y_test,pred),4))
print('RMSE on the test data:', np.round(np.sqrt(mean_squared_error(y_test,pred)),4))

MAE on the test data: 0.0289
MSE on the test data: 0.0053
RMSE on the test data: 0.073
