In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

from sklearn.model_selection import GridSearchCV

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
tracks_dataset = pd.read_csv("Group_18_data_cleaned.csv") #read the excel file 
tracks_dataset.head()

Unnamed: 0,oid,timestamp,x,y,body_roll,body_pitch,body_yaw,head_roll,head_pitch,head_yaw,other_oid,other_class,other_x,other_y
0,50187,1842.4,495854.6403,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[47646, 50181, 50184, 50187]","[0, 4, 4, 4]","[495923.373133135, 495899.069769386, 495899.05...","[5405744.32136751, 5405738.47595118, 5405739.1..."
1,50187,1842.5,495854.7921,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[50181, 50187, 50184, 47646]","[4, 4, 4, 0]","[495899.234566716, 495854.792078353, 495899.22...","[5405738.39126416, 5405750.93930797, 5405739.2..."
2,50187,1842.6,495854.9438,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[47646, 50187, 50184, 50181]","[0, 4, 4, 4]","[495921.779445452, 495854.943847121, 495899.35...","[5405744.51929698, 5405750.96626812, 5405739.1..."
3,50187,1842.7,495855.0956,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[50187, 47646, 50184, 50181]","[4, 0, 4, 4]","[495855.09561589, 495920.943052671, 495899.490...","[5405750.99322827, 5405744.63008031, 5405739.1..."
4,50187,1842.8,495855.2569,5405751.0,0.3,-0.71,190.56,-0.25,-1.09,186.37,"[50187, 50184, 50181, 47646]","[4, 4, 4, 0]","[495855.256935427, 495899.585908147, 495899.72...","[5405751.02150176, 5405739.0332702, 5405738.08..."


In [3]:
tracks_dataset.shape

(4759, 14)

In [4]:
X = tracks_dataset[['body_roll','body_pitch','body_yaw','head_roll','head_pitch','head_yaw']]
X

Unnamed: 0,body_roll,body_pitch,body_yaw,head_roll,head_pitch,head_yaw
0,0.3,-0.71,190.56,-0.25,-1.09,186.37
1,0.3,-0.71,190.56,-0.25,-1.09,186.37
2,0.3,-0.71,190.56,-0.25,-1.09,186.37
3,0.3,-0.71,190.56,-0.25,-1.09,186.37
4,0.3,-0.71,190.56,-0.25,-1.09,186.37
...,...,...,...,...,...,...
4754,0.3,-0.71,190.56,-0.25,-1.09,186.37
4755,0.3,-0.71,190.56,-0.25,-1.09,186.37
4756,0.3,-0.71,190.56,-0.25,-1.09,186.37
4757,0.3,-0.71,190.56,-0.25,-1.09,186.37


In [5]:
X.corr()

Unnamed: 0,body_roll,body_pitch,body_yaw,head_roll,head_pitch,head_yaw
body_roll,1.0,-0.068315,0.071149,0.2454,0.026663,0.047139
body_pitch,-0.068315,1.0,-0.117957,0.058121,0.293689,-0.060023
body_yaw,0.071149,-0.117957,1.0,-0.055772,-0.073947,0.826301
head_roll,0.2454,0.058121,-0.055772,1.0,0.025732,0.031402
head_pitch,0.026663,0.293689,-0.073947,0.025732,1.0,-0.01539
head_yaw,0.047139,-0.060023,0.826301,0.031402,-0.01539,1.0


LASSO -  Lasso stands for Least Absolute Shrinkage and Selection Operator.
It is a type of linear regression that uses shrinkage.
What LASSO does well is to provide a principled way to reduce the number of features in a model. In contrast, automated feature selection based on standard linear regression by stepwise selection or choosing features with the lowest p-values has many drawbacks. Advantages of LASSO over other regression-based approaches are specifically described here. LASSO involves a penalty factor that determines how many features are retained; using cross-validation to choose the penalty factor helps assure that the model will generalize well to future data samples.

In [6]:
y = tracks_dataset['head_yaw']
y

0       186.37
1       186.37
2       186.37
3       186.37
4       186.37
         ...  
4754    186.37
4755    186.37
4756    186.37
4757    186.37
4758    186.37
Name: head_yaw, Length: 4759, dtype: float64

In [7]:
X = X.drop(['head_yaw'],axis=1)

In [8]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=42)

In [9]:
print(X_train.shape, X_valid.shape, X_test.shape)

(2331, 5) (1000, 5) (1428, 5)


In [10]:
from sklearn.linear_model import Lasso

In [11]:
features = X.columns
features

Index(['body_roll', 'body_pitch', 'body_yaw', 'head_roll', 'head_pitch'], dtype='object')

In [12]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('model',Lasso())])

In [13]:
search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error",verbose=3)

In [14]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 1/5] END ...............model__alpha=0.1;, score=-795.311 total time=   0.0s
[CV 2/5] END ...............model__alpha=0.1;, score=-752.544 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.1;, score=-1311.174 total time=   0.0s
[CV 4/5] END ...............model__alpha=0.1;, score=-819.622 total time=   0.0s
[CV 5/5] END ...............model__alpha=0.1;, score=-378.661 total time=   0.0s
[CV 1/5] END ...............model__alpha=0.2;, score=-794.879 total time=   0.0s
[CV 2/5] END ...............model__alpha=0.2;, score=-751.341 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.2;, score=-1309.559 total time=   0.0s
[CV 4/5] END ...............model__alpha=0.2;, score=-819.530 total time=   0.0s
[CV 5/5] END ...............model__alpha=0.2;, score=-378.101 total time=   0.0s
[CV 1/5] END model__alpha=0.30000000000000004;, score=-794.485 total time=   0.0s
[CV 2/5] END model__alpha=0.30000000000000004;

[CV 4/5] END model__alpha=2.3000000000000003;, score=-820.034 total time=   0.0s
[CV 5/5] END model__alpha=2.3000000000000003;, score=-375.853 total time=   0.0s
[CV 1/5] END model__alpha=2.4000000000000004;, score=-792.299 total time=   0.0s
[CV 2/5] END model__alpha=2.4000000000000004;, score=-753.772 total time=   0.0s
[CV 3/5] END model__alpha=2.4000000000000004;, score=-1294.933 total time=   0.0s
[CV 4/5] END model__alpha=2.4000000000000004;, score=-820.191 total time=   0.0s
[CV 5/5] END model__alpha=2.4000000000000004;, score=-375.935 total time=   0.0s
[CV 1/5] END model__alpha=2.5000000000000004;, score=-792.325 total time=   0.0s
[CV 2/5] END model__alpha=2.5000000000000004;, score=-753.992 total time=   0.0s
[CV 3/5] END model__alpha=2.5000000000000004;, score=-1294.851 total time=   0.0s
[CV 4/5] END model__alpha=2.5000000000000004;, score=-820.349 total time=   0.0s
[CV 5/5] END model__alpha=2.5000000000000004;, score=-376.023 total time=   0.0s
[CV 1/5] END .............

[CV 5/5] END ...............model__alpha=4.7;, score=-379.766 total time=   0.0s
[CV 1/5] END ...............model__alpha=4.8;, score=-794.380 total time=   0.0s
[CV 2/5] END ...............model__alpha=4.8;, score=-759.834 total time=   0.0s
[CV 3/5] END ..............model__alpha=4.8;, score=-1293.673 total time=   0.0s
[CV 4/5] END ...............model__alpha=4.8;, score=-824.847 total time=   0.0s
[CV 5/5] END ...............model__alpha=4.8;, score=-380.005 total time=   0.0s
[CV 1/5] END ...............model__alpha=4.9;, score=-794.529 total time=   0.0s
[CV 2/5] END ...............model__alpha=4.9;, score=-760.122 total time=   0.0s
[CV 3/5] END ..............model__alpha=4.9;, score=-1293.653 total time=   0.0s
[CV 4/5] END ...............model__alpha=4.9;, score=-825.090 total time=   0.0s
[CV 5/5] END ...............model__alpha=4.9;, score=-380.248 total time=   0.0s
[CV 1/5] END ...............model__alpha=5.0;, score=-794.682 total time=   0.0s
[CV 2/5] END ...............

[CV 5/5] END ...............model__alpha=7.1;, score=-386.778 total time=   0.0s
[CV 1/5] END ...............model__alpha=7.2;, score=-799.316 total time=   0.0s
[CV 2/5] END ...............model__alpha=7.2;, score=-767.501 total time=   0.0s
[CV 3/5] END ..............model__alpha=7.2;, score=-1293.909 total time=   0.0s
[CV 4/5] END ...............model__alpha=7.2;, score=-831.739 total time=   0.0s
[CV 5/5] END ...............model__alpha=7.2;, score=-387.128 total time=   0.0s
[CV 1/5] END ...............model__alpha=7.3;, score=-799.583 total time=   0.0s
[CV 2/5] END ...............model__alpha=7.3;, score=-767.855 total time=   0.0s
[CV 3/5] END ..............model__alpha=7.3;, score=-1293.951 total time=   0.0s
[CV 4/5] END ...............model__alpha=7.3;, score=-832.075 total time=   0.0s
[CV 5/5] END ...............model__alpha=7.3;, score=-387.484 total time=   0.0s
[CV 1/5] END ...............model__alpha=7.4;, score=-799.856 total time=   0.0s
[CV 2/5] END ...............

[CV 5/5] END ...............model__alpha=9.2;, score=-395.116 total time=   0.0s
[CV 1/5] END ...............model__alpha=9.3;, score=-805.970 total time=   0.0s
[CV 2/5] END ...............model__alpha=9.3;, score=-775.525 total time=   0.0s
[CV 3/5] END ..............model__alpha=9.3;, score=-1295.342 total time=   0.0s
[CV 4/5] END ...............model__alpha=9.3;, score=-839.611 total time=   0.0s
[CV 5/5] END ...............model__alpha=9.3;, score=-395.564 total time=   0.0s
[CV 1/5] END ...............model__alpha=9.4;, score=-806.341 total time=   0.0s
[CV 2/5] END ...............model__alpha=9.4;, score=-775.938 total time=   0.0s
[CV 3/5] END ..............model__alpha=9.4;, score=-1295.439 total time=   0.0s
[CV 4/5] END ...............model__alpha=9.4;, score=-840.028 total time=   0.0s
[CV 5/5] END ...............model__alpha=9.4;, score=-396.017 total time=   0.0s
[CV 1/5] END ...............model__alpha=9.5;, score=-806.717 total time=   0.0s
[CV 2/5] END ...............

GridSearchCV(cv=5, estimator=Pipeline(steps=[('model', Lasso())]),
             param_grid={'model__alpha': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2,
       5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5,
       6.6, 6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8,
       7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9. , 9.1,
       9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9])},
             scoring='neg_mean_squared_error', verbose=3)

In [15]:
search.best_params_
# {'model__alpha': 1.2000000000000002}

{'model__alpha': 1.2000000000000002}

In [16]:
coefficients = search.best_estimator_.named_steps['model'].coef_

In [17]:
importance = np.abs(coefficients)
importance

array([0.        , 0.16921831, 0.81192259, 1.9229217 , 0.04531236])

In [18]:
np.array(features)[importance > 0.03]

array(['body_pitch', 'body_yaw', 'head_roll', 'head_pitch'], dtype=object)

In this way, we have used a properly optimized Lasso regression to get information about the most important features of our dataset according to the given target variable.

From the above correlation matrix also we can conclude that these three ['body_roll', 'body_yaw', 'head_roll'] are important features inorder to predict the target data.

Using the best model from project -2 (Random Forest Regressor) with the optimized features

In [19]:
X = tracks_dataset[['body_roll','body_pitch','body_yaw','head_roll','head_pitch','head_yaw']]
X

# MinMaxScaler
min_max_scaler = MinMaxScaler()

X_scaled = pd.DataFrame(min_max_scaler.fit_transform(X))

# While scaling the columns names are removed, so resigining them back
X_scaled.columns = X.columns
X_scaled

y_scaled = X_scaled['head_yaw']
X_scaled = X_scaled.drop(['head_yaw'],axis=1)

In [20]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X_scaled, y_scaled, test_size=0.3, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=42)

In [21]:
X_selected_train2 = X_train[['body_pitch','body_yaw','head_roll','head_pitch']]
y_selected_train2 = y_train
y_train

2571    0.544675
2542    0.518761
4270    0.512164
2007    0.566720
1913    0.175917
          ...   
480     0.518761
115     0.351834
1496    0.518761
3776    0.518761
4097    0.324779
Name: head_yaw, Length: 2331, dtype: float64

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [23]:
X_selected_valid = X_valid[['body_pitch','body_yaw','head_roll','head_pitch']]
y_selected_valid = y_valid

In [24]:
rf = RandomForestRegressor(bootstrap= True,max_depth= 90,max_features= 3,min_samples_leaf= 3,min_samples_split= 8,
 n_estimators= 200)
rf.fit(X_selected_train2, y_selected_train2)

RandomForestRegressor(max_depth=90, max_features=3, min_samples_leaf=3,
                      min_samples_split=8, n_estimators=200)

In [25]:
pred = rf.predict(X_selected_valid)

In [26]:
print("RMSE for Random forest using LASSO variable selection:",np.round(mean_squared_error(y_selected_valid,rf.predict(X_selected_valid)),4))

RMSE for Random forest using LASSO variable selection: 0.0057


Using the best model from project -3 (SVM Non-Linear) with the optimized features

In [27]:
from sklearn.svm import SVR
SVR_Radial_Basis = SVR(kernel="rbf").fit(X_selected_train2,y_selected_train2)

svr_parameters = {"C": np.arange(0.2,5,0.1)}
svr_cv_model= GridSearchCV(SVR_Radial_Basis,svr_parameters,cv=5).fit(X_selected_train2,y_selected_train2)

In [28]:
svr_cv_model.best_params_

{'C': 0.2}

In [29]:
svr_cv_model.best_params_['C']

0.2

In [30]:
svr_tuned = SVR(kernel="rbf",
                C=svr_cv_model.best_params_['C']).fit(X_selected_train2,y_selected_train2)

In [31]:
y_pred=svr_tuned.predict(X_selected_train2)
y_pred

array([0.52955573, 0.51653144, 0.50504684, ..., 0.51653144, 0.51653144,
       0.28507125])

In [32]:
from sklearn.metrics import mean_squared_error
#Train Error
np.round(mean_squared_error(y_selected_train2,y_pred),4)

0.0048

In [33]:
X_selected_valid2 = X_valid[['body_roll','body_yaw','head_roll','head_pitch']]
y_selected_valid2 = y_valid

In [34]:
y_pred=svr_tuned.predict(X_selected_valid2)

In [35]:
#Valid Error
pred = np.round(mean_squared_error(y_selected_valid2,y_pred),4)
pred

0.013

In [36]:
print('RMSE with SVM after using LASSO feature selection : ', np.round(mean_squared_error(y_selected_valid2,y_pred),4))

RMSE with SVM after using LASSO feature selection :  0.013


In [38]:
import pickle
# open a file, where you ant to store the data
file = open('lasso_feature_model.pkl', 'wb')
file3 = open('best_features_LAASO.pkl', 'wb')

lasso_mse = []
lasso_mse.append(np.round(mean_squared_error(y_selected_valid,rf.predict(X_selected_valid)),4))
lasso_mse.append(np.round(mean_squared_error(y_selected_valid2,y_pred),4))

best_features = X_selected_train2.columns

print(best_features)

# dump information to that file
pickle.dump(lasso_mse, file)
pickle.dump(best_features, file3)

Index(['body_pitch', 'body_yaw', 'head_roll', 'head_pitch'], dtype='object')


We got the best result using LAASO feature selection on Random Forest with MSE of 0.0057