In [1]:
import StoredQueries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

In [2]:
vehicle_presence_df = StoredQueries.vehicle_presence(presence_type="continuous", feature_ratio=100, vehicle_type="year_model")

In [3]:
vehicle_presence_df.shape

(965, 11)

In [4]:
print( vehicle_presence_df.isna().any(axis=1).sum())

0


In [5]:
vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

In [6]:
vehicle_presence_df.head()

Unnamed: 0,Date,TotalPartsSold,TotalPrice,2005.0_ALTIMA,2002.0_ACCORD,2006.0_ALTIMA,1999.0_CAMRY,2000.0_ACCORD,1999.0_ACCORD,1998.0_ACCORD,2006.0_IMPALA
1414,2021-03-19,28.0,545.52,15,4,10,5,4,8,5,1
1415,2021-03-20,22.0,856.19,11,4,10,5,4,8,5,1
1417,2021-03-22,23.0,508.47,11,4,10,5,4,8,5,1
1418,2021-03-23,10.0,175.9,11,4,10,5,4,8,5,1
1419,2021-03-24,18.0,177.98,11,4,10,5,4,8,5,1


In [8]:
# Create the machine learning model steps here, including training and testing LR
# Separating features and target
X = vehicle_presence_df.drop(columns=['TotalPartsSold', 'TotalPrice', 'Date'])
y = vehicle_presence_df['TotalPartsSold']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model
lasso = Lasso(alpha=1)  # You may need to adjust alpha based on your data
lasso.fit(X_train, y_train)

# Evaluate the model using cross-validation
cv_scores = cross_val_score(lasso, X_scaled, y, cv=20)
print("Cross-validated scores:", cv_scores)
print("Average CV score:", np.mean(cv_scores))
print("Standard deviation of CV scores:", np.std(cv_scores))

Cross-validated scores: [-2.89354726e+01 -1.77244877e+01 -4.44438333e+00 -4.82202738e+00
 -1.64333867e+00 -1.06106095e+00 -7.43625706e-01 -2.30509607e+00
 -1.17002139e+00 -1.03397278e+00 -3.90886518e-01 -4.30030923e-01
  1.06300591e-02 -1.83631944e-03 -1.45607053e+00 -1.43538105e+00
 -2.13435467e+00 -9.35176358e-01 -2.77339043e+00 -1.18097907e+00]
Average CV score: -3.7305481172279906
Standard deviation of CV scores: 6.882039589983511


In [10]:
# Get the coefficients from the Lasso model
coef = lasso.coef_

# Create a DataFrame for feature importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': coef})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Display the top features
print(feature_importance.head(100))

         feature  importance
2  2006.0_ALTIMA    4.688834
1  2002.0_ACCORD   -0.000000
7  2006.0_IMPALA    0.000000
3   1999.0_CAMRY   -2.128219
6  1998.0_ACCORD   -5.448527
4  2000.0_ACCORD   -6.746840
5  1999.0_ACCORD   -8.760294
0  2005.0_ALTIMA  -11.596995


In [11]:
feature_importance.shape

(8, 2)