In [5]:
import StoredQueries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# For visualization of quartile results
import matplotlib.pyplot as plt

In [2]:
vehicle_presence_df = StoredQueries.vehicle_presence(feature_ratio=100, vehicle_type="year_model")


In [14]:
vehicle_presence_df.head()

Unnamed: 0,Date,TotalPartsSold,2002.0_PT CRUISER,2006.0_ELANTRA,1999.0_ACCORD,2005.0_ALTIMA,2001.0_ACCORD,2006.0_IMPALA,2000.0_ACCORD,2003.0_ALTIMA,...,2007.0_IMPALA,1999.0_COROLLA,1999.0_EXPEDITION,2000.0_EXPEDITION,2005.0_PT CRUISER,2004.0_PT CRUISER,2004.0_IMPALA,2001.0_MUSTANG,2000.0_DURANGO,2006.0_ALTIMA
0,2017-05-05,570.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2017-05-06,891.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2017-05-07,844.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2017-05-08,606.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2017-05-09,632.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Create the machine learning model steps here, including training and testing LR
# Separating features and target
X = vehicle_presence_df.drop(columns=['TotalPartsSold', 'Date'])
y = vehicle_presence_df['TotalPartsSold']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model
lasso = Lasso(alpha=0.1)  # You may need to adjust alpha based on your data
lasso.fit(X_train, y_train)

# Evaluate the model using cross-validation
cv_scores = cross_val_score(lasso, X_scaled, y, cv=5)
print("Cross-validated scores:", cv_scores)
print("Average CV score:", np.mean(cv_scores))
print("Standard deviation of CV scores:", np.std(cv_scores))

Cross-validated scores: [-0.5699087  -0.09047342 -0.19410732 -0.08908855 -0.11221085]
Average CV score: -0.21115777078138903
Standard deviation of CV scores: 0.1834388445184463


In [11]:
# Get the coefficients from the Lasso model
coef = lasso.coef_

# Create a DataFrame for feature importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': coef})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Display the top features
print(feature_importance.head(10))

              feature  importance
16      2007.0_IMPALA   13.370693
3       2005.0_ALTIMA   10.046902
20  2005.0_PT CRUISER    9.416929
14       2000.0_CIVIC    8.798968
25      2006.0_ALTIMA    7.904442
10  1999.0_RAM PICKUP    6.699431
4       2001.0_ACCORD    5.741807
24     2000.0_DURANGO    3.208137
6       2000.0_ACCORD    2.887188
12       1998.0_CAMRY    2.457479


In [13]:
# Rank features and assign quartiles
feature_importance['rank'] = feature_importance['importance'].rank(ascending=False)
feature_importance['quartile'] = pd.qcut(feature_importance['rank'], q=4, labels=[1, 0.75, 0.5, 0.25])

# Display the feature importance with quartiles
print(feature_importance.head(100))

              feature  importance  rank quartile
16      2007.0_IMPALA   13.370693   1.0     1.00
3       2005.0_ALTIMA   10.046902   2.0     1.00
20  2005.0_PT CRUISER    9.416929   3.0     1.00
14       2000.0_CIVIC    8.798968   4.0     1.00
25      2006.0_ALTIMA    7.904442   5.0     1.00
10  1999.0_RAM PICKUP    6.699431   6.0     1.00
4       2001.0_ACCORD    5.741807   7.0     1.00
24     2000.0_DURANGO    3.208137   8.0     0.75
6       2000.0_ACCORD    2.887188   9.0     0.75
12       1998.0_CAMRY    2.457479  10.0     0.75
18  1999.0_EXPEDITION    0.625270  11.0     0.75
0   2002.0_PT CRUISER   -0.000000  12.0     0.75
9       2005.0_IMPALA   -0.333265  13.0     0.75
1      2006.0_ELANTRA   -0.592975  14.0     0.50
23     2001.0_MUSTANG   -0.717004  15.0     0.50
8        1999.0_CAMRY   -0.932154  16.0     0.50
5       2006.0_IMPALA   -1.043839  17.0     0.50
15    2002.0_S SERIES   -1.504037  18.0     0.50
21  2004.0_PT CRUISER   -1.589403  19.0     0.50
19  2000.0_EXPEDITIO