In [1]:
import StoredQueries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import DataframeBuilder
import StoredQueries

In [2]:
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type="binary", feature_ratio="None", vehicle_type="year_model", target_type="TotalPartsSold")

In [3]:
vehicle_presence_df.shape

(965, 6321)

In [4]:
print( vehicle_presence_df.isna().any(axis=1).sum())

0


In [5]:
vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

In [6]:
# vehicle_presence_df.head()

In [7]:
# Create the machine learning model steps here, including training and testing LR
# Separating features and target
X = vehicle_presence_df.drop(columns=['TotalPartsSold', 'Date'])
y = vehicle_presence_df['TotalPartsSold']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model
lasso = Lasso(alpha=1)  # You may need to adjust alpha based on your data
lasso.fit(X_train, y_train)

# Evaluate the model using cross-validation
cv_scores = cross_val_score(lasso, X_scaled, y, cv=20)
print("Cross-validated scores:", cv_scores)
print("Average CV score:", np.mean(cv_scores))
print("Standard deviation of CV scores:", np.std(cv_scores))

Cross-validated scores: [-12.31451965  -1.95686349  -1.04293957  -0.89014461  -0.28085745
   0.04320292  -0.03727358  -0.58592607  -0.29679639  -0.12206784
  -0.05744298   0.04594601  -0.47143828  -0.05259014  -0.10402237
  -0.32348293  -0.49355713   0.04193589  -0.13401139  -0.34964719]
Average CV score: -0.9691248132515616
Standard deviation of CV scores: 2.6441693198351515


In [9]:
# Get the coefficients from the Lasso model
coef = lasso.coef_

# Create a DataFrame for feature importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': coef})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Display the top features
print(feature_importance.head(100))

                     feature  importance
239            2000.0_SEPHIA    3.428193
5801          2011.0_KIZASHI    3.111663
5895           2008.0_S-TYPE    2.938090
6266  2022.0_OUTLANDER SPORT    2.414517
5974               2011.0_XD    2.406530
...                      ...         ...
84              1998.0_C1500   -0.000000
85          2001.0_SILVERADO    0.000000
86             1997.0_SIERRA   -0.000000
87              1989.0_C1500    0.000000
88           1997.0_SUBURBAN   -0.000000

[100 rows x 2 columns]


In [10]:
feature_importance.shape

(6319, 2)