In [5]:
import StoredQueries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# For visualization of quartile results
import matplotlib.pyplot as plt

In [2]:
# Load the vehicle-presence dataset through the project's StoredQueries helper.
# NOTE(review): the meaning of each option below is defined inside
# StoredQueries.vehicle_presence — confirm there before changing any of them.
vehicle_presence_df = StoredQueries.vehicle_presence(
    presence_type="continuous",
    feature_ratio=100,
    vehicle_type="year_model",
    target_type="TotalPrice",
)


In [14]:
# Preview the first five rows to sanity-check the loaded columns.
vehicle_presence_df.head(5)

In [8]:
# Build the modelling inputs: the regression target and the feature matrix.
# 'TotalPrice' is the target; 'Date' is dropped from the features as well.
y = vehicle_presence_df['TotalPrice']
X = vehicle_presence_df.drop(['TotalPrice', 'Date'], axis="columns")

# Standardize the feature columns; the fitted scaler is kept for later reuse.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed in the next cell, so test-row statistics influence the scaling —
# consider fitting it on the training rows only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [10]:
# Hold out 20% of the rows for a final check; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model on the training rows only.
lasso = Lasso(alpha=0.1)  # You may need to adjust alpha based on your data
lasso.fit(X_train, y_train)

# Cross-validate on the training rows only. Running CV on the full data would
# let the hold-out rows influence any decision taken from these scores (for
# example tuning alpha), so the hold-out split would no longer be independent.
# cross_val_score works on clones, so the fitted `lasso` above is not altered.
cv_scores = cross_val_score(lasso, X_train, y_train, cv=5)
print("Cross-validated scores:", cv_scores)
print("Average CV score:", np.mean(cv_scores))
print("Standard deviation of CV scores:", np.std(cv_scores))

# Score the fitted model on the hold-out rows, which were previously split off
# but never evaluated. For a regressor, .score() reports R^2.
test_score = lasso.score(X_test, y_test)
print("Hold-out test R^2:", test_score)

In [11]:
# Get the coefficients from the fitted Lasso model (one per feature column).
coef = lasso.coef_

# Pair each feature name with its signed coefficient.
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': coef})

# Rank by coefficient magnitude rather than signed value: a large negative
# coefficient is as influential as a large positive one, and sorting on the
# signed value would place zeroed-out features above strong negative ones.
# The 'importance' column keeps its sign so the direction of the effect is
# still visible.
abs_order = feature_importance['importance'].abs().sort_values(ascending=False).index
feature_importance = feature_importance.loc[abs_order]

# Display the ten features with the largest absolute coefficients.
print(feature_importance.head(10))