In [10]:
# This is the final test of the LASSO model using the tuning parameters discovered during the tuning and training process. It will be used to compare coefficients of the model to a list of high-yield vehicles using domain knowledge provided by Luke.

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import DataframeBuilder

In [12]:
# Final LASSO fit: for each target type, build the vehicle-presence frame, drop the
# least popular vehicle columns, standardise the features, fit a Lasso model and
# export the per-vehicle coefficients (with a ranking) to a CSV file.
target_type_list = ["TotalPrice", "TotalPartsSold"]
presence_type = "continuous"
quantile_threshold_list = [0.1]  # Controls the proportion of the lowest popularity vehicle types to be dropped
imputation_multiplier_list = [4]  # Controls the number of times the dataframe records are duplicated

# The previous run of this cell emitted scikit-learn's "Objective did not converge"
# warning for both targets, i.e. the exported coefficients came from an unfinished
# optimisation. Allow more coordinate-descent iterations than the default; raise this
# further if the warning still appears.
LASSO_MAX_ITER = 10_000

# NOTE(review): the output file name below depends only on target_type, so adding more
# values to quantile_threshold_list / imputation_multiplier_list would overwrite the
# same CSV on every inner iteration.
for target_type in target_type_list:
    for quantile_threshold in quantile_threshold_list:
        for imputation_multiplier in imputation_multiplier_list:

            vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model", target_type=target_type)

            # Create additional records based on the imputation multiplier
            # (the rows are repeated verbatim).
            # NOTE(review): train_test_split shuffles by default, so copies of the same
            # row can end up in both the training and the test partition - confirm this
            # is acceptable for how the held-out predictions are used.
            vehicle_presence_df = pd.concat([vehicle_presence_df] * imputation_multiplier, ignore_index=True)

            if presence_type == "continuous":

                # Find which target column is present (same priority order as before) so
                # that it is excluded from the per-vehicle popularity sums.
                present_target = next(
                    (name for name in ("TotalPrice", "TotalPartsSold") if name in vehicle_presence_df.columns),
                    None,
                )

                if present_target is None:
                    # Previously this case raised NameError on the first iteration (or
                    # silently reused `sums` from an earlier iteration) because the
                    # ValueError handler below never covered it.
                    print("Sums not found, TotalPrice and TotalPartsSold are not available in vehicle_presence_df.")
                else:
                    sums = vehicle_presence_df.drop(columns=[present_target, 'Date']).sum()

                    try:
                        # Determine the threshold for the bottom x% of sums
                        threshold = sums.quantile(quantile_threshold)

                        # Find columns to drop
                        cols_to_drop = sums[sums <= threshold].index.tolist()

                        # Drop columns from the dataframe
                        vehicle_presence_df = vehicle_presence_df.drop(columns=cols_to_drop)

                    except ValueError as err:
                        # Report the real cause instead of the unrelated "Sums not found"
                        # text; the unfiltered frame is used for the fit, as before.
                        print(f"Could not apply quantile threshold {quantile_threshold}: {err}")

            # scikit-learn expects uniformly typed (string) feature names
            vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

            # Create the machine learning model steps here, including training and testing LR
            # Separating features and target
            X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
            y = vehicle_presence_df[target_type]

            # Normalize the features
            # NOTE(review): the scaler is fitted on all rows before the split, so test-set
            # statistics influence the training features - confirm this is intended.
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)

            # Split the data for training and testing
            X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

            # Initialize and fit the Lasso regression model
            lasso = Lasso(alpha=0.001, random_state=42, max_iter=LASSO_MAX_ITER)
            lasso.fit(X_train, y_train)
            y_pred = lasso.predict(X_test)

            # X_scaled keeps the column order of X (scaling and row-wise splitting do not
            # reorder features), so lasso.coef_[i] belongs to X.columns[i].
            coefficient_df = pd.DataFrame({
                'Vehicle Year-Model': X.columns,
                'Coefficient': lasso.coef_
            })
            # Rank 1 is the largest coefficient; ties share the lowest rank of their group.
            coefficient_df['Ranking'] = coefficient_df['Coefficient'].rank(ascending=False, method='min')
            # NOTE(review): sorting by Ranking in descending order writes the smallest
            # coefficients first - confirm that is the intended order of the CSV.
            coefficient_df = coefficient_df.sort_values(by='Ranking', ascending=False)
            coefficient_df.to_csv(f"LASSO_{target_type}_Final.csv", index=False)
            
            
            
         
            
           
            
        

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
