In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import DataframeBuilder

In [2]:
# Random-forest hyper-parameter search for the "TotalPartsSold" target.
#
# For every (target, quantile threshold, imputation multiplier) combination the
# cell builds the vehicle-presence frame, stacks `imputation_multiplier` copies
# of it, drops the least-populated feature columns, runs a grid search and
# writes the cross-validation scores to
# '<target>_<quantile>_<multiplier>_RF_Results.csv'.
#
# Because the stacked frame contains exact copies of each record, all splitting
# is done per source record (group-aware) so that copies of one record never sit
# on both the training and the validation side of a split.
target_type_list = ["TotalPartsSold"]
presence_type = "continuous"
quantile_threshold_list = [0.1]  # Controls the proportion of the lowest popularity vehicle types to be dropped
imputation_multiplier_list = [2, 3, 4]  # Controls the number of times the dataframe records are duplicated
for target_type in target_type_list:
    for quantile_threshold in quantile_threshold_list:
        for imputation_multiplier in imputation_multiplier_list:

            base_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model", target_type=target_type)

            # Create additional records based on the imputation multiplier
            vehicle_presence_df = pd.concat([base_presence_df] * imputation_multiplier, ignore_index=True)

            # The copies are stacked one after another, so row i of every copy
            # comes from row i of the base frame; that position is the group id
            # used to keep duplicates together when splitting.
            record_ids = pd.Series(list(range(len(base_presence_df))) * imputation_multiplier).to_numpy()

            if presence_type == "continuous":
                # Same precedence as before: 'TotalPrice' is checked first.
                present_targets = [col for col in ('TotalPrice', 'TotalPartsSold') if col in vehicle_presence_df.columns]

                if present_targets:
                    sums = vehicle_presence_df.drop(columns=[present_targets[0], 'Date']).sum()

                    # Determine the threshold for the bottom x% of sums
                    threshold = sums.quantile(quantile_threshold)

                    # Find columns to drop
                    cols_to_drop = sums[sums <= threshold].index.tolist()

                    # Drop columns from the dataframe
                    vehicle_presence_df = vehicle_presence_df.drop(columns=cols_to_drop)
                else:
                    # Handled explicitly: without this branch `sums` would be
                    # unbound (or left over from the previous iteration).
                    print("Sums not found, TotalPrice and TotalPartsSold are not available in vehicle_presence_df.")

            vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

            # Separating features and target
            X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
            y = vehicle_presence_df[target_type]

            # Hold out 20% of the source records (all of their copies together)
            holdout_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
            train_idx, test_idx = next(holdout_splitter.split(X, y, groups=record_ids))

            # Normalize the features; the scaling statistics come from the
            # training rows only so the held-out rows do not influence them.
            scaler = StandardScaler()
            scaler.fit(X.iloc[train_idx])
            X_scaled = scaler.transform(X)

            X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Random forest regressor; seeded so the saved scores are reproducible
            random_forest = RandomForestRegressor(random_state=42)
            # Define the parameter grid
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20],
                'max_features': ['log2'],
                'min_samples_split': [10],
                'min_samples_leaf': [4]
            }

            # Use GridSearchCV for tuning. GroupKFold keeps every copy of a
            # record inside a single fold. Scores are negated MAPE (higher is better).
            grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=GroupKFold(n_splits=5), scoring='neg_mean_absolute_percentage_error', return_train_score=True)
            grid_search.fit(X_train, y_train, groups=record_ids[train_idx])

            # Get the results
            results = grid_search.cv_results_

            # Create a DataFrame to display results
            results_df = pd.DataFrame({
                'n_estimators': results['param_n_estimators'],
                'max_depth': results['param_max_depth'],
                'max_features': results['param_max_features'],
                'min_samples_split': results['param_min_samples_split'],
                'min_samples_leaf': results['param_min_samples_leaf'],
                'mean_test_score': results['mean_test_score'],
                'std_test_score': results['std_test_score']
            })

            results_df.to_csv(f'{target_type}_{quantile_threshold}_{imputation_multiplier}_RF_Results.csv', index=False)

In [3]:
# Random-forest hyper-parameter search for the "TotalPrice" target.
#
# For every (target, quantile threshold, imputation multiplier) combination the
# cell builds the vehicle-presence frame, stacks `imputation_multiplier` copies
# of it, drops the least-populated feature columns, runs a grid search and
# writes the cross-validation scores to
# '<target>_<quantile>_<multiplier>_RF_Results.csv'.
#
# Because the stacked frame contains exact copies of each record, all splitting
# is done per source record (group-aware) so that copies of one record never sit
# on both the training and the validation side of a split.
target_type_list = ["TotalPrice"]
presence_type = "continuous"
quantile_threshold_list = [0.1]  # Controls the proportion of the lowest popularity vehicle types to be dropped
imputation_multiplier_list = [2, 3, 4]  # Controls the number of times the dataframe records are duplicated
for target_type in target_type_list:
    for quantile_threshold in quantile_threshold_list:
        for imputation_multiplier in imputation_multiplier_list:

            base_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model", target_type=target_type)

            # Create additional records based on the imputation multiplier
            vehicle_presence_df = pd.concat([base_presence_df] * imputation_multiplier, ignore_index=True)

            # The copies are stacked one after another, so row i of every copy
            # comes from row i of the base frame; that position is the group id
            # used to keep duplicates together when splitting.
            record_ids = pd.Series(list(range(len(base_presence_df))) * imputation_multiplier).to_numpy()

            if presence_type == "continuous":
                # Same precedence as before: 'TotalPrice' is checked first.
                present_targets = [col for col in ('TotalPrice', 'TotalPartsSold') if col in vehicle_presence_df.columns]

                if present_targets:
                    sums = vehicle_presence_df.drop(columns=[present_targets[0], 'Date']).sum()

                    # Determine the threshold for the bottom x% of sums
                    threshold = sums.quantile(quantile_threshold)

                    # Find columns to drop
                    cols_to_drop = sums[sums <= threshold].index.tolist()

                    # Drop columns from the dataframe
                    vehicle_presence_df = vehicle_presence_df.drop(columns=cols_to_drop)
                else:
                    # Handled explicitly: without this branch `sums` would be
                    # unbound (or left over from the previous iteration).
                    print("Sums not found, TotalPrice and TotalPartsSold are not available in vehicle_presence_df.")

            vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

            # Separating features and target
            X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
            y = vehicle_presence_df[target_type]

            # Hold out 20% of the source records (all of their copies together)
            holdout_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
            train_idx, test_idx = next(holdout_splitter.split(X, y, groups=record_ids))

            # Normalize the features; the scaling statistics come from the
            # training rows only so the held-out rows do not influence them.
            scaler = StandardScaler()
            scaler.fit(X.iloc[train_idx])
            X_scaled = scaler.transform(X)

            X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Random forest regressor; seeded so the saved scores are reproducible
            random_forest = RandomForestRegressor(random_state=42)
            # Define the parameter grid
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, 30],
                'max_features': ['log2'],
                'min_samples_split': [5, 10],
                'min_samples_leaf': [4]
            }

            # Use GridSearchCV for tuning. GroupKFold keeps every copy of a
            # record inside a single fold. Scores are negated MAPE (higher is better).
            grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=GroupKFold(n_splits=5), scoring='neg_mean_absolute_percentage_error', return_train_score=True)
            grid_search.fit(X_train, y_train, groups=record_ids[train_idx])

            # Get the results
            results = grid_search.cv_results_

            # Create a DataFrame to display results
            results_df = pd.DataFrame({
                'n_estimators': results['param_n_estimators'],
                'max_depth': results['param_max_depth'],
                'max_features': results['param_max_features'],
                'min_samples_split': results['param_min_samples_split'],
                'min_samples_leaf': results['param_min_samples_leaf'],
                'mean_test_score': results['mean_test_score'],
                'std_test_score': results['std_test_score']
            })

            results_df.to_csv(f'{target_type}_{quantile_threshold}_{imputation_multiplier}_RF_Results.csv', index=False)