In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import DataframeBuilder

In [8]:
target_type = "TotalPrice"
presence_type = "binary"

In [9]:
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model", target_type=target_type)


In [10]:
vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

In [11]:
# Create the machine learning model steps here, including training and testing LR
# Separating features and target
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
y = vehicle_presence_df[target_type]

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model
random_forest = RandomForestRegressor()
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for tuning
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', return_train_score=True)
grid_search.fit(X_train, y_train)

# Get the results
results = grid_search.cv_results_

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'max_depth': results['param_max_depth'],
    'max_features': results['param_max_features'],
    'min_samples_split': results['param_min_samples_split'],
    'min_samples_leaf': results['param_min_samples_leaf'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

     n_estimators max_depth max_features  min_samples_split  min_samples_leaf  \
0             100      None         auto                  2                 1   
1             200      None         auto                  2                 1   
2             300      None         auto                  2                 1   
3             100      None         auto                  5                 1   
4             200      None         auto                  5                 1   
..            ...       ...          ...                ...               ...   
319           200        30         log2                  5                 4   
320           300        30         log2                  5                 4   
321           100        30         log2                 10                 4   
322           200        30         log2                 10                 4   
323           300        30         log2                 10                 4   

     mean_test_score  std_t

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Ana

In [13]:
target_type = "TotalPrice"
presence_type = "continuous"
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model",
                                                        target_type=target_type)

vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)
# Create the machine learning model steps here, including training and testing LR
# Separating features and target
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
y = vehicle_presence_df[target_type]

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model
random_forest = RandomForestRegressor()
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for tuning
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5,
                           scoring='neg_mean_absolute_percentage_error', return_train_score=True)
grid_search.fit(X_train, y_train)

# Get the results
results = grid_search.cv_results_

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'max_depth': results['param_max_depth'],
    'max_features': results['param_max_features'],
    'min_samples_split': results['param_min_samples_split'],
    'min_samples_leaf': results['param_min_samples_leaf'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Ana

     n_estimators max_depth max_features  min_samples_split  min_samples_leaf  \
0             100      None         auto                  2                 1   
1             200      None         auto                  2                 1   
2             300      None         auto                  2                 1   
3             100      None         auto                  5                 1   
4             200      None         auto                  5                 1   
..            ...       ...          ...                ...               ...   
319           200        30         log2                  5                 4   
320           300        30         log2                  5                 4   
321           100        30         log2                 10                 4   
322           200        30         log2                 10                 4   
323           300        30         log2                 10                 4   

     mean_test_score  std_t

In [14]:
target_type = "TotalPartsSold"
presence_type = "binary"
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model",
                                                        target_type=target_type)

vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)
# Create the machine learning model steps here, including training and testing LR
# Separating features and target
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
y = vehicle_presence_df[target_type]

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model
random_forest = RandomForestRegressor()
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for tuning
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5,
                           scoring='neg_mean_absolute_percentage_error', return_train_score=True)
grid_search.fit(X_train, y_train)

# Get the results
results = grid_search.cv_results_

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'max_depth': results['param_max_depth'],
    'max_features': results['param_max_features'],
    'min_samples_split': results['param_min_samples_split'],
    'min_samples_leaf': results['param_min_samples_leaf'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Ana

     n_estimators max_depth max_features  min_samples_split  min_samples_leaf  \
0             100      None         auto                  2                 1   
1             200      None         auto                  2                 1   
2             300      None         auto                  2                 1   
3             100      None         auto                  5                 1   
4             200      None         auto                  5                 1   
..            ...       ...          ...                ...               ...   
319           200        30         log2                  5                 4   
320           300        30         log2                  5                 4   
321           100        30         log2                 10                 4   
322           200        30         log2                 10                 4   
323           300        30         log2                 10                 4   

     mean_test_score  std_t

In [15]:
target_type = "TotalPartsSold"
presence_type = "continuous"
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model",
                                                        target_type=target_type)

vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)
# Create the machine learning model steps here, including training and testing LR
# Separating features and target
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
y = vehicle_presence_df[target_type]

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and fit the Lasso regression model
random_forest = RandomForestRegressor()
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for tuning
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5,
                           scoring='neg_mean_absolute_percentage_error', return_train_score=True)
grid_search.fit(X_train, y_train)

# Get the results
results = grid_search.cv_results_

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'max_depth': results['param_max_depth'],
    'max_features': results['param_max_features'],
    'min_samples_split': results['param_min_samples_split'],
    'min_samples_leaf': results['param_min_samples_leaf'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Analysis\.venv\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\aljo9\PycharmProjects\NV_Picapart_Ana

     n_estimators max_depth max_features  min_samples_split  min_samples_leaf  \
0             100      None         auto                  2                 1   
1             200      None         auto                  2                 1   
2             300      None         auto                  2                 1   
3             100      None         auto                  5                 1   
4             200      None         auto                  5                 1   
..            ...       ...          ...                ...               ...   
319           200        30         log2                  5                 4   
320           300        30         log2                  5                 4   
321           100        30         log2                 10                 4   
322           200        30         log2                 10                 4   
323           300        30         log2                 10                 4   

     mean_test_score  std_t

import pandas as pd
# Get the coefficients from the RF Model
coef = random_forest.feature_importances_

# Create a DataFrame for feature importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': coef})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Display the top features
print(feature_importance.head(10))