In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
import DataframeBuilder

In [8]:
# Experiment configuration read by the following cells:
#   presence_type - encoding requested from DataframeBuilder.vehicle_presence
#   target_type   - name of the column used as the regression target
presence_type = "binary"
target_type = "TotalPartsSold"

In [9]:
# Load the vehicle-presence table from the project-local DataframeBuilder
# helper. Later cells rely on it having a 'Date' column and a column named
# after target_type; every other column is used as a feature.
vehicle_presence_df = DataframeBuilder.vehicle_presence(
    presence_type=presence_type,
    vehicle_type="year_model",
    target_type=target_type,
)

In [10]:
# Coerce every column label to str.
# NOTE(review): presumably some year_model labels are not strings, and
# scikit-learn only accepts feature names that are all strings - confirm
# against DataframeBuilder.
string_labels = vehicle_presence_df.columns.astype(str)
vehicle_presence_df.columns = string_labels

In [11]:
# Prepare the inputs for the gradient-boosting model.
# The target is the column named by target_type; the features are every
# remaining column except 'Date'.
y = vehicle_presence_df[target_type]
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])

# Standardise the features (per-column zero mean / unit variance).
# NOTE(review): the scaler is fitted on every row, before the train/test split
# done in the next cell, so held-out rows influence the scaling statistics -
# confirm this is acceptable or move the scaling into a Pipeline.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [12]:
# Hold out 20% of the rows; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Base estimator for the search. It is seeded because the grid below includes
# subsample < 1.0, which makes every boosting stage draw a random subset of
# rows; without a seed the cross-validation scores change on each run and
# cannot be compared between the experiments in this notebook.
gbr = GradientBoostingRegressor(random_state=42)

# Hyper-parameter grid: 3 * 3 * 3 * 3 = 81 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

# Exhaustive 5-fold search scored by negated MAPE (values closer to 0 are better).
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Raw per-candidate results of the search
results = grid_search.cv_results_

# One row per candidate: its parameters plus the mean / std of the fold scores
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'learning_rate': results['param_learning_rate'],
    'max_depth': results['param_max_depth'],
    'subsample': results['param_subsample'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

    n_estimators  learning_rate  max_depth  subsample  mean_test_score  \
0            100           0.01          3        0.8        -0.416921   
1            100           0.01          3        0.9        -0.415993   
2            100           0.01          3        1.0        -0.417231   
3            200           0.01          3        0.8        -0.347207   
4            200           0.01          3        0.9        -0.346124   
..           ...            ...        ...        ...              ...   
76           200           0.10          5        0.9        -0.351141   
77           200           0.10          5        1.0        -0.350328   
78           300           0.10          5        0.8        -0.362581   
79           300           0.10          5        0.9        -0.359644   
80           300           0.10          5        1.0        -0.355515   

    std_test_score  
0         0.053560  
1         0.053991  
2         0.054236  
3         0.061408  
4     

In [13]:
# Experiment: predict TotalPartsSold from "continuous" vehicle-presence features.
target_type = "TotalPartsSold"
presence_type = "continuous"
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model",
                                                        target_type=target_type)
# Coerce every column label to str before handing the frame to scikit-learn.
vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

# Separating features and target: everything except the target and 'Date'
# columns is used as a feature.
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
y = vehicle_presence_df[target_type]

# Normalize the features
# NOTE(review): the scaler is fitted on every row before the split below, so
# held-out rows influence the scaling statistics - confirm this is acceptable
# or move the scaling into a Pipeline.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hold out 20% of the rows; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Base estimator for the search. It is seeded because the grid below includes
# subsample < 1.0, which makes every boosting stage draw a random subset of
# rows; without a seed the cross-validation scores change on each run and
# cannot be compared between the experiments in this notebook.
gbr = GradientBoostingRegressor(random_state=42)

# Hyper-parameter grid: 3 * 3 * 3 * 3 = 81 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

# Exhaustive 5-fold search scored by negated MAPE (values closer to 0 are better).
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Raw per-candidate results of the search
results = grid_search.cv_results_

# One row per candidate: its parameters plus the mean / std of the fold scores
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'learning_rate': results['param_learning_rate'],
    'max_depth': results['param_max_depth'],
    'subsample': results['param_subsample'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

    n_estimators  learning_rate  max_depth  subsample  mean_test_score  \
0            100           0.01          3        0.8        -0.415424   
1            100           0.01          3        0.9        -0.414494   
2            100           0.01          3        1.0        -0.414322   
3            200           0.01          3        0.8        -0.345136   
4            200           0.01          3        0.9        -0.343583   
..           ...            ...        ...        ...              ...   
76           200           0.10          5        0.9        -0.339255   
77           200           0.10          5        1.0        -0.333698   
78           300           0.10          5        0.8        -0.340052   
79           300           0.10          5        0.9        -0.341725   
80           300           0.10          5        1.0        -0.338827   

    std_test_score  
0         0.051048  
1         0.050757  
2         0.050787  
3         0.056964  
4     

In [14]:
# Experiment: predict TotalPrice from "binary" vehicle-presence features.
target_type = "TotalPrice"
presence_type = "binary"
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model",
                                                        target_type=target_type)
# Coerce every column label to str before handing the frame to scikit-learn.
vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

# Separating features and target: everything except the target and 'Date'
# columns is used as a feature.
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
y = vehicle_presence_df[target_type]

# Normalize the features
# NOTE(review): the scaler is fitted on every row before the split below, so
# held-out rows influence the scaling statistics - confirm this is acceptable
# or move the scaling into a Pipeline.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hold out 20% of the rows; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Base estimator for the search. It is seeded because the grid below includes
# subsample < 1.0, which makes every boosting stage draw a random subset of
# rows; without a seed the cross-validation scores change on each run and
# cannot be compared between the experiments in this notebook.
gbr = GradientBoostingRegressor(random_state=42)

# Hyper-parameter grid: 3 * 3 * 3 * 3 = 81 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

# Exhaustive 5-fold search scored by negated MAPE (values closer to 0 are better).
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Raw per-candidate results of the search
results = grid_search.cv_results_

# One row per candidate: its parameters plus the mean / std of the fold scores
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'learning_rate': results['param_learning_rate'],
    'max_depth': results['param_max_depth'],
    'subsample': results['param_subsample'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

    n_estimators  learning_rate  max_depth  subsample  mean_test_score  \
0            100           0.01          3        0.8        -0.513319   
1            100           0.01          3        0.9        -0.511524   
2            100           0.01          3        1.0        -0.511877   
3            200           0.01          3        0.8        -0.452392   
4            200           0.01          3        0.9        -0.450463   
..           ...            ...        ...        ...              ...   
76           200           0.10          5        0.9        -0.469887   
77           200           0.10          5        1.0        -0.472357   
78           300           0.10          5        0.8        -0.478235   
79           300           0.10          5        0.9        -0.477695   
80           300           0.10          5        1.0        -0.481579   

    std_test_score  
0         0.129354  
1         0.128361  
2         0.127840  
3         0.145479  
4     

In [15]:
# Experiment: predict TotalPrice from "continuous" vehicle-presence features.
target_type = "TotalPrice"
presence_type = "continuous"
vehicle_presence_df = DataframeBuilder.vehicle_presence(presence_type=presence_type, vehicle_type="year_model",
                                                        target_type=target_type)
# Coerce every column label to str before handing the frame to scikit-learn.
vehicle_presence_df.columns = vehicle_presence_df.columns.astype(str)

# Separating features and target: everything except the target and 'Date'
# columns is used as a feature.
X = vehicle_presence_df.drop(columns=[target_type, 'Date'])
y = vehicle_presence_df[target_type]

# Normalize the features
# NOTE(review): the scaler is fitted on every row before the split below, so
# held-out rows influence the scaling statistics - confirm this is acceptable
# or move the scaling into a Pipeline.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hold out 20% of the rows; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Base estimator for the search. It is seeded because the grid below includes
# subsample < 1.0, which makes every boosting stage draw a random subset of
# rows; without a seed the cross-validation scores change on each run and
# cannot be compared between the experiments in this notebook.
gbr = GradientBoostingRegressor(random_state=42)

# Hyper-parameter grid: 3 * 3 * 3 * 3 = 81 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

# Exhaustive 5-fold search scored by negated MAPE (values closer to 0 are better).
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Raw per-candidate results of the search
results = grid_search.cv_results_

# One row per candidate: its parameters plus the mean / std of the fold scores
results_df = pd.DataFrame({
    'n_estimators': results['param_n_estimators'],
    'learning_rate': results['param_learning_rate'],
    'max_depth': results['param_max_depth'],
    'subsample': results['param_subsample'],
    'mean_test_score': results['mean_test_score'],
    'std_test_score': results['std_test_score']
})

print(results_df)

    n_estimators  learning_rate  max_depth  subsample  mean_test_score  \
0            100           0.01          3        0.8        -0.509976   
1            100           0.01          3        0.9        -0.508042   
2            100           0.01          3        1.0        -0.508248   
3            200           0.01          3        0.8        -0.445118   
4            200           0.01          3        0.9        -0.444508   
..           ...            ...        ...        ...              ...   
76           200           0.10          5        0.9        -0.426793   
77           200           0.10          5        1.0        -0.419745   
78           300           0.10          5        0.8        -0.432215   
79           300           0.10          5        0.9        -0.430110   
80           300           0.10          5        1.0        -0.425384   

    std_test_score  
0         0.124313  
1         0.121538  
2         0.120981  
3         0.126739  
4     

# Rank the features by their importance in the tuned gradient-boosting model.
# GridSearchCV fits clones of `gbr`, so `gbr` itself is never fitted and
# reading `gbr.feature_importances_` raises NotFittedError. The model refitted
# on the whole training split with the best parameters is exposed as
# `grid_search.best_estimator_` (available because refit defaults to True).
# The name `coef` is kept for any later cell that reads it; the values are
# impurity-based importances, not linear coefficients.
coef = grid_search.best_estimator_.feature_importances_

# Pair each feature name with its importance, most important first.
# X.columns lines up with the importances because X_scaled, and therefore the
# training data, keeps the column order of X.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': coef})
    .sort_values(by='importance', ascending=False)
)

# Display the top features
print(feature_importance.head(10))