In [2]:
# Importing data
import pandas as pd

# Google Drive id of the CSV file and the URL built from it
file_id = '1DP3G49DWVUaRBkn7wssPi6CfY1pTmTQP'
url = 'https://drive.google.com/uc?id=' + file_id

# Read the remote CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Train/test splitting helper
from sklearn.model_selection import train_test_split

# Reload the original dataset from `url`
df = pd.read_csv(url)

# 'Borg' is the target column; every remaining column is used as a feature
X = df.drop(columns='Borg')
y = df['Borg']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Imports
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline




In [5]:
# Base estimator that SelectFromModel uses to score feature importances.
# It is intentionally left unfitted: with the default prefit=False,
# SelectFromModel clones and fits its estimator every time the pipeline is
# fitted (once per grid candidate and CV fold), so a fit performed here would
# be discarded and only cost extra training time.
# random_state is fixed so the stochastic parts of gradient boosting
# (row subsampling, feature subsampling) give repeatable results.
gbr = GradientBoostingRegressor(random_state=42)


# Creating pipeline with Feature Selection
# Step 1 keeps only features whose importance passes the threshold,
# step 2 fits the final regressor on the selected features.
regressor_model = Pipeline([
    ('feature_sel', SelectFromModel(gbr)),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Parameter grid
# Keys use the '<step name>__<parameter>' convention so GridSearchCV can route
# each value to the matching pipeline step.
params_grid = {

    # Feature Selection Parameter
    'feature_sel__threshold' : ["mean", "median"], # out of 3 runs, we choose mean

    # Regressor Parameter
    'regressor__n_estimators' : [100, 200], # High = better accuracy but overfits
    'regressor__max_features' : [0.5, 0.1], # 0.5, 'sqrt', 'log2', 0.1, 
    'regressor__min_samples_leaf' : [10, 30], # 10
    'regressor__max_depth' : [8, 30], # 8
    'regressor__subsample' : [0.7, 0.8], # 0.8
    'regressor__min_samples_split' : [20, 30]
}

In [6]:
from sklearn.model_selection import GridSearchCV

# Grid search over the pipeline: 3-fold cross-validation, 3 parallel jobs,
# candidates ranked by negative mean squared error.
gbr_CV = GridSearchCV(
    estimator=regressor_model,
    param_grid=params_grid,
    scoring='neg_mean_squared_error',
    n_jobs=3,
    cv=3,
)



In [7]:
# Run the cross-validated grid search on the training data
gbr_CV.fit(X_train, y_train)

# Printing Scores: best mean cross-validation score (negative MSE) and the
# parameter combination that achieved it
print('The best GridScore is', gbr_CV.best_score_)
print('The best parameters are', gbr_CV.best_params_)

# GridSearchCV refits by default (refit=True): after the search it has already
# retrained the pipeline with the best parameters on the whole training set.
# Reuse that fitted pipeline instead of paying for another full fit.
regressor_model = gbr_CV.best_estimator_

# Getting predictions on the held-out test set
y_pred = regressor_model.predict(X_test)

The best GridScore is -2.5025215173260253
The best parameters are {'feature_sel__threshold': 'mean', 'regressor__max_depth': 30, 'regressor__max_features': 0.5, 'regressor__min_samples_leaf': 30, 'regressor__min_samples_split': 20, 'regressor__n_estimators': 200, 'regressor__subsample': 0.8}


In [None]:
# Getting Metrics
# Pipeline.score delegates to the final regressor, whose score is R^2
training_score = regressor_model.score(X_train, y_train)
testing_score = regressor_model.score(X_test, y_test)
# MSE of the test-set predictions computed in the previous cell
meanSquaredError = mean_squared_error(y_test, y_pred)

# Labels are kept free of trailing punctuation/spaces so all three lines
# print in the same "<label> <value>" format
print("Training Score is", training_score)
print("Testing Score is", testing_score)
print("Mean Squared Error is", meanSquaredError)

# 300 minutes

### Gradient Boosting Regressor (GBR)
Gradient Boosting Regressor (GBR) is an ensemble machine learning technique. Ensemble learning combines multiple machine learning models to achieve better accuracy than a single model. GBR builds decision trees sequentially, where each new tree is trained on the residual errors of the trees before it in order to reduce the remaining error.

### Brief Introduction
In our parameters, `n_estimators` is the number of trees we add to the model. We started with 50 to 150 estimators, where increasing the number of trees only slightly increased accuracy. We set it to 200, which is where we saw the largest jump in accuracy. The training score is especially high while the testing score is lower, which indicates overfitting. The rest of the parameters therefore serve to reduce overfitting.

### Feature Selection
The feature selection step, `SelectFromModel()`, reduces dimensionality by keeping only the features whose importance scores, taken from a fitted model, meet the specified threshold.

### Hyperparameter Tuning
The parameters chosen serve to increase accuracy, although overfitting is an obvious issue here.

### GridSearch
We then perform a grid search with 3-fold cross-validation. Because our dataset is large, using only three folds keeps the computation time manageable.