In [1]:
# Imports
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Define statistical accuracy display functions

def printMAPE(test_features, test_label, model):
    """Print the mean absolute error and mean absolute percentage error of
    `model` on a held-out set.

    Parameters
    ----------
    test_features : array-like
        Feature matrix passed straight to ``model.predict``.
    test_label : array-like
        True target values, aligned with the rows of ``test_features``.
    model : object
        Any fitted estimator exposing ``predict``.

    Notes
    -----
    The percentage error divides by ``test_label``; a label equal to zero
    yields ``inf``/``nan`` in the reported percentage.
    """
    predictions = model.predict(test_features)
    errors = abs(predictions - test_label)
    # The original line was missing the comma between the label and the value,
    # which made the whole cell a SyntaxError.
    print("Mean Absolute Error: ", np.mean(errors))
    pct_errors = 100 * (errors / test_label)
    print("Mean Absolute Percentage Error: ", np.mean(pct_errors))

def printMAPE_train(train_features, train_label, model):
    """Print the training-set mean absolute error and mean absolute
    percentage error of `model`.

    `train_features` is handed to ``model.predict``; the result is compared
    element-wise against `train_label`. Nothing is returned.
    """
    fitted_values = model.predict(train_features)
    abs_residuals = abs(fitted_values - train_label)
    mae = np.mean(abs_residuals)
    print("Training Mean Absolute Error: ", mae)
    # Percentage error is relative to the true label of each row.
    mape = np.mean(100 * (abs_residuals / train_label))
    print("Training Mean Absolute Percentage Error: ", mape)

def getEnsembleTreeVars(ensTree, varNames):
    """Return the names of the features whose importance is above average.

    Parameters
    ----------
    ensTree : object
        Fitted estimator exposing ``feature_importances_``.
    varNames : sequence of str
        Feature names, indexed in the same order as
        ``ensTree.feature_importances_``.

    Returns
    -------
    list of str
        Names of the features whose importance is strictly greater than the
        mean importance, ordered from most to least important. Empty when the
        estimator reports no importances.
    """
    importance = np.asarray(ensTree.feature_importances_, dtype=float)
    if importance.size == 0:
        return []

    # Hoisted out of the loop: the threshold is the same for every feature.
    threshold = importance.mean()

    # Sort by descending importance directly. The original sorted the name
    # strings with an un-imported `itemgetter(1)`, which raised NameError and
    # would otherwise have ordered by each name's second character.
    # A stable sort keeps tied features in their original column order.
    order = np.argsort(-importance, kind="stable")
    return [varNames[i] for i in order if importance[i] > threshold]

In [None]:
# Set up train-test split on dataset

# Read the dataset and separate the target column from the predictors.
df = pd.read_csv("dataset_schools_avg.csv")
labels = np.array(df["Sparta score"])

# "School District" is dropped alongside the target before modelling.
features = df.drop(columns = ["Sparta score", "School District"])

# Remember the column names before the frame is converted to a bare array.
feature_list = [column for column in features.columns]
features = np.array(features)

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Run random forest regressor with 1,000 decision trees on training data

# `fit` returns the estimator itself, so construction and fitting can be chained.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42).fit(X_train, y_train)

# Report held-out error first, then training error, for comparison.
printMAPE(X_test, y_test, rf)
printMAPE_train(X_train, y_train, rf)

In [None]:
# Features of the full-feature forest whose importance is above average.
rf_importantvars = getEnsembleTreeVars(rf, feature_list)
print(rf_importantvars)

# Second model: same forest configuration, trained on a reduced column set.
# The reduced data lives under its own names so that `features` and
# `feature_list` keep describing the full-feature model; overwriting them made
# this cell give wrong names when re-run, because `rf` was fitted on the full
# column set.
columns_dropped_bestvars = [
    "Sparta score",
    "School District",
    "% White",
    "% Speaks English only",
    "% Below poverty line",
    "Median Household Income of Parents",
    "% Married-couple families",
    "% Food Stamp benefits",
    "% Health Insurance Coverage",
]
labels_bestvars = np.array(df["Sparta score"])
features_bestvars = df.drop(columns_dropped_bestvars, axis = 1)

feature_list_bestvars = list(features_bestvars.columns)
features_bestvars_array = np.array(features_bestvars)

# Same test_size and random_state as the full-feature split.
X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(
    features_bestvars_array, labels_bestvars, test_size = 0.2, random_state = 42
)

rf_bestvars = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf_bestvars = rf_bestvars.fit(X_train_best, y_train_best)
printMAPE(X_test_best, y_test_best, rf_bestvars)
printMAPE_train(X_train_best, y_train_best, rf_bestvars)

In [None]:
# Gradient boosting attempt

from sklearn import ensemble

# Gradient boosting hyperparameters. `random_state` is fixed so the run is
# reproducible, matching the seeded random forests in this notebook.
params = {
    "n_estimators": 1000,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    "random_state": 42,
}

gbr = ensemble.GradientBoostingRegressor(**params)
gbr.fit(X_train, y_train)

printMAPE(X_test, y_test, gbr)
printMAPE_train(X_train, y_train, gbr)

# Test-set loss after each boosting stage. `staged_predict` yields one
# prediction array per stage.
# The estimator's `loss_` attribute was deprecated in scikit-learn 1.1 and
# removed in 1.3, so the squared-error loss is computed directly instead.
test_score = np.zeros((params["n_estimators"],), dtype = np.float64)
for i, y_pred in enumerate(gbr.staged_predict(X_test)):
    test_score[i] = np.mean((y_test - y_pred) ** 2)

In [None]:
# Hyperparameter tuning
# Show the baseline forest's current settings. A bare expression in the middle
# of a cell is not displayed, so it is printed explicitly.
print(rf.get_params())

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1100, num = 10)]

# Number of features to consider at every split.
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for these
# regressors it meant "use all features", which is what None selects.
max_features = [None, "sqrt"]

# Maximum tree depth (None lets trees grow until the other limits stop them)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at leaf node
min_samples_leaf = [1, 2, 4]

# Method for selecting training samples
bootstrap = [True, False]

# Search space for the random forest models
grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap
}

# Search space for gradient boosting: same as above without `bootstrap`,
# which GradientBoostingRegressor does not accept.
grid_gbr = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Tuning model 1 (full feature set)
# RandomizedSearchCV takes `param_distributions`; passing `param_grid` raises
# TypeError. A fixed random_state makes the sampled candidates reproducible.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = grid, n_jobs = -1, cv = 2, verbose = 2, random_state = 42)
rf_random.fit(X_train, y_train)
print(rf_random.best_estimator_.get_params())

# Exhaustive search over the same space.
rf_grid = GridSearchCV(estimator = rf, param_grid = grid, n_jobs = -1, cv = 2, verbose = 2)
rf_grid.fit(X_train, y_train)
print(rf_grid.best_estimator_.get_params())

# Tuning model 2 (reduced feature set)
rf_random_best = RandomizedSearchCV(estimator = rf_bestvars, param_distributions = grid, n_jobs = -1, cv = 2, verbose = 2, random_state = 42)
rf_random_best.fit(X_train_best, y_train_best)
print(rf_random_best.best_estimator_.get_params())

rf_grid_best = GridSearchCV(estimator = rf_bestvars, param_grid = grid, n_jobs = -1, cv = 2, verbose = 2)
# Fit the search that was just created. The original fitted `rf_grid` here,
# leaving `rf_grid_best` unfitted and overwriting the model-1 search.
rf_grid_best.fit(X_train_best, y_train_best)
print(rf_grid_best.best_estimator_.get_params())

# Tuning the gradient boosting model, then reporting test and training error
# of the best configuration found.
gbr_grid = GridSearchCV(estimator = gbr, param_grid = grid_gbr, n_jobs = -1, cv = 2, verbose = 2)
gbr_grid.fit(X_train, y_train)
printMAPE(X_test, y_test, gbr_grid.best_estimator_)
printMAPE_train(X_train, y_train, gbr_grid.best_estimator_)