In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("../data/blocks_clustered.csv")
df['cluster'] = df['cluster'].astype('category')
train_set, test_set = train_test_split(df, test_size = 0.2, random_state=42)
X_train = train_set.loc[:, ['median_age', 'total_pop', 'white_pct', 'median_income', 'college_degree_pct', 'unemploy_pct']]
y_train = train_set["cluster"]
X_test = test_set.loc[:, ['median_age', 'total_pop', 'white_pct', 'median_income', 'college_degree_pct', 'unemploy_pct']]
y_test = test_set["cluster"]

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5554 entries, 0 to 5553
Data columns (total 18 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   cluster                            5554 non-null   category
 1   geoid                              5554 non-null   int64   
 2   name                               5554 non-null   object  
 3   median_age                         5554 non-null   float64 
 4   white_pct                          5554 non-null   float64 
 5   total_pop                          5554 non-null   int64   
 6   median_income                      5554 non-null   float64 
 7   college_degree_pct                 5554 non-null   float64 
 8   unemploy_pct                       5554 non-null   float64 
 9   latitude                           5554 non-null   float64 
 10  longitude                          5554 non-null   float64 
 11  geometry                           5554 non

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
#         ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),  # refer to https://stackoverflow.com/questions/54836051/pipeline-ordinalencoder-valueerror-found-unknown-categories
])

from sklearn.compose import ColumnTransformer

num_attribs = list(X_train.columns[X_train.dtypes != "object"])
cat_attribs = list(X_train.columns[X_train.dtypes == "object"])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

X = full_pipeline.fit_transform(X_train)

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = [
    {'n_estimators': [120, 150, 200, 250, 300], 'max_features': [1, 2, 3, 4]},
  ]

forest_reg = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X, y_train)

In [23]:
final_model = grid_search.best_estimator_
grid_search.best_params_

{'max_features': 1, 'n_estimators': 150}

In [26]:
pd.Series(final_model.feature_importances_, 
          index = ['median_age', 'total_pop', 'white_pct', 'median_income', 'college_degree_pct', 'unemploy_pct'])

median_age            0.394264
total_pop             0.115868
white_pct             0.148817
median_income         0.119345
college_degree_pct    0.130212
unemploy_pct          0.091493
dtype: float64

In [28]:
cvres = pd.DataFrame(grid_search.cv_results_)
print("5-fold accuracy of tuned model is: " + str( min(cvres["mean_test_score"]) ) \
      + " with parameters " + str(grid_search.best_params_))

5-fold accuracy of tuned model is: 0.702001439009313 with parameters {'max_features': 1, 'n_estimators': 150}


In [30]:
from sklearn.metrics import mean_squared_error

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

test_accuracy = (y_test == final_predictions).sum() / len(y_test)

print("The final RMSE in (self-made) testing set is: " + str( test_accuracy ))

The final RMSE in (self-made) testing set is: 0.711971197119712
