In [48]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from numpy import set_printoptions
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder

In [76]:
# Load the Ames housing data from the CSV file in the working directory.
ames_data = pd.read_csv("AmesHousing.csv")
# Restrict to numeric data for now to keep things simple. By default
# describe() reports on the numeric columns of a mixed-type frame, so its
# column index is used here as the list of numeric column names.
numerical_columns = ames_data.describe().columns
# Keep just those columns and discard every row with a missing value.
numerical_ames_data = ames_data.loc[:, numerical_columns].dropna()

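The next three cells are exploratory preprocessing on the full dataset (dropping columns with more than 1000 missing values, then dropping rows with any remaining missing values). They are not a big deal for now: the modelling below does not use their result yet.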

In [105]:
# Per-column count of missing values (despite the name, this is a Series of
# counts indexed by column name, not a boolean mask).
is_na_mask = ames_data.isnull().sum(axis=0)
# Names of the columns that have more than 1000 missing entries.
missing_columns = is_na_mask.loc[is_na_mask.gt(1000)].index

In [108]:
# Remove the columns flagged as mostly missing, mutating ames_data in place.
# errors="ignore" makes the cell safe to re-run: without it a second run raises
# KeyError because the columns have already been dropped.
ames_data.drop(columns=missing_columns, errors="ignore", inplace=True)

In [110]:
# Discard, in place, every remaining row that has at least one missing value.
ames_data.dropna(axis=0, how="any", inplace=True)

In [3]:
# Every numeric column except the target is a candidate feature. The target is
# excluded by name rather than by "everything but the last column", so it can
# never leak into the features if SalePrice is not the last numeric column.
# When it is the last one, this gives the same columns as the positional slice.
feature_columns = numerical_columns.drop("SalePrice")
# Feature matrix: one row per house, one column per entry in feature_columns.
feature_vectors = numerical_ames_data[feature_columns].to_numpy()
# Regression target (sale price), aligned row-for-row with feature_vectors.
class_labels = numerical_ames_data["SalePrice"].to_numpy()

Univariate selection (`SelectKBest` with `f_regression`) is used for the feature-selection step, following:
https://machinelearningmastery.com/feature-selection-machine-learning-python/

In [40]:
# Univariate selection: score each feature against the target with
# f_regression and keep the 15 highest-scoring ones.
# NOTE(review): the selector is fitted on all rows, including rows that are
# later held out for evaluation -- confirm this is acceptable.
test = SelectKBest(f_regression, k=15)
# fit() returns the fitted selector itself, so `fit` and `test` are the same object.
fit = test.fit(X=feature_vectors, y=class_labels)

In [41]:
# Reduce the feature matrix to the columns kept by the fitted selector.
features = fit.transform(X=feature_vectors)

In [42]:
# get_support(indices=True) returns positions within the columns of
# feature_vectors, so they must be looked up in feature_columns (the columns
# feature_vectors was built from), not in the full frame's column list, which
# also contains the target.
univariate_best_features = list(feature_columns[fit.get_support(indices=True)])

To cross-check the univariate ranking, we also rank the features by their importance in a random forest and keep the top ones.

In [134]:
#initial split
# Initial split: hold out 25% of the rows. The fixed random_state makes the
# split (and therefore the feature ranking computed from it) reproducible
# across kernel restarts.
total_x_train, total_x_test, total_y_train, total_y_test = train_test_split(
    feature_vectors, class_labels, test_size=0.25, random_state=0
)

In [135]:
# Fit a seeded random forest on the training part of the initial split and use
# its impurity-based importances to rank the features.
feat_select_regr = RandomForestRegressor(n_estimators=100, random_state=0)
feat_select_regr.fit(total_x_train, total_y_train)
# Pair each importance with the name of the column it was computed for.
# feature_columns is the column list the feature matrix was built from;
# numerical_columns also contains the target and only lined up because zip()
# silently dropped its extra trailing entry.
important_features = sorted(
    zip(feature_columns, feat_select_regr.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,  # most important first
)
# Names of the 15 most important features.
random_f_best_features = [name for name, _importance in important_features[:15]]

The final feature set is the intersection of the features selected by the random forest method and those selected by the univariate selection method.

In [136]:

# Features chosen by BOTH selection methods. The intersection is built in the
# order of univariate_best_features instead of via set(), because set iteration
# order for strings can change between interpreter sessions; that would reorder
# the columns of the training frame and make the seeded forest irreproducible.
total_best_features = [
    feature for feature in univariate_best_features if feature in random_f_best_features
]

Now, onto the Random Forest Prediction!

In [137]:
# Keep only the jointly selected feature columns (all rows) for the final model.
best_features_df = numerical_ames_data.loc[:, total_best_features]

In [143]:
# Train/test split for the final model: 25% of the rows are held out. The
# fixed random_state makes the split, and the metrics computed from it,
# reproducible.
# NOTE(review): unless this split selects the same rows as the split used for
# the random-forest feature ranking, rows that influenced feature selection
# can end up in X_test -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    best_features_df, class_labels, test_size=0.25, random_state=0
)

In [144]:
# Final model: a seeded 1000-tree random forest trained on the selected
# features; fit() returns the fitted estimator, so it can be chained.
regr = RandomForestRegressor(random_state=0, n_estimators=1000).fit(X_train, y_train)
# Predicted sale prices for the held-out rows.
y_pred = regr.predict(X_test)

In [145]:
# Evaluate the held-out predictions.
# R^2: coefficient of determination between true and predicted values.
r_squared_value = metrics.r2_score(y_true=y_test, y_pred=y_pred)
# RMSE: square root of the mean squared error.
root_mean_squared_error = np.sqrt(
    metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
)