In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys

# KNN决策树

KNN Decision Tree

In [None]:
sys.path.append(os.getcwd() + "/helperfunctions")

from preprocfunc import OutlierTrans

# display settings for this cell: narrow width, two-decimal floats
pd.set_option('display.width', 78)
pd.set_option('display.max_columns', 12)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.2f}'.format

# load the UN income gap data and index it by country
un_income_gap = pd.read_csv("data/un_income_gap.csv")
un_income_gap.set_index('country', inplace=True)

# derive female-to-male ratios; incomeratio is the target used below
un_income_gap['incomeratio'] = \
  un_income_gap.femaleincomepercapita / \
    un_income_gap.maleincomepercapita
un_income_gap['educratio'] = \
  un_income_gap.femaleyearseducation / \
     un_income_gap.maleyearseducation
un_income_gap['laborforcepartratio'] = \
  un_income_gap.femalelaborforceparticipation / \
     un_income_gap.malelaborforceparticipation
un_income_gap['humandevratio'] = \
  un_income_gap.femalehumandevelopment / \
     un_income_gap.malehumandevelopment

# drop rows with a missing target; dropna returns a new frame unless
# inplace=True is given, so the bare call left those rows in place
un_income_gap.dropna(subset=['incomeratio'], inplace=True)

# predictors considered for the income ratio model
num_cols = [
  'educratio', 'laborforcepartratio', 'humandevratio',
  'genderinequality', 'maternalmortaility',
  'adolescentbirthrate', 'femaleperparliament', 'incomepercapita',
]

# analysis frame: target column first, then the predictors
gap_sub = un_income_gap[['incomeratio'] + num_cols]

gap_sub.head()

# per-column count, min, median, and max, one row per variable
gap_sub.agg(['count', 'min', 'median', 'max']).T

# Pearson correlations among the target and predictors
corrmatrix = gap_sub.corr(method="pearson")
corrmatrix

# heatmap of the correlation matrix, labelled with the column names
sns.heatmap(
  corrmatrix,
  xticklabels=corrmatrix.columns,
  yticklabels=corrmatrix.columns,
  cmap="coolwarm")
plt.title('Heat Map of Correlation Matrix')
plt.tight_layout()
plt.show()
  
# hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
  gap_sub[num_cols], gap_sub[['incomeratio']],
  test_size=0.2, random_state=0)

# feature selection driven by a linear regression, with the threshold
# set to 0.8 times the mean importance
feature_sel = SelectFromModel(LinearRegression(), threshold="0.8*mean")

knnreg = KNeighborsRegressor()

# pipeline: OutlierTrans (project helper from preprocfunc) -> median
# imputation -> standardization -> feature selection -> KNN regressor
pipe1 = make_pipeline(
  OutlierTrans(3),
  SimpleImputer(strategy="median"),
  StandardScaler(),
  feature_sel,
  knnreg,
  memory=None)

# hyperparameters to sample: odd k from 3 to 19 and three distance metrics
knnreg_params = dict(
  kneighborsregressor__n_neighbors=np.arange(3, 21, 2),
  kneighborsregressor__metric=['euclidean','manhattan','minkowski'])

# randomized search: 20 sampled settings, 4-fold CV, scored by negative
# mean absolute error; error_score='raise' surfaces fit failures at once
rs = RandomizedSearchCV(
  pipe1, knnreg_params,
  n_iter=20, cv=4,
  scoring='neg_mean_absolute_error',
  random_state=1, error_score='raise')
rs.fit(X_train, y_train)

rs.best_params_
rs.best_score_

# boolean mask of the predictors kept by the fitted selection step
selected = rs.best_estimator_.named_steps['selectfrommodel'].get_support()
np.array(num_cols)[selected]

# the kept predictor names, asked of the selector directly
rs.best_estimator_.named_steps['selectfrommodel'].get_feature_names_out(
  np.array(num_cols))

# one row per sampled setting: mean CV score alongside its parameters,
# best score first
results = (
  pd.DataFrame(rs.cv_results_['mean_test_score'], columns=['meanscore'])
  .join(pd.DataFrame(rs.cv_results_['params']),
    how="left", on=None, validate="many_to_many")
  .sort_values(['meanscore'], ascending=False)
)

results.head(3).T

# predict the held-out rows with the best model from the search
pred = rs.predict(X_test)

# align predictions with the test features and the actual target
preddf = (
  pd.DataFrame(pred, columns=['prediction'], index=X_test.index)
  .join(X_test, how="left", on=None, validate="many_to_many")
  .join(y_test, how="left", on=None, validate="many_to_many")
)

# residual = actual minus predicted
preddf['resid'] = preddf['incomeratio'] - preddf['prediction']

preddf['resid'].agg(['mean','median','skew','kurtosis'])

# histogram of residuals with the mean marked by a dashed red line
plt.hist(preddf['resid'], color="blue", bins=5)
plt.axvline(preddf['resid'].mean(), color='red',
  linestyle='dashed', linewidth=1)
plt.title("Histogram of Residuals for Income Ratio Model")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.xlim()
plt.show()

# residuals against predictions, with a dashed reference line at zero
plt.scatter(preddf['prediction'], preddf['resid'], color="blue")
plt.axhline(0, color='red', linestyle='dashed', linewidth=1)
plt.title("Scatterplot of Predictions and Residuals")
plt.xlabel("Predicted Income Ratio")
plt.ylabel("Residuals")
plt.show()

# rows whose absolute residual is at least 0.1
preddf.loc[
  preddf['resid'].abs() >= 0.1,
  ['incomeratio','prediction','resid','laborforcepartratio',
   'humandevratio']
].T


## 决策树回归

decision tree regression

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

sys.path.append(os.getcwd() + "/helperfunctions")
from preprocfunc import OutlierTrans

# display settings for this cell: wide output, two-decimal floats
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:,.2f}'.format)

# load the UN income gap data and index it by country
un_income_gap = pd.read_csv("data/un_income_gap.csv").set_index('country')

# derive female-to-male ratios; incomeratio is the target used below
un_income_gap['incomeratio'] = (
  un_income_gap['femaleincomepercapita']
  / un_income_gap['maleincomepercapita'])
un_income_gap['educratio'] = (
  un_income_gap['femaleyearseducation']
  / un_income_gap['maleyearseducation'])
un_income_gap['laborforcepartratio'] = (
  un_income_gap['femalelaborforceparticipation']
  / un_income_gap['malelaborforceparticipation'])
un_income_gap['humandevratio'] = (
  un_income_gap['femalehumandevelopment']
  / un_income_gap['malehumandevelopment'])

# keep only rows where the target could be computed
un_income_gap = un_income_gap.dropna(subset=['incomeratio'])


# predictors considered for the income ratio model
num_cols = [
  'educratio', 'laborforcepartratio', 'humandevratio',
  'genderinequality', 'maternalmortaility',
  'adolescentbirthrate', 'femaleperparliament', 'incomepercapita',
]

# analysis frame: target column first, then the predictors
gap_sub = un_income_gap[['incomeratio'] + num_cols]

# hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
  gap_sub[num_cols], gap_sub[['incomeratio']],
  test_size=0.2, random_state=0)

# fit a small example decision tree (depth 3, at least 5 samples per
# leaf) so that its structure can be plotted
dtreg_example = DecisionTreeRegressor(min_samples_leaf=5,
  max_depth=3)

# the example tree is fit outside a model pipeline, so the training
# features are preprocessed first: OutlierTrans (project helper from
# preprocfunc), then median imputation
pipe0 = make_pipeline(OutlierTrans(3),
  SimpleImputer(strategy="median"))

X_train_imp = pipe0.fit_transform(X_train)

dtreg_example.fit(X_train_imp, y_train)

# pass the feature names as a plain list rather than a pandas Index,
# which is the type accepted across scikit-learn versions
plot_tree(dtreg_example, feature_names=list(X_train.columns),
  label="root", fontsize=10)
# render the tree as its own figure; without this the next plt.* call
# in this cell would draw onto the same axes as the tree
plt.show()

# feature selection driven by a linear regression, with the threshold
# set to 0.8 times the mean importance
feature_sel = SelectFromModel(LinearRegression(), threshold="0.8*mean")

# decision tree regressor whose depth and leaf size are tuned below
dtreg = DecisionTreeRegressor()

# pipeline: OutlierTrans (project helper) -> median imputation ->
# feature selection -> decision tree
pipe1 = make_pipeline(
  OutlierTrans(3),
  SimpleImputer(strategy="median"),
  feature_sel,
  dtreg)

# hyperparameters to sample: max depth 2..19, min samples per leaf 5..10
dtreg_params = dict(
  decisiontreeregressor__max_depth=np.arange(2, 20),
  decisiontreeregressor__min_samples_leaf=np.arange(5, 11))

# randomized search: 20 sampled settings, 4-fold CV, scored by negative
# mean absolute error
rs = RandomizedSearchCV(
  pipe1, dtreg_params,
  n_iter=20, cv=4,
  scoring='neg_mean_absolute_error',
  random_state=1)
# the target is passed as a flat 1-D array
rs.fit(X_train, y_train.values.ravel())

rs.best_params_
rs.best_score_

# construct a random forest model; seeded so that the forest, and hence
# the search result, is repeatable from run to run
rfreg = RandomForestRegressor(random_state=0)

# hyperparameters to sample. max_features='auto' was removed from
# scikit-learn; for a regressor it meant "use all features", which is
# spelled 1.0 (a fraction of the features) in current versions.
rfreg_params = {
 'randomforestregressor__max_depth': np.arange(2, 20),
 'randomforestregressor__max_features': [1.0, 'sqrt'],
 'randomforestregressor__min_samples_leaf':  np.arange(5, 11)
}

# pipeline: OutlierTrans (project helper) -> median imputation ->
# feature selection (feature_sel, defined earlier in this cell) -> forest
pipe2 = make_pipeline(OutlierTrans(3), 
  SimpleImputer(strategy="median"),
  feature_sel, rfreg)

# randomized search: 20 sampled settings, 4-fold CV, scored by negative
# mean absolute error; this rebinds rs from the decision tree search
rs = RandomizedSearchCV(pipe2, rfreg_params, cv=4, n_iter=20,
  scoring='neg_mean_absolute_error', random_state=1)
rs.fit(X_train, y_train.values.ravel())

rs.best_params_
rs.best_score_

# predict the held-out rows with the best model from the latest search
pred = rs.predict(X_test)

# align predictions with the test features and the actual target
preddf = (
  pd.DataFrame(pred, columns=['prediction'], index=X_test.index)
  .join(X_test)
  .join(y_test, how="left", on=None, validate="many_to_many")
)

# residual = actual minus predicted
preddf['resid'] = preddf['incomeratio'] - preddf['prediction']

# histogram of residuals with the mean marked by a dashed red line
plt.hist(preddf['resid'], color="blue", bins=5)
plt.axvline(preddf['resid'].mean(), color='red',
  linestyle='dashed', linewidth=1)
plt.title("Histogram of Residuals for Income Ratio")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.xlim()
plt.show()

# residuals against predictions, with a dashed reference line at zero
plt.scatter(preddf['prediction'], preddf['resid'], color="blue")
plt.axhline(0, color='red', linestyle='dashed', linewidth=1)
plt.title("Scatterplot of Predictions and Residuals")
plt.xlabel("Predicted Income Ratio")
plt.ylabel("Residuals")
plt.show()

# rows whose absolute residual is at least 0.12
preddf.loc[
  preddf['resid'].abs() >= 0.12,
  ['incomeratio','prediction','resid',
   'laborforcepartratio', 'humandevratio']
].T

## 梯度增强决策树

gradient boosted decision tree

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from scipy.stats import randint
from scipy.stats import uniform

sys.path.append(os.getcwd() + "/helperfunctions")
from preprocfunc import OutlierTrans


# display settings for this cell: wide output, two-decimal floats
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:,.2f}'.format)

# load the housing data (kc_house_data.csv) and index it by id
housing = pd.read_csv("data/kc_house_data.csv").set_index('id')

# numeric predictors and the single categorical predictor
num_cols = [
  'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
  'floors', 'view', 'condition', 'sqft_above', 'sqft_basement',
  'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15',
]
cat_cols = ['waterfront']

# first three rows of price plus all predictors, one variable per row
housing[['price'] + num_cols + cat_cols].head(3).T

# count, min, median, and max for price and the numeric predictors
housing[['price'] + num_cols].agg(['count', 'min', 'median', 'max']).T
  

# distribution of price, expressed in thousands
plt.hist(housing['price'] / 1000)
plt.title("Housing Price (in thousands)")
plt.xlabel('Price')
plt.ylabel("Frequency")
plt.show()

# natural log of price; this is the column modeled later in the cell
housing['price_log'] = np.log(housing['price'])

# distribution of the log-transformed price
plt.hist(housing['price_log'])
plt.title("Housing Price Log")
plt.xlabel('Price Log')
plt.ylabel("Frequency")
plt.show()

# compare kurtosis and skew of the raw and log-transformed price
housing[['price', 'price_log']].agg(['kurtosis', 'skew'])

# Pearson correlations among log price and the numeric predictors
corrmatrix = housing[['price_log'] + num_cols].corr(method="pearson")

# heatmap of the correlation matrix, labelled with the column names
sns.heatmap(
  corrmatrix,
  xticklabels=corrmatrix.columns,
  yticklabels=corrmatrix.columns,
  cmap="coolwarm")
plt.title('Heat Map of Correlation Matrix')
plt.tight_layout()
plt.show()


# target (log price) and feature frame (numeric plus categorical columns)
target = housing[['price_log']]
features = housing[num_cols + cat_cols]

# hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
  features, target, test_size=0.2, random_state=0)

# numeric columns: OutlierTrans (project helper from preprocfunc), then
# median imputation, then min-max scaling
standtrans = make_pipeline(
  OutlierTrans(2),
  SimpleImputer(strategy="median"),
  MinMaxScaler())

# categorical columns: dense one-hot encoding, dropping the first level
ohe = OneHotEncoder(drop='first', sparse_output=False)
cattrans = make_pipeline(ohe)

# route each column group through its own transformer
coltrans = ColumnTransformer(transformers=[
  ("stand", standtrans, num_cols),
  ("cat", cattrans, cat_cols),
])

# feature selection driven by a linear regression, with the threshold
# set to 0.6 times the mean importance
feature_sel = SelectFromModel(LinearRegression(), threshold="0.6*mean")

# seeded gradient boosted regressor
gbr = GradientBoostingRegressor(random_state=0)

# distributions to sample the hyperparameters from
gbr_params = {
  'gradientboostingregressor__learning_rate':
    uniform(loc=0.01, scale=0.5),
  'gradientboostingregressor__n_estimators':
    randint(500, 2000),
  'gradientboostingregressor__max_depth':
    randint(2, 20),
  'gradientboostingregressor__min_samples_leaf':
    randint(5, 11),
}

# pipeline: column transformations -> feature selection -> boosting
pipe1 = make_pipeline(coltrans, feature_sel, gbr)

# randomized search: 20 sampled settings, 5-fold CV, scored by negative
# mean squared error, using all available cores
rs1 = RandomizedSearchCV(
  pipe1, gbr_params,
  n_iter=20, cv=5,
  scoring='neg_mean_squared_error',
  n_jobs=-1, random_state=0)

# the target is passed as a flat 1-D array
rs1.fit(X_train, y_train.values.ravel())

rs1.best_params_
rs1.best_score_

# mean of the held-out target, for scale when reading the score
y_test.mean()

# average fit and score time across the sampled settings
print("fit time: %.3f, score time: %.3f" % (
  np.mean(rs1.cv_results_['mean_fit_time']),
  np.mean(rs1.cv_results_['mean_score_time'])))

# XGBoost regressor with library defaults; tuned by the search below
xgb = XGBRegressor()

# distributions to sample the hyperparameters from
xgb_params = {
  'xgbregressor__learning_rate': uniform(loc=0.01, scale=0.5),
  'xgbregressor__n_estimators': randint(500, 2000),
  'xgbregressor__max_depth': randint(2, 20),
}

# same preprocessing and feature selection as the gradient boosting
# pipeline above, with XGBoost as the final estimator
pipe2 = make_pipeline(coltrans, feature_sel, xgb)

# randomized search: 20 sampled settings, 5-fold CV, scored by negative
# mean squared error, using all available cores
rs2 = RandomizedSearchCV(
  pipe2, xgb_params,
  n_iter=20, cv=5,
  scoring='neg_mean_squared_error',
  n_jobs=-1, random_state=0)

# the target is passed as a flat 1-D array
rs2.fit(X_train, y_train.values.ravel())

rs2.best_params_
rs2.best_score_

# average fit and score time across the sampled settings
print("fit time: %.3f, score time: %.3f" % (
  np.mean(rs2.cv_results_['mean_fit_time']),
  np.mean(rs2.cv_results_['mean_score_time'])))

## 梯度提升决策树

gradient boosted decision tree

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor

from scipy.stats import randint
from scipy.stats import uniform

sys.path.append(os.getcwd() + "/helperfunctions")
from preprocfunc import OutlierTrans

# display settings for this cell: wide output, two-decimal floats
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:,.2f}'.format)

# load the fossil fuel tax rate data and index it by country code
fftaxrate14 = pd.read_csv("data/fossilfueltaxrate14.csv").set_index('countrycode')

# column names, dtypes, and non-null counts
fftaxrate14.info()

# column groups used to build the feature set: numeric predictors,
# indicator (dummy) predictors, and one column preprocessed separately
num_cols = ['fuel_income_dependence','national_income_per_cap',
  'VAT_Rate',  'gov_debt_per_gdp','polity','goveffect',
  'democracy_index']
dummy_cols = ['democracy_polity','autocracy_polity','democracy',
  'nat_oil_comp','nat_oil_comp_state']
spec_cols = ['motorization_rate']

# generate some summary statistics for the target and numeric columns
fftaxrate14[['gas_tax_imp'] + num_cols + spec_cols].\
  agg(['count','min','median','max']).T

# relative frequency of each value in every dummy column; the top-level
# pd.value_counts function is deprecated, so the Series method is
# applied column by column instead
fftaxrate14[dummy_cols].apply(pd.Series.value_counts, normalize=True).T

# target (gas_tax_imp) and feature frame built from the three groups
target = fftaxrate14[['gas_tax_imp']]
features = fftaxrate14[num_cols + dummy_cols + spec_cols]

# hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
  features, target, test_size=0.2, random_state=0)

# numeric columns: OutlierTrans (project helper from preprocfunc), then
# median imputation
standtrans = make_pipeline(
  OutlierTrans(2),
  SimpleImputer(strategy="median"))
# dummy columns: fill missing values with the most frequent value
cattrans = make_pipeline(SimpleImputer(strategy="most_frequent"))
# motorization_rate: OutlierTrans only, with no imputation step
spectrans = make_pipeline(OutlierTrans(2))

# route each column group through its own transformer
coltrans = ColumnTransformer(transformers=[
  ("stand", standtrans, num_cols),
  ("cat", cattrans, dummy_cols),
  ("spec", spectrans, spec_cols),
])

# feature selection driven by a linear regression, with the threshold
# set to 0.8 times the mean importance
feature_sel = SelectFromModel(LinearRegression(), threshold="0.8*mean")

# seeded gradient boosted regressor
gbr = GradientBoostingRegressor(random_state=0)

# hyperparameters to sample: two continuous/integer distributions and
# two explicit integer ranges
gbr_params = {
  'gradientboostingregressor__learning_rate':
    uniform(loc=0.1, scale=0.5),
  'gradientboostingregressor__n_estimators':
    randint(100, 1000),
  'gradientboostingregressor__max_depth':
    np.arange(2, 20),
  'gradientboostingregressor__min_samples_leaf':
    np.arange(5, 11),
}

# pipeline: OutlierTrans (project helper) -> median imputation ->
# feature selection -> boosting, applied to every feature column alike
# NOTE(review): the coltrans ColumnTransformer built earlier in this
# cell is not used here; confirm whether it was meant to replace the
# first two steps
pipe1 = make_pipeline(
  OutlierTrans(3),
  SimpleImputer(strategy="median"),
  feature_sel,
  gbr)

# randomized search: 20 sampled settings, 4-fold CV, scored by negative
# mean absolute error
rs = RandomizedSearchCV(
  pipe1, gbr_params,
  n_iter=20, cv=4,
  scoring='neg_mean_absolute_error',
  random_state=1)
# the target is passed as a flat 1-D array
rs.fit(X_train, y_train.values.ravel())

rs.best_params_
rs.best_score_

# get predictions for the held-out rows from the best model
pred = rs.predict(X_test)


# align predictions with the test features and the actual target
preddf = pd.DataFrame(pred, columns=['prediction'],
  index=X_test.index).join(X_test).join(y_test, how="left", on=None, validate="many_to_many")

# residual = actual minus predicted; the target in this cell is
# gas_tax_imp (there is no incomeratio column in this data)
preddf['resid'] = preddf.gas_tax_imp-preddf.prediction


# histogram of residuals with the mean marked by a dashed red line
plt.hist(preddf.resid, color="blue", bins=5)
plt.axvline(preddf.resid.mean(), color='red', linestyle='dashed', linewidth=1)
plt.title("Histogram of Residuals for Gas Tax")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

# residuals against predictions, with a dashed reference line at zero
plt.scatter(preddf.prediction, preddf.resid, color="blue")
plt.axhline(0, color='red', linestyle='dashed', linewidth=1)
plt.title("Scatterplot of Predictions and Residuals")
plt.xlabel("Predicted Gas Tax")
plt.ylabel("Residuals")
plt.show()

# rows with the largest residuals, shown with columns that exist in
# this data set
# NOTE(review): the 0.12 cut-off is unchanged from the original cell;
# confirm it suits the scale of gas_tax_imp
preddf.loc[np.abs(preddf.resid)>=0.12,
  ['gas_tax_imp','prediction','resid',
  'motorization_rate', 'national_income_per_cap']].T