In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import RFE
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
import sklearn.metrics as skmet
import matplotlib.pyplot as plt
import os
import sys

# 线性回归

Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Make the local helper directory importable, then pull in the custom
# outlier transformer that the preprocessing pipelines rely on.
sys.path.append(os.getcwd() + "/helperfunctions")
from preprocfunc import OutlierTrans

# pandas display settings for this section
for option_name, option_value in (
    ('display.width', 200),
    ('display.max_columns', 25),
    ('display.max_rows', 200),
    ('display.float_format', '{:,.2f}'.format),
):
    pd.set_option(option_name, option_value)

# load the fossil fuel tax data, indexed by country code, and summarize it
fftaxrate14 = pd.read_csv("data/fossilfueltaxrate14.csv").set_index('countrycode')
fftaxrate14.info()

# set up the features and target
# num_cols   -> go through the "stand" pipeline of the column transformer
# dummy_cols -> go through the "cat" pipeline
# spec_cols  -> go through the "spec" pipeline
num_cols = ['fuel_income_dependence','national_income_per_cap',
  'VAT_Rate',  'gov_debt_per_gdp','polity','goveffect',
  'democracy_index']
dummy_cols = ['democracy_polity','autocracy_polity','democracy',
  'nat_oil_comp','nat_oil_comp_state']
spec_cols = ['motorization_rate']

# generate some summary statistics
fftaxrate14[['gas_tax_imp'] + num_cols + spec_cols].\
  agg(['count','min','median','max']).T
# The top-level pd.value_counts function is deprecated in recent pandas
# releases; call the Series method on each column instead.
fftaxrate14[dummy_cols].\
  apply(lambda col: col.value_counts(normalize=True)).T

# The target is kept as a one-column DataFrame (double brackets).
target = fftaxrate14[['gas_tax_imp']]
features = fftaxrate14[num_cols + dummy_cols + spec_cols]

# hold out 20% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test =  \
  train_test_split(features,\
  target, test_size=0.2, random_state=0)
      
# Column-group preprocessing:
#   numeric columns -> OutlierTrans(2), median imputation, standard scaling
#   dummy columns   -> most-frequent imputation
#   special columns -> OutlierTrans(2), standard scaling (no imputer here)
standtrans = make_pipeline(
    OutlierTrans(2),
    SimpleImputer(strategy="median"),
    StandardScaler(),
)
cattrans = make_pipeline(SimpleImputer(strategy="most_frequent"))
spectrans = make_pipeline(OutlierTrans(2), StandardScaler())

# (name, transformer, columns) triples, applied in this order
column_steps = [
    ("stand", standtrans, num_cols),
    ("cat", cattrans, dummy_cols),
    ("spec", spectrans, spec_cols),
]
coltrans = ColumnTransformer(transformers=column_steps)

# Build the model: column preprocessing -> KNN imputation -> recursive feature
# elimination (keeping 7 features) -> linear regression.
lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=7)
pipe1 = make_pipeline(coltrans, KNNImputer(n_neighbors=5), rfe, lr)

# The target is standardized for fitting via TransformedTargetRegressor.
ttr = TransformedTargetRegressor(
    regressor=pipe1,
    transformer=StandardScaler(),
)
ttr.fit(X_train, y_train)

# Pair each fitted coefficient with the name of the feature RFE retained.
fitted_steps = ttr.regressor_.named_steps
selcols = X_train.columns[fitted_steps['rfe'].support_]
coefs = fitted_steps['linearregression'].coef_
np.column_stack((coefs.ravel(), selcols))

# Predict on the held-out rows and assemble predictions, features, and the
# actual target in one frame aligned on the test index.
pred = ttr.predict(X_test)

prediction_frame = pd.DataFrame(pred, columns=['prediction'], index=X_test.index)
preddf = prediction_frame.join(X_test).join(y_test)

# residual = actual - predicted
preddf['resid'] = preddf['gas_tax_imp'] - preddf['prediction']

preddf['resid'].agg(['mean','median','skew','kurtosis'])

# summary model evaluation statistics on the test set
test_mae = skmet.mean_absolute_error(y_test, pred)
test_r2 = skmet.r2_score(y_test, pred)
print("Mean Absolute Error: %.2f, R-squared: %.2f" % (test_mae, test_r2))


# plot the residuals
# Histogram uses fixed bin edges from -0.5 up to (but excluding) 1.0 in steps
# of 0.25; the dashed red line marks the mean residual.
plt.hist(preddf.resid, color="blue", bins=np.arange(-0.5,1.0,0.25))
plt.axvline(preddf.resid.mean(), color='red', linestyle='dashed', linewidth=1)
plt.title("Histogram of Residuals for Gas Tax Model")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

# plot predictions against the residuals
# The dashed red line at 0 is the reference for unbiased predictions.
plt.scatter(preddf.prediction, preddf.resid, color="blue")
plt.axhline(0, color='red', linestyle='dashed', linewidth=1)
plt.title("Scatterplot of Predictions and Residuals")
plt.xlabel("Predicted Gas Tax")
plt.ylabel("Residuals")
plt.show()


# do kfold cross validation
# Re-split with a smaller (10%) hold-out; this overwrites the earlier
# X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test =  \
  train_test_split(features,\
  target, test_size=0.1, random_state=22)

kf = KFold(n_splits=3, shuffle=True, random_state=0)

# cross_validate fits its own clones of the estimator for each fold, so this
# fit only refreshes the `ttr` object itself on the new training split.
ttr.fit(X_train, y_train)

scores = cross_validate(ttr, X=X_train, y=y_train,
  cv=kf, scoring=('r2', 'neg_mean_absolute_error'), n_jobs=1)

scores

# The 'neg_mean_absolute_error' scorer returns the NEGATED MAE (so that larger
# is better); flip the sign so the value printed as "Mean Absolute Error" is
# the actual, non-negative error.
print("Mean Absolute Error: %.2f, R-squared: %.2f" % 
  (-scores['test_neg_mean_absolute_error'].mean(),
  scores['test_r2'].mean()))

# 拉索回归

Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Ensure the helper directory is on the import path before importing the
# custom outlier transformer.
helper_dir = os.getcwd() + "/helperfunctions"
sys.path.append(helper_dir)
from preprocfunc import OutlierTrans


# pandas display settings for this section
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:,.2f}'.format)

# reload the fossil fuel tax data, indexed by country code
fftaxrate14 = pd.read_csv("data/fossilfueltaxrate14.csv").set_index('countrycode')

# Feature groups; each group is routed to its own preprocessing pipeline.
num_cols = [
    'fuel_income_dependence',
    'national_income_per_cap',
    'VAT_Rate',
    'gov_debt_per_gdp',
    'polity',
    'goveffect',
    'democracy_index',
]
dummy_cols = [
    'democracy_polity',
    'autocracy_polity',
    'democracy',
    'nat_oil_comp',
    'nat_oil_comp_state',
]
spec_cols = ['motorization_rate']

# one-column target frame and the feature frame (numeric, dummy, special order)
target = fftaxrate14[['gas_tax_imp']]
features = fftaxrate14[num_cols + dummy_cols + spec_cols]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)
      
# Preprocessing pipelines, one per column group.
standtrans = make_pipeline(
    OutlierTrans(2),
    SimpleImputer(strategy="median"),
    StandardScaler(),
)
cattrans = make_pipeline(SimpleImputer(strategy="most_frequent"))
spectrans = make_pipeline(OutlierTrans(2), StandardScaler())

# Route each column group to its pipeline.
coltrans = ColumnTransformer(transformers=[
    ("stand", standtrans, num_cols),
    ("cat", cattrans, dummy_cols),
    ("spec", spectrans, spec_cols),
])

# Add a lasso model to the pipeline and look at the parameter estimates.
# There is no separate feature-selection step in this pipeline.
lasso = Lasso(alpha=0.1, fit_intercept=False)
pipe1 = make_pipeline(coltrans, KNNImputer(n_neighbors=5), lasso)

# The target is standardized for fitting via TransformedTargetRegressor.
ttr = TransformedTargetRegressor(
    regressor=pipe1,
    transformer=StandardScaler(),
)
ttr.fit(X_train, y_train)

# Pair each coefficient with its feature name; the name order follows the
# column transformer's group order (numeric, dummy, special).
coefs = ttr.regressor_.named_steps['lasso'].coef_
feature_names = num_cols + dummy_cols + spec_cols
np.column_stack((coefs.ravel(), feature_names))

# Predict on the test rows, then attach the features and the actual target.
pred = ttr.predict(X_test)

prediction_frame = pd.DataFrame(pred, columns=['prediction'], index=X_test.index)
preddf = prediction_frame.join(X_test, how="left", on=None, validate="many_to_many")
preddf = preddf.join(y_test)

# residual = actual - predicted
preddf['resid'] = preddf['gas_tax_imp'] - preddf['prediction']

preddf['resid'].agg(['mean','median','skew','kurtosis'])

# summary model evaluation statistics on the test set
test_mae = skmet.mean_absolute_error(y_test, pred)
test_r2 = skmet.r2_score(y_test, pred)
print("Mean Absolute Error: %.2f, R-squared: %.2f" % (test_mae, test_r2))


# plot the residuals
# Histogram uses fixed bin edges from -0.5 up to (but excluding) 1.0 in steps
# of 0.25; the dashed red line marks the mean residual.
plt.hist(preddf.resid, color="blue", bins=np.arange(-0.5,1.0,0.25))
plt.axvline(preddf.resid.mean(), color='red', linestyle='dashed', linewidth=1)
plt.title("Histogram of Residuals for Gas Tax Model")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

# plot predictions against the residuals
# The dashed red line at 0 is the reference for unbiased predictions.
plt.scatter(preddf.prediction, preddf.resid, color="blue")
plt.axhline(0, color='red', linestyle='dashed', linewidth=1)
plt.title("Scatterplot of Predictions and Residuals")
plt.xlabel("Predicted Gas Tax")
plt.ylabel("Residuals")
plt.show()

# do kfold cross validation
# Re-split with a smaller (10%) hold-out; this overwrites the earlier
# X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test =  \
  train_test_split(features,\
  target, test_size=0.1, random_state=22)

kf = KFold(n_splits=4, shuffle=True, random_state=0)

scores = cross_validate(ttr, X=X_train, y=y_train,
  cv=kf, scoring=('r2', 'neg_mean_absolute_error'), n_jobs=1)


# The 'neg_mean_absolute_error' scorer returns the NEGATED MAE (so that larger
# is better); flip the sign so the value printed as "Mean Absolute Error" is
# the actual, non-negative error.
print("Mean Absolute Error: %.2f, R-squared: %.2f" % 
  (-scores['test_neg_mean_absolute_error'].mean(),
  scores['test_r2'].mean()))


# do a grid search to find the best value of alpha
lasso = Lasso()

pipe1 = make_pipeline(coltrans, KNNImputer(n_neighbors=5), lasso)

# Wrap the freshly built pipeline in its own target-transforming regressor and
# search over THAT estimator. Passing the earlier `ttr` to GridSearchCV would
# leave this new pipeline unused and tune the previously configured
# Lasso(alpha=0.1, fit_intercept=False) instead.
ttr_gs = TransformedTargetRegressor(regressor=pipe1,
  transformer=StandardScaler())

# Parameter path: TransformedTargetRegressor.regressor -> pipeline step
# 'lasso' -> alpha. Candidates are 0.05, 0.10, ... below 1.
lasso_params = {'regressor__lasso__alpha': np.arange(0.05, 1, 0.05)}

gs = GridSearchCV(ttr_gs,param_grid=lasso_params, cv=5)
gs.fit(X_train, y_train)

gs.best_params_
gs.best_score_

# 非线性回归

Nonlinear Regression

In [None]:

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Make the helper directory importable and import the outlier transformer.
sys.path.append(os.getcwd() + "/helperfunctions")
from preprocfunc import OutlierTrans

# pandas display settings for this section
display_settings = {
    'display.width': 150,
    'display.max_columns': 50,
    'display.max_rows': 100,
    'display.float_format': '{:,.2f}'.format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# load the land temperatures data, indexed by location id
landtemps = pd.read_csv("data/landtempsb2019avgs.csv").set_index('locationid')

feature_cols = ['latabs','elevation']

# quick summary of the target and the two features
summary_cols = ['avgtemp'] + feature_cols
landtemps[summary_cols].agg(['count','min','median','max']).T

# create training and testing DataFrames (10% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    landtemps[feature_cols],
    landtemps[['avgtemp']],
    test_size=0.1,
    random_state=0,
)

# do a linear regression and cross validate
lr = LinearRegression()
knnimp = KNNImputer(n_neighbors=45)

# outlier handling -> KNN imputation -> scaling -> linear model
pipe1 = make_pipeline(OutlierTrans(3), knnimp, StandardScaler(), lr)

# The target is standardized for fitting via TransformedTargetRegressor.
ttr = TransformedTargetRegressor(
    regressor=pipe1,
    transformer=StandardScaler(),
)

# 10-fold cross validation with shuffling and a fixed seed
kf = KFold(n_splits=10, shuffle=True, random_state=0)

scores = cross_validate(
    ttr,
    X=X_train,
    y=y_train,
    cv=kf,
    scoring=('r2', 'neg_mean_absolute_error'),
    n_jobs=1,
)

# (mean R-squared, mean negated MAE) across the folds
scores['test_r2'].mean(), scores['test_neg_mean_absolute_error'].mean()

# Fit on the training rows, predict the held-out rows, and collect the
# predictions, features, and actual temperatures in one frame.
ttr.fit(X_train, y_train)
pred = ttr.predict(X_test)

prediction_frame = pd.DataFrame(pred, columns=['prediction'], index=X_test.index)
preddf = prediction_frame.join(X_test)
preddf = preddf.join(y_test, how="left", on=None, validate="many_to_many")

# residual = actual - predicted
preddf['resid'] = preddf['avgtemp'] - preddf['prediction']

preddf['resid'].agg(['mean','median','skew','kurtosis'])

# Residual histogram for the linear model; dashed red line = mean residual.
plt.hist(preddf.resid, color="blue")
plt.axvline(preddf.resid.mean(), color='red', linestyle='dashed', linewidth=1)
plt.gca().set(
    title="Histogram of Residuals for Linear Model of Temperature",
    xlabel="Residuals",
    ylabel="Frequency",
)
plt.show()

# Residuals against predictions, with a zero reference line and fixed axis
# limits.
plt.scatter(preddf.prediction, preddf.resid, color="blue")
plt.axhline(0, color='red', linestyle='dashed', linewidth=1)
plt.gca().set(
    title="Scatterplot of Predictions and Residuals",
    xlabel="Predicted Temperature",
    ylabel="Residuals",
    xlim=(-20, 40),
    ylim=(-27, 10),
)
plt.show()


# do a polynomial transformation
# Degree-4 polynomial and interaction terms, without the constant column.
polytrans = PolynomialFeatures(degree=4, include_bias=False)
# Fit on complete rows only, so the generated names can be inspected here.
polytrans.fit(X_train.dropna())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. Fall back to the old method on
# releases that predate it, and keep the result a plain list either way.
if hasattr(polytrans, "get_feature_names_out"):
    featurenames = list(polytrans.get_feature_names_out(feature_cols))
else:
    featurenames = polytrans.get_feature_names(feature_cols)
featurenames

# Same pipeline as the linear model, with the polynomial expansion inserted
# between imputation and scaling.
pipe2 = make_pipeline(
    OutlierTrans(3),
    knnimp,
    polytrans,
    StandardScaler(),
    lr,
)

ttr2 = TransformedTargetRegressor(
    regressor=pipe2,
    transformer=StandardScaler(),
)
ttr2.fit(X_train, y_train)

# get predictions and residuals for the held-out rows
pred = ttr2.predict(X_test)

prediction_frame = pd.DataFrame(pred, columns=['prediction'], index=X_test.index)
preddf = prediction_frame.join(X_test)
preddf = preddf.join(y_test, how="left", on=None, validate="many_to_many")

# residual = actual - predicted
preddf['resid'] = preddf['avgtemp'] - preddf['prediction']

preddf['resid'].agg(['mean','median','skew','kurtosis'])

# Residual histogram for the polynomial model; dashed red line = mean residual.
plt.hist(preddf.resid, color="blue")
plt.axvline(preddf.resid.mean(), color='red', linestyle='dashed', linewidth=1)
plt.gca().set(
    title="Histogram of Residuals for Temperature Model",
    xlabel="Residuals",
    ylabel="Frequency",
)
plt.show()

# Residuals against predictions, using the same axis limits as the linear
# model's scatterplot so the two can be compared directly.
plt.scatter(preddf.prediction, preddf.resid, color="blue")
plt.axhline(0, color='red', linestyle='dashed', linewidth=1)
plt.gca().set(
    title="Scatterplot of Predictions and Residuals",
    xlabel="Predicted Temperature",
    ylabel="Residuals",
    xlim=(-20, 40),
    ylim=(-27, 10),
)
plt.show()

# Cross validate the polynomial model with the same folds as the linear model.
scores = cross_validate(
    ttr2,
    X=X_train,
    y=y_train,
    cv=kf,
    scoring=('r2', 'neg_mean_absolute_error'),
    n_jobs=1,
)

# (mean R-squared, mean negated MAE) across the folds
mean_r2 = scores['test_r2'].mean()
mean_neg_mae = scores['test_neg_mean_absolute_error'].mean()
mean_r2, mean_neg_mae

# 梯度下降

Gradient Descent

In [4]:
from sklearn.linear_model import SGDRegressor

# Make the helper directory importable and import the outlier transformer.
helper_dir = os.getcwd() + "/helperfunctions"
sys.path.append(helper_dir)
from preprocfunc import OutlierTrans

# pandas display settings for this section
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:,.2f}'.format)

# load the land temperatures data, indexed by location id
landtemps = pd.read_csv("data/landtempsb2019avgs.csv").set_index('locationid')

feature_cols = ['latabs','elevation']

# create training and testing DataFrames (10% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    landtemps[feature_cols],
    landtemps[['avgtemp']],
    test_size=0.1,
    random_state=0,
)

    
knnimp = KNNImputer(n_neighbors=45)

# SGD shuffles the training data between epochs; without a fixed seed the
# fitted models, and therefore the grid search ranking, change from run to run.
sgdr = SGDRegressor(random_state=0)

# outlier handling -> KNN imputation -> scaling -> SGD regressor
pipe1 = make_pipeline(OutlierTrans(3),knnimp,StandardScaler(), sgdr)

# The target is standardized for fitting via TransformedTargetRegressor.
ttr=TransformedTargetRegressor(regressor=pipe1,transformer=StandardScaler())

# Parameter path: TransformedTargetRegressor.regressor -> pipeline step
# 'sgdregressor' -> hyperparameter.
sgdr_params = {
 # regularization strengths 1e-1 down to 1e-6
 'regressor__sgdregressor__alpha': 10.0 ** -np.arange(1, 7),
 'regressor__sgdregressor__loss': ['huber','epsilon_insensitive'],
 'regressor__sgdregressor__penalty': ['l2', 'l1', 'elasticnet'],
 'regressor__sgdregressor__epsilon': np.arange(0.1, 1.6, 0.1)
}

# Exhaustive search: every combination above is scored by R-squared with
# 5-fold cross validation.
gs = GridSearchCV(ttr,param_grid=sgdr_params, cv=5, scoring="r2")
gs.fit(X_train, y_train)

gs.best_params_
gs.best_score_

# Line up each candidate's mean CV score with its parameter settings (both
# share the default positional index), best score first.
mean_scores = pd.DataFrame(gs.cv_results_['mean_test_score'], columns=['meanscore'])
param_settings = pd.DataFrame(gs.cv_results_['params'])
results = mean_scores.join(
    param_settings, how="left", on=None, validate="many_to_many"
).sort_values(['meanscore'], ascending=False)

# top three candidates, transposed so each candidate is a column
results.head(3).T

Unnamed: 0,434,422,445
meanscore,0.79,0.79,0.79
regressor__sgdregressor__alpha,0.00,0.00,0.00
regressor__sgdregressor__epsilon,1.30,1.10,1.50
regressor__sgdregressor__loss,huber,huber,huber
regressor__sgdregressor__penalty,elasticnet,elasticnet,l1
