In [23]:
# 数据读取及基本处理
import numpy as np
import pandas as pd

#数据分割
from sklearn.model_selection import train_test_split

# 缺省参数的线性回归
from sklearn.linear_model import LinearRegression
# 带有广义交叉项的岭回归
from sklearn.linear_model import  RidgeCV
#Lasso模型
from sklearn.linear_model import LassoCV

# 导入评价指标
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## 1.读取做完特征工程的文件 

In [8]:
# 读取做完特征工程的文件 
df = pd.read_csv('feature_train_day_cnt.csv')

# 显示数据的前五行
df.head()

Unnamed: 0,instant,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,...,weekday_5,weekday_6,temp,atemp,hum,windspeed,holiday,workingday,yr,cnt
0,1,1,0,0,0,1,0,0,0,0,...,0,1,0.35517,0.373517,0.82862,0.284606,0,0,0,985
1,2,1,0,0,0,1,0,0,0,0,...,0,0,0.379232,0.360541,0.715771,0.466215,0,0,0,801
2,3,1,0,0,0,1,0,0,0,0,...,0,0,0.171,0.14483,0.449638,0.46574,0,1,0,1349
3,4,1,0,0,0,1,0,0,0,0,...,0,0,0.17553,0.174649,0.607131,0.284297,0,1,0,1562
4,5,1,0,0,0,1,0,0,0,0,...,0,0,0.20912,0.197158,0.449313,0.339143,0,1,0,1600


## 2.分离测试样本和训练样本(20%测试，80%训练)

In [17]:
# 从原始数据中分离输入特征x和输出y
y=df['cnt']
X=df.drop(['cnt'],axis=1)

# 特征名称，用于后续显示权重系数对应的特征
feature_names= X.columns

#数据分割
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=33,test_size=0.2)
X_train.shape

(584, 34)

## 3.模型训练

### 1）训练最小二乘线性回归模型

In [27]:
#1.使用默认配置初始化学习器实例
lr=LinearRegression()

#2.用训练数据训练模型参数
lr.fit(X_train,y_train)

#3.用训练好的模型对测试集进行预测
y_test_pred_lr=lr.predict(X_test)
y_train_pred_lr=lr.predict(X_train)

# 各特征的权重系数，系数绝对值的大小可视为该特征的重要性
fs=pd.DataFrame({'columns':list(feature_names),'coef':list((lr.coef_.T))})
fs.sort_values(by=['coef'],ascending=False)

Unnamed: 0,columns,coef
33,yr,4550.708897
27,temp,2654.792827
13,mnth_9,1287.99236
28,atemp,995.293778
14,mnth_10,929.887649
17,weathersit_1,914.410909
4,season_4,830.579518
16,mnth_12,586.296405
12,mnth_8,517.803038
18,weathersit_2,409.589079


### 2）训练岭回归模型

In [28]:
# 1.设置超参数（正则参数）范围
alphas=[0.001,0.1,1,10,100]

# 2. 生成一个Ridge实例
ridge=RidgeCV(alphas=alphas,store_cv_values=True)

# 3.模型训练
ridge.fit(X_train, y_train)

# 4.模型预测
y_test_pred_ridge = ridge.predict(X_test)
y_train_pred_ridge = ridge.predict(X_train)

#观察各特征的权重系数
fs_2=pd.DataFrame({'columns':list(feature_names),'coef':list((ridge.coef_.T))})
fs_2.sort_values(by=['coef'],ascending=False)
 

Unnamed: 0,columns,coef
27,temp,1778.493414
28,atemp,1546.374034
33,yr,1504.623924
17,weathersit_1,914.843092
4,season_4,767.205317
13,mnth_9,678.352931
18,weathersit_2,388.865215
9,mnth_5,387.713628
10,mnth_6,369.663154
7,mnth_3,298.55005


### 3）训练Lasso回归模型

In [24]:
#1. 设置超参数（正则参数）范围
alphas = [0.01, 0.1, 1, 10, 100]

#2. 生成一个LassoCV实例
lasso = LassoCV(alphas=alphas)

#3. 模型训练
lasso.fit(X_train, y_train)

#4.用训练好的模型对测试集进行预测
y_test_pred_lasso = lasso.predict(X_test)  # 模型在测试集上的预测
y_train_pred_lasso = lasso.predict(X_train)  # 模型在训练集上的预测 

# 看看各特征的权重系数(coef指系数)，系数的绝对值大小可视为该特征的重要性
fs_3 = pd.DataFrame({'columns':list(feature_names), 'coef':list((lasso.coef_.T))})
fs_3.sort_values(by=['coef'], ascending=False)

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


Unnamed: 0,columns,coef
33,yr,3472.533627
27,temp,2609.9748
28,atemp,1031.60094
13,mnth_9,924.516638
4,season_4,759.982846
17,weathersit_1,499.554848
14,mnth_10,472.969983
12,mnth_8,244.55104
10,mnth_6,99.293831
25,weekday_5,28.802096


## 4.模型评价（评价指标RMSE、r2）

### 1）最小二乘线性回归模型评价

In [29]:
#测试集
print("The RMSE of LinearRegression on test is", np.sqrt(mean_squared_error(y_test, y_test_pred_lr)))
print("The r2 score of LinearRegression on test is", r2_score(y_test, y_test_pred_lr))
# 训练集
print("The RMSE of LinearRegression on train is", np.sqrt(mean_squared_error(y_train, y_train_pred_lr)))
print("The r2 score of LinearRegression on train is", r2_score(y_train, y_train_pred_lr))

The RMSE of LinearRegression on test is 814.474907686365
The r2 score of LinearRegression on test is 0.8279474225980327
The RMSE of LinearRegression on train is 742.7543512758713
The r2 score of LinearRegression on train is 0.8516480637403496


### 2）岭回归模型评价

In [32]:
# 测试集
print("The RMSE of RidgeCV on test is", np.sqrt(mean_squared_error(y_test, y_test_pred_ridge)))
print("The r2 score of RidgeCV on test is", r2_score(y_test, y_test_pred_ridge))
# 训练集
print("The RMSE of RidgeCV on train is", np.sqrt(mean_squared_error(y_train, y_train_pred_ridge)))
print("The r2 score of RidgeCV on train is", r2_score(y_train, y_train_pred_ridge))

The RMSE of RidgeCV on test is 812.0412682483411
The r2 score of RidgeCV on test is 0.8289740678100304
The RMSE of RidgeCV on train is 747.4410131599831
The r2 score of RidgeCV on train is 0.8497700029725637


### 3）Lasso回归模型评价

In [33]:
# 测试集
print("The RMSE of LassoCV on test is", np.sqrt(mean_squared_error(y_test, y_test_pred_lasso)))
print("The r2 score of LassoCV on test is", r2_score(y_test, y_test_pred_lasso))
# 训练集
print("The RMSE of LassoCV on train is", np.sqrt(mean_squared_error(y_train, y_train_pred_lasso)))
print("The r2 score of LassoCV on train is", r2_score(y_train, y_train_pred_lasso))

The RMSE of LassoCV on test is 813.4148635007385
The r2 score of LassoCV on test is 0.8283949861658034
The RMSE of LassoCV on train is 743.1893104872604
The r2 score of LassoCV on train is 0.8514742621740345


## 5.比较说明

1）特征系数   
比较上述三种模型得到的特征系数我们可以看出，岭回归和Lasso回归都能使得线性回归系数收缩，并且在Lasso中有的特征参数系数为0。回归系数都收缩是原因岭回归和Lasso都在最小二乘线性回归的基础上加了正则，限制了特征参数的取值，而Lasso中某些特征的系数为0，是因为对于L1正则，目标函数求的是次梯度，当梯度在次梯度集合内的时候，该维度的特征系数为0.

2）在测试集上的性能   
通过观察模型评价指标可以看出，在训练集上评价最好的是岭回归模型，其次是Lasso模型，最后是最小二乘线性回归。原因是岭回归和Lasso都在最小二乘线性回归模型中加入了正则项，防止了模型过拟合的问题，所以效果要更好些。而在特征分析中，我们看到有很多特征相关性比较大，比如说温度与体感温度，在特征多，且特征间存在共线性关系时使用L2正则效果要更好，所以这这里岭回归模型比Lasso回归又好些。