# 数据预处理

In [10]:
from sklearn import datasets
import numpy as np
iris = datasets.load_iris() # 导入数据集
X = iris.data # 获得其特征向量
y = iris.target # 获得样本label

# 分割数据集
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

print(X_train[:5], '\n', y_train[:5,], '\n','测试集:','\n', X_test[:3,:], '\n', y_test[:3] )

[[7.7 2.6 6.9 2.3]
 [5.1 3.8 1.5 0.3]
 [4.6 3.2 1.4 0.2]
 [6.1 3.  4.9 1.8]
 [6.3 2.5 4.9 1.5]] 
 [2 0 0 2 1] 
 测试集: 
 [[6.1 3.  4.6 1.4]
 [6.4 2.8 5.6 2.2]
 [6.9 3.2 5.7 2.3]] 
 [1 2 2]


In [2]:
from sklearn import preprocessing
# 1. 基于mean和std的标准化
scaler = preprocessing.StandardScaler().fit(X_train)
train_data = scaler.transform(X_train)
test_data = scaler.transform(X_test)

# 模型拟合和预测

In [None]:
def fit(X_train, y_train, X_test, model, y_test = None):
    # 拟合模型
    model.fit(X_train, y_train)
    # 模型预测
    print(model.predict(X_test))
    
    # 获得这个模型的参数
    print(model.get_params())
    # 为模型进行打分
    print('model得分(训练集）：',model.score(X_train, y_train)) # 线性回归：R square； 分类问题： acc
    if y_test is not None:
        print('model得分（测试集）：',model.score(X_test, y_test)) # 线性回归：R square； 分类问题： acc
        # 显示综合指标
        anwser=model.predict(X_test)
        from sklearn.metrics import classification_report,accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score
        print(classification_report(y_test,anwser))

In [None]:
# 拟合模型
model.fit(X_train, y_train)
# 模型预测
print(model.predict(X_test))

# 获得这个模型的参数
print(model.get_params())
# 为模型进行打分
print('model得分：',model.score(X_train, y_train)) # 线性回归：R square； 分类问题： acc

# 正则化线性回归模型

In [6]:
from sklearn.linear_model import Lasso

model = Lasso(alpha=.3)
names = iris["feature_names"]

fit(X_train, y_train, X_test, model)

[1.32220517 1.39342326 0.14710657 1.25098707 0.25393372 1.50025041
 0.21832467 1.28659612 1.60707756 1.6426866  1.50025041 0.21832467
 1.25098707 1.21537802 0.14710657 0.21832467 1.6426866  0.18271562
 1.28659612 1.42903231 0.75246039 1.28659612 1.74951375 1.28659612
 0.28954276 0.14710657 1.32220517 1.82073185 1.35781422 0.28954276
 1.46464136 1.82073185 0.21832467 1.46464136 0.14710657 1.42903231
 0.93050564 1.42903231 1.67829565 1.35781422 1.00172373 1.35781422
 0.18271562 1.53585946 1.32220517]
{'alpha': 0.3, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'normalize': False, 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False}
model得分(训练集）： 0.8850119309714172


L1正则化将系数w的l1范数作为惩罚项加到损失函数上，由于正则项非零，这就迫使那些弱的特征所对应的系数变成0。

因此L1正则化往往会使学到的模型很稀疏（系数w经常为0），这个特性使得L1正则化成为一种很好的特征选择方法。

In [7]:
# 另一个例子
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
  
boston = load_boston()
scaler = StandardScaler()
X = scaler.fit_transform(boston["data"])
Y = boston["target"]
names = boston["feature_names"]

alpha = 0.3
lasso = Lasso(alpha=alpha)
lasso.fit(X, Y)

def pretty_print_linear(coefs, names = None, sort = False):  
    import pandas as pd
    temp = pd.DataFrame(coefs, names)
    return temp

print("Lasso model's ahlpha: ",alpha , pretty_print_linear(lasso.coef_, names))

Lasso model's ahlpha:  0.3                 0
CRIM    -0.242279
ZN       0.081819
INDUS   -0.000000
CHAS     0.539872
NOX     -0.698913
RM       2.993230
AGE     -0.000000
DIS     -1.080913
RAD      0.000000
TAX     -0.000000
PTRATIO -1.755612
B        0.628315
LSTAT   -3.704633


L2正则化将系数向量的L2范数添加到了损失函数中。由于L2惩罚项中系数是二次方的，这使得L2和L1有着诸多差异，最明显的一点就是，L2正则化会让系数的取值变得平均。对于关联特征，这意味着他们能够获得更相近的对应系数。还是以Y=X1+X2为例，假设X1和X2具有很强的关联，如果用L1正则化，不论学到的模型是Y=X1+X2还是Y=2X1，惩罚都是一样的，都是2alpha。但是对于L2来说，第一个模型的惩罚项是2 alpha，但第二个模型的是4*alpha。可以看出，系数之和为常数时，各系数相等时惩罚是最小的，所以才有了L2会让各个系数趋于相同的特点。

In [8]:
# L2正则化/Ridge regression
from sklearn.linear_model import Ridge
model = Ridge(alpha=10)

fit(X_train, y_train, X_test, model)

[ 1.31625124  1.4098341  -0.02024382  1.28648666  0.15397408  1.58446362
  0.01293899  1.38232621  1.78571779  1.82478133  1.81087885 -0.00749258
  1.27536326  1.22765446 -0.02794898 -0.05291986  1.74573899 -0.02454107
  1.35755584  1.57061192  0.79311531  1.35640992  1.77143807  1.33054549
  0.16533844 -0.02978374  1.37983778  1.88318019  1.38709508  0.10220898
  1.5520048   2.09549136  0.01089843  1.5476724  -0.06653981  1.45412468
  0.90897903  1.56038317  1.90312452  1.35442013  0.9871061   1.39369979
  0.01677915  1.78683887  1.3500721 ]
{'alpha': 10, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': False, 'random_state': None, 'solver': 'auto', 'tol': 0.001}
model得分(训练集）： 0.940143000790545


In [11]:
# 另一个例子
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
size = 100
#We run the method 10 times with different random seeds
for i in range(2):
    print("Random seed %s" % i)
    np.random.seed(seed=i)
    X_seed = np.random.normal(0, 1, size)
    X1 = X_seed + np.random.normal(0, .1, size)
    X2 = X_seed + np.random.normal(0, .1, size)
    X3 = X_seed + np.random.normal(0, .1, size)
    Y = X1 + X2 + X3 + np.random.normal(0, 1, size)
    X = np.array([X1, X2, X3]).T
    
    lr = LinearRegression()
    lr.fit(X,Y)
    print("Linear model:", pretty_print_linear(lr.coef_))
    ridge = Ridge(alpha=10)
    ridge.fit(X,Y)
    print("Ridge model:", pretty_print_linear(ridge.coef_))
    print

Random seed 0
Linear model:           0
0  0.728440
1  2.309260
2 -0.082192
Ridge model:           0
0  0.938321
1  1.058873
2  0.876526
Random seed 1
Linear model:           0
0  1.151816
1  2.365799
2 -0.599009
Ridge model:           0
0  0.984096
1  1.067927
2  0.758554
