# 数据预处理

In [10]:
from sklearn import datasets
import numpy as np
iris = datasets.load_iris() # 导入数据集
X = iris.data # 获得其特征向量
y = iris.target # 获得样本label

# 分割数据集
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

print(X_train[:5], '\n', y_train[:5,], '\n','测试集:','\n', X_test[:3,:], '\n', y_test[:3] )

[[7.7 2.6 6.9 2.3]
 [5.1 3.8 1.5 0.3]
 [4.6 3.2 1.4 0.2]
 [6.1 3.  4.9 1.8]
 [6.3 2.5 4.9 1.5]] 
 [2 0 0 2 1] 
 测试集: 
 [[6.1 3.  4.6 1.4]
 [6.4 2.8 5.6 2.2]
 [6.9 3.2 5.7 2.3]] 
 [1 2 2]


In [2]:
from sklearn import preprocessing
# 1. 基于mean和std的标准化
scaler = preprocessing.StandardScaler().fit(X_train)
train_data = scaler.transform(X_train)
test_data = scaler.transform(X_test)

# 模型拟合和预测

In [None]:
def fit(X_train, y_train, X_test, model, y_test = None):
    # 拟合模型
    model.fit(X_train, y_train)
    # 模型预测
    print(model.predict(X_test))
    
    # 获得这个模型的参数
    print(model.get_params())
    # 为模型进行打分
    print('model得分(训练集）：',model.score(X_train, y_train)) # 线性回归：R square； 分类问题： acc
    if y_test is not None:
        print('model得分（测试集）：',model.score(X_test, y_test)) # 线性回归：R square； 分类问题： acc
        # 显示综合指标
        anwser=model.predict(X_test)
        from sklearn.metrics import classification_report,accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score
        print(classification_report(y_test,anwser))

In [None]:
# 拟合模型
model.fit(X_train, y_train)
# 模型预测
print(model.predict(X_test))

# 获得这个模型的参数
print(model.get_params())
# 为模型进行打分
print('model得分：',model.score(X_train, y_train)) # 线性回归：R square； 分类问题： acc

# 线性回归模型

In [5]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True, normalize=False, 
    copy_X=True, n_jobs=1)
"""
参数
---
    fit_intercept：是否计算截距。False-模型没有截距
    normalize： 当fit_intercept设置为False时，该参数将被忽略。 如果为真，则回归前的回归系数X将通过减去平均值并除以l2-范数而归一化。
     n_jobs：指定线程数
"""
fit(X_train, y_train, X_test, model)

[ 1.21051493  1.30360611 -0.00628713  1.22231359  0.22868601  1.56726256
 -0.03428546  1.38096044  1.84255115  1.87825047  2.07150535 -0.07527829
  1.22295999  1.17923241 -0.0218404  -0.14788095  1.71994611 -0.08054844
  1.40034191  1.62002308  0.88542604  1.39352526  1.60216416  1.31396282
  0.1904589  -0.01620439  1.34712674  1.7592645   1.33755197  0.0731164
  1.53088587  2.26297305 -0.0429911   1.50854692 -0.12684239  1.36353337
  0.87693494  1.62255847  2.00649912  1.24432525  0.94833357  1.33687459
  0.02070654  1.94379854  1.3105583 ]
{'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'normalize': False}
model得分(训练集）： 0.9484664413970854


# 多项式回归

In [12]:
# 从sklearn.preproessing中导入多项式特征产生器
from sklearn.preprocessing import PolynomialFeatures

# 使用PolynominalFeatures(degree=2)映射出2次多项式特征，存储在变量X_train_poly2中。
poly2 = PolynomialFeatures(degree=2)

# 若训练集X_train 中不止一个因素，那么出现交叉效应，与方差分析相似
X_train_poly2 = poly2.fit_transform(X_train)
X_test_poly2 = poly2.fit_transform(X_test)
# 以线性回归器为基础，初始化回归模型。尽管特征的维度有提升，但是模型基础仍然是线性模型。
model = LinearRegression()

fit(X_train, y_train, X_test, model)

[ 1.29807778  1.93899884  1.9526176   1.16859315 -0.03383056 -0.0222433
  1.63357273  0.88206733  1.56834976  1.21124951  1.55543048  1.99331374
 -0.11684131  2.07528894  0.94513384  1.78076954  1.900731   -0.04085768
  1.46582458  1.33114105 -0.01106339  1.96880017 -0.13455781  1.03801088
 -0.05586566  0.81948431  1.16499026  1.38115494  1.16860235 -0.09469044
 -0.04960794  1.72457002 -0.13462205  1.02544175  1.83487901  1.71332019
  1.27133296  1.12567279  1.29995402  2.04771399  0.01026303  0.15092542
  1.28002033  0.00560836  1.53903896]
{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}
model得分(训练集）： 0.9306955508703275


# 逻辑回归模型

In [13]:
from sklearn.linear_model import LogisticRegression
# 定义逻辑回归模型
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, 
    fit_intercept=True, intercept_scaling=1, class_weight=None, 
    random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', 
    verbose=0, warm_start=False, n_jobs=1)

"""参数
---
    penalty：使用指定正则化项（默认：l2）
    dual: n_samples > n_features取False（默认）
    C：正则化强度的反，值越小正则化强度越大
    n_jobs: 指定线程数
    random_state：随机数生成器
    fit_intercept: 是否需要常量
"""

fit(X_train, y_train, X_test, model)

[1 2 2 1 0 0 2 1 2 1 2 2 0 2 1 2 2 0 2 1 0 2 0 1 0 1 1 2 1 0 0 2 0 1 2 2 1
 1 1 2 0 0 1 0 2]
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': 1, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
model得分(训练集）： 0.9619047619047619


# 决策树DT

In [15]:
# 决策树DT
from sklearn import tree

model = tree.DecisionTreeClassifier(criterion='gini', max_depth=None, 
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
    max_features=None, random_state=None, max_leaf_nodes=None, 
    min_impurity_decrease=0.0, min_impurity_split=None,
     class_weight=None)

"""参数
---
    criterion ：特征选择准则gini/entropy
    max_depth：树的最大深度，None-尽量下分
    min_samples_split：分裂内部节点，所需要的最小样本树
    min_samples_leaf：叶子节点所需要的最小样本数
    max_features: 寻找最优分割点时的最大特征数
    max_leaf_nodes：优先增长到最大叶子节点数
    min_impurity_decrease：如果这种分离导致杂质的减少大于或等于这个值，则节点将被拆分。
"""

fit(X_train, y_train, X_test, model)

[1 2 2 1 0 0 2 1 1 1 2 2 0 2 1 2 2 0 2 1 0 2 0 1 0 1 1 1 1 0 0 2 0 1 2 2 1
 1 1 2 0 0 1 0 2]
{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
model得分(训练集）： 1.0


In [16]:
# 回归树DTR
from sklearn import tree
model = tree.DecisionTreeRegressor()

fit(X_train, y_train, X_test, model)

[1. 2. 2. 1. 0. 0. 2. 1. 1. 1. 2. 2. 0. 2. 1. 2. 2. 0. 2. 1. 0. 2. 0. 1.
 0. 1. 1. 1. 1. 0. 0. 2. 0. 1. 2. 2. 1. 1. 1. 2. 0. 0. 1. 0. 2.]
{'criterion': 'mse', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
model得分(训练集）： 1.0


# 多层感知机分类

In [19]:
from sklearn.neural_network import MLPClassifier
# 定义多层感知机分类算法
model = MLPClassifier(activation='relu', solver='adam', alpha=0.0001)
"""参数
---
    hidden_layer_sizes: 元祖
    activation：激活函数
    solver ：优化算法{‘lbfgs’, ‘sgd’, ‘adam’}
    alpha：L2惩罚(正则化项)参数。
"""
fit(X_train, y_train, X_test, model) # 不收敛警告

[1 2 2 1 0 0 2 1 2 1 2 2 0 2 1 2 2 0 2 1 0 2 0 1 0 1 1 1 1 0 0 2 0 1 2 2 1
 1 1 2 0 0 1 0 2]
{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
model得分(训练集）： 0.9809523809523809




# 保存模型

In [20]:
import pickle

# 保存模型
with open('model.pickle', 'wb') as f:
    pickle.dump(model, f)

# 读取模型
with open('model.pickle', 'rb') as f:
    model = pickle.load(f)
model.predict(X_test)

array([1, 2, 2, 1, 0, 0, 2, 1, 2, 1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 1, 0, 2,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 2, 0, 1, 2, 2, 1, 1, 1, 2, 0, 0, 1, 0,
       2])

In [21]:
from sklearn.externals import joblib

# 保存模型
joblib.dump(model, 'model.pickle')

#载入模型
model = joblib.load('model.pickle')
"""参数
---
    这个版本太新了，要退回旧版本。等以后更新处理
"""



'参数\n---\n    这个版本太新了，要退回旧版本。等以后更新处理\n'