# scikit-learn
Pythonの有名な機械学習ライブラリ

+ 分類
+ クラスタリング
+ 回帰
+ データ分割・正規化・次元削減

# データの読み込み
irisデータセットという「花（アヤメ）」に関する有名なデータがあります

data：ガクの長さ・ガクの幅・花弁の長さ・花弁の幅

target：花（アヤメ）の種類

In [1]:
"""Load the iris dataset bundled with scikit-learn."""
from sklearn.datasets import load_iris
iris = load_iris()

In [2]:
"""Show the contents of the loaded dataset object."""
print(iris)

{'data': array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
     

In [3]:
"""Show the description text (DESCR) that ships with the dataset."""
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

# データ分割
訓練データ：モデルを作成するためのデータ

テストデータ：作成したモデルの性能を検証するデータ

過学習：訓練データにのみ当てはまりの良いモデルを作成してしまうこと。訓練データとテストデータの双方によく当てはまらなければ、良いモデルとは言えない

In [4]:
"""Split the iris data into training and test sets.

- pass the explanatory variables (features) and the target variable
- choose what fraction of the whole data set becomes test data
- choose whether to fix the random seed so the split is repeatable
"""
from sklearn.model_selection import train_test_split

# 30% of the rows are held out for testing; random_state=0 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

# 重回帰分析

今回は説明変数が４つなので、重回帰分析を行う

$y = w_0 x_0 + w_1 x_1 + w_2 x_2 + w_3 x_3 + C$

In [5]:
"""Create the base model (a LinearRegression that has not been fitted yet)."""
from sklearn.linear_model import LinearRegression
clf = LinearRegression()

In [6]:
"""Optimize (fit) the model on the training data."""
clf.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [7]:
"""Display the results: the parameters of the fitted model."""
# Regression coefficients
print(clf.coef_)
# Intercept (constant term)
print(clf.intercept_)

[-0.17009418 -0.01856621  0.27900206  0.56061274]
0.3501722420686396


In [8]:
"""In other words: evaluate the regression equation by hand.

Each of the four feature columns of the training data is weighted by the
matching fitted coefficient, and the fitted intercept is added last.
"""
y_predict = (
    clf.coef_[0] * X_train[:, 0]
    + clf.coef_[1] * X_train[:, 1]
    + clf.coef_[2] * X_train[:, 2]
    + clf.coef_[3] * X_train[:, 3]
    + clf.intercept_
)
print(y_predict)

[ 9.99688875e-01  1.73247571e+00  2.14111635e+00  1.52469092e+00
  1.79144060e+00  1.05601136e+00  1.97607030e+00  1.42560049e+00
  1.04697719e+00  1.60466317e+00  1.49042314e+00  1.67884617e+00
  1.69233790e+00  1.19626863e+00  1.76089764e+00  1.41897171e+00
  3.46907633e-02  1.80157195e+00  1.24812129e+00  1.07700681e+00
  1.21304353e+00  1.32516643e+00  1.74919758e+00 -8.93962609e-02
 -7.39678307e-02  1.75531586e+00  1.16948073e+00 -4.50354031e-02
 -5.83077329e-02  1.32699636e+00  4.08093934e-02  1.72293140e+00
  8.89956594e-01  6.34005980e-02  1.35117105e+00  2.16210022e+00
  1.22739863e+00 -7.53143028e-02  1.91020385e+00  1.99683158e+00
  2.04042143e+00  1.94928240e+00 -2.93338391e-02  2.91209000e-02
  1.72928419e+00  1.91687125e+00  7.29326417e-02  2.02848365e+00
 -3.62624468e-02  1.85045568e+00  1.80286504e+00  1.71711960e-02
 -6.44115915e-02  1.58498786e+00 -2.35031144e-02  2.08844382e-02
  5.78040401e-02  1.21675678e+00  2.06964141e+00  2.02045821e+00
 -7.95643887e-02 -1.02188

In [9]:
"""Apply the fitted model to the test data."""
y_predict=clf.predict(X_test)
print(y_predict)

[ 2.0800217   0.96538248 -0.16059841  1.82145864 -0.03279814  2.29285464
 -0.0343939   1.30521478  1.26561416  1.10541716  1.61159498  1.298586
  1.24465733  1.31690327  1.35492326 -0.10346345  1.37219354  1.26672646
  0.03680825 -0.05379679  1.83399503  1.43837459  0.11282152  0.05064127
  1.59191577 -0.10797474  0.16648933  1.17025174  0.9383186   0.10722461
  1.74762851  1.47239343 -0.0686736   1.63311215  2.00534368  1.28851961
 -0.04742887  1.59936929  1.30364572  1.10409738  1.84963128 -0.03385706
  1.83128696  0.21749054 -0.10640568]


In [10]:
"""Save the fitted model to disk with pickle."""
import pickle

# Use a context manager so the file is flushed and closed even if dump()
# raises; passing a bare open(...) to dump() leaves the handle unclosed.
with open("test_model.sav", "wb") as model_file:
    pickle.dump(clf, model_file)

In [11]:
"""Load the saved model from disk and predict on the test data."""
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that you created yourself or otherwise trust.
# The context manager closes the file handle once the model is loaded.
with open("test_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)
y_predict = loaded_model.predict(X_test)
print(y_predict)

[ 2.0800217   0.96538248 -0.16059841  1.82145864 -0.03279814  2.29285464
 -0.0343939   1.30521478  1.26561416  1.10541716  1.61159498  1.298586
  1.24465733  1.31690327  1.35492326 -0.10346345  1.37219354  1.26672646
  0.03680825 -0.05379679  1.83399503  1.43837459  0.11282152  0.05064127
  1.59191577 -0.10797474  0.16648933  1.17025174  0.9383186   0.10722461
  1.74762851  1.47239343 -0.0686736   1.63311215  2.00534368  1.28851961
 -0.04742887  1.59936929  1.30364572  1.10409738  1.84963128 -0.03385706
  1.83128696  0.21749054 -0.10640568]


# ロジスティック回帰

目的変数が2値（0か1）のデータに対して、1になる確率（0〜1の値）を予測する回帰分析。名前は「回帰」だが、主に分類に用いられる

社会科学分野の分析や、信用リスクの評価などの場面でよく用いられる

In [12]:
"""Load the data.

Uses the breast cancer dataset bundled with scikit-learn.
"""
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [13]:
# Show the explanatory variables ("data") and then the target labels ("target").
print(cancer["data"])
print(cancer["target"])

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 

In [14]:
"""Split the breast cancer data into training and test sets."""
from sklearn.model_selection import train_test_split

# 30% of the rows are held out for testing; random_state=0 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.3,
    random_state=0,
)

In [15]:
"""Create the base model (a LogisticRegression that has not been fitted yet)."""
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [16]:
# Fit the logistic regression model on the training data.
clf.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
"""Display the results: the parameters of the fitted model."""
# Regression coefficients
print(clf.coef_)
# Intercept (constant term) -- this is not an error term
print(clf.intercept_)

[[ 1.70591683  0.10754564  0.08737751 -0.0067119  -0.12453594 -0.32507542
  -0.5032975  -0.26273233 -0.24726195 -0.0209866   0.04410859  0.9619932
   0.08782903 -0.10501675 -0.00833525  0.01108758 -0.03919464 -0.0293312
  -0.03069023  0.00900565  1.42118734 -0.28970194 -0.25563653 -0.01898139
  -0.21425691 -0.99864678 -1.48179894 -0.54194712 -0.60128666 -0.10583353]]
[0.3474268]


In [19]:
"""Predicted values, tabulated next to the true test labels."""
import pandas as pd

y_predict=clf.predict(X_test)

# One row per test sample: the model's prediction and the correct answer.
result=pd.DataFrame({"predict":y_predict,"answer":y_test})

print(result)

     predict  answer
0          0       0
1          1       1
2          1       1
3          1       1
4          1       1
5          1       1
6          1       1
7          1       1
8          1       1
9          1       1
10         1       1
11         1       1
12         1       1
13         0       1
14         1       1
15         0       0
16         1       1
17         0       0
18         0       0
19         0       0
20         0       0
21         0       0
22         1       1
23         1       1
24         0       0
25         1       1
26         1       1
27         0       0
28         1       1
29         0       0
..       ...     ...
141        1       1
142        0       0
143        1       1
144        1       1
145        0       0
146        1       1
147        0       0
148        1       1
149        1       1
150        1       1
151        1       1
152        1       1
153        1       1
154        1       1
155        0       0
156        1 

# sklearnを使った回帰分析の方法
+ データを用意し、分割

    X_train, X_test, y_train, y_test

+ モデルを作成

    clf

+ モデルを訓練データで最適化

    clf.fit(X_train,y_train)

+ 最適化したモデルの性能検証

    clf.predict(X_test)