In [1]:
# Data wrangling, processing, and analysis libraries
import numpy as np
# NOTE: this binds the name `random` to numpy.random, not the stdlib `random` module
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; sns.set()
%matplotlib inline

# Machine learning libraries
import sklearn
from sklearn.model_selection import train_test_split

# Display floats to three decimal places
%precision 3

'%.3f'

In [2]:
# pandas display limits: show up to 500 rows and 50 columns
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50

# matplotlib defaults: figure size plus base and tick-label font sizes
plt.rcParams.update({
    'figure.figsize': [18, 10],
    'font.size': 16,         # base font size
    'xtick.labelsize': 14,   # x-axis tick label size
    'ytick.labelsize': 14,   # y-axis tick label size
})

In [3]:
# Load seaborn's 'iris' dataset and preview its first five rows
iris = sns.load_dataset('iris')
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Encode the species label as an integer class flag
def _species_to_flag(species):
    """Return 0 for 'setosa', 1 for 'versicolor', and 2 for any other value."""
    if species == 'setosa':
        return 0
    if species == 'versicolor':
        return 1
    return 2


iris['fin_flg'] = iris['species'].map(_species_to_flag)

In [5]:
# Logistic regression on the raw (unscaled) features
from sklearn.linear_model import LogisticRegression

feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = iris[feature_cols]
y = iris['fin_flg']

# Hold out half of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# fit() returns the estimator itself, so construction and training can be chained
model = LogisticRegression().fit(X_train, y_train)

# Accuracy on the training and test splits
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print('正解率(train):{:.3f}'.format(train_acc))
print('正解率(test):{:.3f}'.format(test_acc))

正解率(train):0.973
正解率(test):0.933


In [6]:
# Check whether standardizing the features improves prediction accuracy
from sklearn.preprocessing import StandardScaler

X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = iris['fin_flg']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# Learn the scaling statistics from the training rows only,
# then apply that same transform to both splits
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

model = LogisticRegression().fit(X_train_std, y_train)

# Accuracy on the standardized training and test splits
train_acc_std = model.score(X_train_std, y_train)
test_acc_std = model.score(X_test_std, y_test)
print('正解率(train):{:.3f}'.format(train_acc_std))
print('正解率(test):{:.3f}'.format(test_acc_std))

正解率(train):0.973
正解率(test):0.920


In [7]:
# K-fold cross-validation of logistic regression on standardized features
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

X = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = iris['fin_flg']

# The scaler is a pipeline step, so cross_val_score re-fits it on each fold's
# training portion only. Fitting it on all of X beforehand would let the
# held-out rows influence the scaling statistics (data leakage).
model = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
scores = cross_val_score(model, X, y, cv=5)

# Per-fold accuracies, followed by their mean and standard deviation
print('Cross validation scores: {}'.format(scores))
print('Cross validation scores: {:.3f}+-{:.3f}'.format(scores.mean(), scores.std()))

Cross validation scores: [0.967 1.    0.933 0.9   1.   ]
Cross validation scores: 0.960+-0.039
