#機械学習（分類）

###各種インポート

In [12]:
!pip install japanize_matplotlib
%matplotlib inline

import matplotlib.pyplot as plt
import japanize_matplotlib
import pandas as pd
from sklearn.covariance import MinCovDet as MCD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler as SS

from sklearn.linear_model import SGDClassifier as SGDC
from sklearn.svm import LinearSVC as LSVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC

import warnings
import pickle

#警告文の非表示
warnings.simplefilter("ignore")



##関数の定義

###外れ値の除外

In [13]:
def outliers(df):
    """Drop rows that are multivariate outliers.

    Only numeric columns with at least 5 distinct values are examined.
    Non-numeric columns cannot be box-plotted or passed to the covariance
    estimator, and low-cardinality columns (flags, dummy variables) are
    skipped. For the examined columns a box plot is drawn per column, a
    MinCovDet model is fitted, and rows whose Mahalanobis distance lies
    outside the 1.5 * IQR fences of the distance distribution are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no column qualifies for the check, otherwise a
        new frame without the outlier rows (all original columns are kept).
    """
    # Restrict the check to numeric columns; string columns would make both
    # plt.boxplot and MinCovDet fail.
    numeric = df.select_dtypes("number")

    # Skip columns with too few distinct values.
    checked_cols = [name for name in numeric.columns
                    if len(numeric[name].unique()) >= 5]
    checked = numeric[checked_cols]

    # Nothing to check: return the input unchanged.
    if len(checked.columns) == 0:
        print("外れ値除外対象なし")
        return df

    # Box plot of every examined column.
    for name in checked.columns:
        plt.boxplot(checked[name])
        plt.title(name)
        plt.show()

    # Distance of every row from the robust centre of the data.
    mcd = MCD(random_state=0).fit(checked)
    distance = pd.Series(mcd.mahalanobis(checked), index=checked.index)

    # IQR fences on the distances.
    q1 = distance.quantile(0.25)
    q3 = distance.quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr
    outlier_rows = distance[(distance > upper) | (distance < lower)]

    return df.drop(outlier_rows.index, axis=0)

###標準化

In [14]:
def scaler(x,t):
    """Split the data 80/20 into training and validation parts and standardize the features.

    The scaler is fitted on the training features only and then applied to
    both the training and the validation features.

    Returns (fitted scaler, scaled training features, scaled validation
    features, training targets, validation targets).
    """
    # Hold out 20% for validation with a fixed seed.
    parts = train_test_split(x, t, test_size=0.2, random_state=0)
    raw_train, raw_val, y_train, y_val = parts

    # Fit on the training part only, then transform both parts.
    sc_x_model = SS().fit(raw_train)
    sc_x_train, sc_x_val = [sc_x_model.transform(part) for part in (raw_train, raw_val)]

    return sc_x_model, sc_x_train, sc_x_val, y_train, y_val

###SGDClassifierモデル

In [15]:
def sgdc_model(sc_x_train,sc_x_val,y_train,y_val):
    """Fit an SGDClassifier (hinge loss) and score it.

    ``random_state`` is fixed, like for the other estimators in this
    notebook, so that repeated runs give the same model.

    Returns (fitted model, training score, validation score), where the
    scores come from the estimator's ``score`` method.
    """
    p_model = SGDC(loss="hinge", random_state=0).fit(sc_x_train,y_train)
    p_train_score = p_model.score(sc_x_train,y_train)
    p_val_score = p_model.score(sc_x_val,y_val)

    return p_model,p_train_score,p_val_score

###LinearSVCモデル

In [16]:
def lsvc_model(sc_x_train,sc_x_val,y_train,y_val):
    """Tune C of a LinearSVC on the validation score and return the best model.

    C is scanned upwards from 0.01 in steps of 0.01 (at most up to 100).
    The scan stops early once the validation score has failed to improve
    more than ``patience`` times in a row.

    The model that actually achieved the best validation score is kept and
    returned; nothing is refitted after the scan.

    Returns (best model, its training score, its validation score).
    """
    patience = 5
    best = None   # (model, train score, validation score) of the best candidate
    stale = 0     # consecutive candidates without improvement

    for step in range(1,10001):
        candidate = LSVC(loss="hinge", C=step/100, class_weight="balanced", random_state=0).fit(sc_x_train,y_train)
        val_score = candidate.score(sc_x_val,y_val)

        if best is None or val_score > best[2]:
            # Strictly better validation score: remember this candidate.
            best = (candidate, candidate.score(sc_x_train,y_train), val_score)
            stale = 0
        else:
            stale += 1
            if stale > patience:
                break

    return best

###KNeighborsClassifierモデル

In [17]:
def knc_model(sc_x_train,sc_x_val,y_train,y_val):
    """Tune n_neighbors of a distance-weighted KNeighborsClassifier.

    k is scanned upwards from 1. It never exceeds the number of training
    samples, because a larger k cannot be queried. The scan stops early
    once the validation score has failed to improve more than ``patience``
    times in a row.

    The model that actually achieved the best validation score is kept and
    returned; nothing is refitted after the scan.

    Returns (best model, its training score, its validation score).
    Raises ValueError when there are no training samples.
    """
    patience = 5
    max_k = min(10000, len(sc_x_train))
    if max_k < 1:
        raise ValueError("knc_model needs at least one training sample")

    best = None   # (model, train score, validation score) of the best candidate
    stale = 0     # consecutive candidates without improvement

    for k in range(1, max_k + 1):
        candidate = KNC(n_neighbors=k, weights="distance").fit(sc_x_train,y_train)
        val_score = candidate.score(sc_x_val,y_val)

        if best is None or val_score > best[2]:
            # Strictly better validation score: remember this candidate.
            best = (candidate, candidate.score(sc_x_train,y_train), val_score)
            stale = 0
        else:
            stale += 1
            if stale > patience:
                break

    return best

###SVCモデル

In [18]:
def svc_model(sc_x_train,sc_x_val,y_train,y_val):
    """Tune gamma and C of an RBF-kernel SVC and return the best model.

    gamma (outer loop) and C (inner loop) are both scanned upwards from
    0.01 in steps of 0.01. One best validation score is tracked across the
    whole search:

    * the C scan for a given gamma stops once more than ``patience``
      consecutive candidates failed to beat the best score so far;
    * the gamma scan stops once more than ``patience`` consecutive gamma
      values produced no new best candidate.

    The model that actually achieved the best validation score is kept and
    returned; nothing is refitted after the search.

    Returns (best model, its training score, its validation score).
    """
    patience = 5
    best = None       # (model, train score, validation score) of the best candidate
    gamma_stale = 0   # consecutive gamma values without a new best candidate

    for gamma_step in range(1,10001):
        improved = False
        c_stale = 0   # consecutive C values without a new best candidate

        for c_step in range(1,10001):
            candidate = SVC(kernel="rbf", gamma=gamma_step/100, C=c_step/100, class_weight="balanced", random_state=0).fit(sc_x_train,y_train)
            val_score = candidate.score(sc_x_val,y_val)

            if best is None or val_score > best[2]:
                # Strictly better validation score: remember this candidate.
                best = (candidate, candidate.score(sc_x_train,y_train), val_score)
                improved = True
                c_stale = 0
            else:
                c_stale += 1
                if c_stale > patience:
                    break

        if improved:
            gamma_stale = 0
        else:
            gamma_stale += 1
            if gamma_stale > patience:
                break

    return best

###RandomForestClassifierモデル

In [19]:
def rfc_model(sc_x_train,sc_x_val,y_train,y_val):
    """Tune n_estimators of a RandomForestClassifier and return the best model.

    The number of trees is scanned upwards from 1. The scan stops early
    once the validation score has failed to improve more than ``patience``
    times in a row.

    ``n_jobs`` is not searched: it only sets how many jobs run in parallel,
    so it is fixed to -1 (use all processors).

    The model that actually achieved the best validation score is kept and
    returned; nothing is refitted after the scan.

    Returns (best model, its training score, its validation score).
    """
    patience = 5
    best = None   # (model, train score, validation score) of the best candidate
    stale = 0     # consecutive candidates without improvement

    for n_trees in range(1,10001):
        candidate = RFC(criterion="entropy", n_estimators=n_trees, random_state=1, n_jobs=-1, class_weight="balanced").fit(sc_x_train,y_train)
        val_score = candidate.score(sc_x_val,y_val)

        if best is None or val_score > best[2]:
            # Strictly better validation score: remember this candidate.
            best = (candidate, candidate.score(sc_x_train,y_train), val_score)
            stale = 0
        else:
            stale += 1
            if stale > patience:
                break

    return best

###最適モデル選択

In [20]:
def model_selection(sc_x_train,sc_x_val,y_train,y_val):
    """Train every candidate classifier and keep the one with the best validation score.

    The candidates are tried in a fixed order (SGD, LinearSVC, k-NN, SVC,
    random forest). A later candidate replaces the current choice only when
    its validation score is strictly higher. One segment of the progress
    bar is printed at the start, after the k-NN candidate and after the
    random-forest candidate.

    Returns (selected model, its training score, its validation score).
    """
    segment = "▮▮▮▮▮▮|"
    print(segment,end="")

    model,train_score,val_score = -1,-1,-1

    # (trainer, whether a progress segment is printed after it)
    stages = (
        (sgdc_model, False),
        (lsvc_model, False),
        (knc_model, True),
        (svc_model, False),
        (rfc_model, True),
    )

    for trainer, show_progress in stages:
        cand_model, cand_train, cand_val = trainer(sc_x_train,sc_x_val,y_train,y_val)
        if cand_val > val_score:
            # Strictly better validation score: switch to this candidate.
            model, train_score, val_score = cand_model, cand_train, cand_val
        if show_progress:
            print(segment,end="")

    return model,train_score,val_score

###最適処理

In [21]:
def optimal(df,target):
    """Run the full pipeline on ``df``: split and scale, select the best model, report it.

    ``target`` names the label column; every other column is used as a
    feature. A textual progress bar and the scores of the selected model
    are printed.

    Returns (selected model, fitted feature scaler).
    """
    print("0%  20%  40%  60%  80%  100%\n|",end="")

    # Features are all columns except the label; the label is kept as a one-column frame.
    features = df.drop(target,axis = 1)
    labels = df[[target]]

    # Split into training/validation data and standardize.
    sc_x_model,sc_x_train,sc_x_val,y_train,y_val = scaler(features,labels)

    # Pick the candidate with the best validation score.
    model,train_score,val_score = model_selection(sc_x_train,sc_x_val,y_train,y_val)

    print("▮▮▮▮▮▮|")
    report = "選択モデル:{}\n訓練データ正解率:{}\t検証データ正解率:{}"
    print(report.format(model,round(train_score,3),round(val_score,3)))

    return model,sc_x_model

##実行文

###データフレームの作成

In [22]:
#Read the CSV file into a DataFrame
df = pd.read_csv("mushrooms.csv",encoding = "cp932")

#Name of the label (target) column
target = "class"

df.head(3)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,yes,pungent,free,close,narrow,black,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,yes,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,yes,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows


###ダミー変数化

In [23]:
#Collect the qualitative (object dtype) columns, leaving out the label column
dum_col = df.select_dtypes("object").columns
dum_col = dum_col.drop(target)
if len(dum_col) > 0:
    #Build dummy variables for those columns, append them to df and drop the original columns
    df2 = pd.get_dummies(df[dum_col],drop_first=False)
    df2 = pd.concat([df,df2],axis=1)
    df2 = df2.drop(columns=dum_col)
else:
    #No qualitative column: df2 is only another name for df
    print("ダミー変数化対象なし")
    df2 = df

df2.head(3)

Unnamed: 0,class,cap-shape_bell,cap-shape_conical,cap-shape_convex,cap-shape_flat,cap-shape_knobbed,cap-shape_sunken,cap-surface_fibrous,cap-surface_groovesmooth,cap-surface_scaly,...,population_scattered,population_several,population_solitary,habitat_grasses,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods
0,poisonous,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,edible,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,edible,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


###欠損値処理

In [24]:
#Work on a copy so that filling values in df3 does not modify df2
df3 = df2.copy()

#Number of missing values per column
non = df3.isnull().sum()
print(non)

#Feature columns that contain missing values. The label column is left out:
#it is not imputed here and is never used as an input of the imputation models.
non_col = non[non > 0].index.drop(target, errors="ignore")
non_f = len(non_col) > 0

if non_f:
    #Imputation models are trained on complete rows only, without the label column
    non_df = df3.dropna().drop(columns=target)

    #Remove outliers from the rows used to train the imputation models
    non_df = outliers(non_df)

    #One imputation model per column with missing values
    for name in non_col:
        #Best model (and its scaler) for predicting this column from the other features
        model,sc_x_model = optimal(non_df,name)

        #Rows to fill, restricted to the feature columns the model was trained on
        missing = df3[name].isnull()
        non_x = df3.loc[missing, non_df.columns.drop(name)]

        #NOTE(review): rows that also miss a value in another, not yet imputed
        #column still contain NaN here; confirm the selected model accepts that.
        sc_non_x = sc_x_model.transform(non_x)

        #predict returns one value per row; use all of them, in row order
        df3.loc[missing,name] = model.predict(sc_non_x)
else:
    #No missing values: df3 is an unchanged copy of df2
    print("欠損値処理対象なし")

df3.head(3)

class                0
cap-shape_bell       0
cap-shape_conical    0
cap-shape_convex     0
cap-shape_flat       0
                    ..
habitat_meadows      0
habitat_paths        0
habitat_urban        0
habitat_waste        0
habitat_woods        0
Length: 118, dtype: int64
欠損値処理対象なし


Unnamed: 0,class,cap-shape_bell,cap-shape_conical,cap-shape_convex,cap-shape_flat,cap-shape_knobbed,cap-shape_sunken,cap-surface_fibrous,cap-surface_groovesmooth,cap-surface_scaly,...,population_scattered,population_several,population_solitary,habitat_grasses,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods
0,poisonous,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,edible,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,edible,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


###訓練・検証データとテストデータに分割

In [25]:
#Hold out 20% of the rows as test data, with a fixed random seed
train_val,test = train_test_split(df3,test_size=0.2,random_state=0)

###外れ値の除外

In [26]:
#Outlier removal is applied to the training/validation data only
train_val2 = outliers(train_val)

外れ値除外対象なし


###学習・評価

In [None]:
#Select and fit the best model on the training/validation data
model,sc_x_model = optimal(train_val2,target)

#Features and label of the test data
x_col = test.drop(target,axis = 1).columns
x_test = test[x_col]
t_test = test[[target]]

#Standardize the test features with the scaler returned by optimal
sc_x_test = sc_x_model.transform(x_test)

#Score of the selected model on the test data (printed as the accuracy)
test_score= model.score(sc_x_test,t_test)
print("テストデータ正解率:{}".format(round(test_score,3)))

0%  20%  40%  60%  80%  100%
|▮▮▮▮▮▮|▮▮▮▮▮▮|

###モデルの保存

In [None]:
#Pickle the selected classifier
with open("classifier.pkl","wb") as f:
    pickle.dump(model,f)

#Pickle the fitted scaler (this file name has no extension)
with open("sc_x_model","wb") as f:
    pickle.dump(sc_x_model,f)