# 3. ランダムフォレストによる教師あり学習

- このnotebookはsrc/script/fit_trees.pyを分解したもの
- コードの行数・工数ともに、モデリングの大半はそのデータ準備で占められる。データが出来上がってしまえば、ライブラリを用いて、ほんの数行でモデルを作成できる。

## データ準備

In [1]:
# src/data/db.py より
from contextlib import contextmanager
from sqlalchemy import MetaData, create_engine
from sqlalchemy.orm import sessionmaker

# DB接続用のエンジン作成
SQLITE_DB_PATH = "../data/raw/suzuki.db"
engine = create_engine(f'sqlite:///{SQLITE_DB_PATH}')

# セッションを作成。エンジンと結びつける。
Session = sessionmaker()
Session.configure(bind=engine)

# セッションをwith句で使えるように、コンテクストとして定義。
# エラーの場合にはDBにはコミットせずにロールバックし、操作が終わったら自動的にセッションを閉じる。
@contextmanager
def session_scope():
    """Yield a fresh ``Session`` wrapped in a single transaction.

    The transaction is committed once the ``with`` body finishes normally.
    If the body (or the commit itself) raises, the transaction is rolled
    back and the exception is re-raised. The session is closed in all cases.
    """
    db_session = Session()
    try:
        yield db_session
        db_session.commit()
    except BaseException:
        # Same reach as a bare ``except``: undo the pending work for any
        # failure (including KeyboardInterrupt), then propagate it.
        db_session.rollback()
        raise
    finally:
        db_session.close()

In [2]:
# src/data/models.py　より

from sqlalchemy import Table

# Create the metadata container and reflect the table definitions found in
# the database behind ``engine``.
metadata = MetaData()
metadata.reflect(engine)

# Bind each table to a module-level name so it can be used in queries
# (the feature-selection cell also looks these names up via ``globals()``).
# ``autoload_with`` alone requests reflection; the separate ``autoload=True``
# flag is deprecated since SQLAlchemy 1.4 and rejected by 2.0, so it is omitted.
FTIRBaseInfo = Table('ftir_base_info', metadata, autoload_with=engine)
TYQ0110 = Table('TYQ0110', metadata, autoload_with=engine)
TYQ0120 = Table('TYQ0120', metadata, autoload_with=engine)
TYQ0160 = Table('TYQ0160', metadata, autoload_with=engine)
TYQ0210 = Table('TYQ0210', metadata, autoload_with=engine)
TYQ0220 = Table('TYQ0220', metadata, autoload_with=engine)
TYQ0650 = Table('TYQ0650', metadata, autoload_with=engine)
TYQ0810 = Table('TYQ0810', metadata, autoload_with=engine)

In [3]:
from sqlalchemy.sql import and_, func
import pandas as pd

# 全件についての部品情報を FPCR から取得したテーブルを作成する。

# Build a table holding, for every report, the causal-part information
# recorded on its FPCR.
with session_scope() as session:
    query = (
        session.query(
            FTIRBaseInfo.c.F_ID,
            TYQ0110.c.G_FPCR_ID,
            # Keep only the first five characters of the part number.
            func.substr(TYQ0110.c.G_CAUSAL_PARTS_NO, 1, 5).label('G_CAUSAL_PARTS_NO'),
            TYQ0110.c.G_CAUSAL_PARTS_NAME_PL,
        )
        # TYQ0210 links a report (G_ID) to an FPCR (G_FPCR_ID).
        .join(TYQ0210, FTIRBaseInfo.c.F_ID == TYQ0210.c.G_ID)
        .filter(TYQ0110.c.G_FPCR_ID == TYQ0210.c.G_FPCR_ID)
        # Restrict to domestic (JP) four-wheel data only.
        .filter(
            and_(
                FTIRBaseInfo.c.F_REPORT_COUNTRY_CODE == 'JP',
                FTIRBaseInfo.c.F_PRODUCT_SPECIFICATION == '1',
            )
        )
    )

    # Run the statement on the session's bound engine and load it as a DataFrame.
    df = pd.read_sql(query.statement, query.session.bind)

In [4]:
INTERIM_PATH = "../data/interim/"
df.to_feather(f"{INTERIM_PATH}df_parts.feather")

In [5]:
df = pd.read_feather(f"{INTERIM_PATH}df_parts.feather")

In [6]:
df.head()

Unnamed: 0,F_ID,G_FPCR_ID,G_CAUSAL_PARTS_NO,G_CAUSAL_PARTS_NAME_PL
0,JP201705B04583,FPJP201611B00767,95200,COMPRESSOR ASSY ﾊﾟﾚｯﾄ
1,JP201707B02836,FPJP201601B03270,17521,"ﾍﾞﾙﾄ,ｳｵ-ﾀﾎﾟﾝﾌﾟ"
2,JP201705B00780,FPJP201601B03270,17521,"ﾍﾞﾙﾄ,ｳｵ-ﾀﾎﾟﾝﾌﾟ"
3,JP201711B00181,FPJP201604B03521,53401,"ｼﾘﾝﾀﾞｱﾂｼ,ﾎｲ-ﾙ"
4,JP201705B00786,FPJP201502B01871,55101,キヤリパ，フロントブレ?キ，ライト


In [7]:
df.shape

(217790, 4)

In [8]:
# For each G_CAUSAL_PARTS_NO keep the G_CAUSAL_PARTS_NAME_PL that occurs most
# often, then build a ``parts`` column as "<parts no> <parts name>".
df_parts_no = (
    df.loc[:, ['G_CAUSAL_PARTS_NO', 'G_CAUSAL_PARTS_NAME_PL']]
    .groupby('G_CAUSAL_PARTS_NO')
    # value_counts() sorts by frequency, so its first index entry is the mode.
    .agg(lambda names: names.value_counts().index[0])
    .reset_index()
    .assign(
        parts=lambda grouped: (
            grouped['G_CAUSAL_PARTS_NO'] + ' ' + grouped['G_CAUSAL_PARTS_NAME_PL']
        )
    )
    .drop('G_CAUSAL_PARTS_NAME_PL', axis=1)
)

In [9]:
df_parts_no[:10]

Unnamed: 0,G_CAUSAL_PARTS_NO,parts
0,,
1,*,*
2,***,***
3,*****,*****
4,*.*,*.*
5,-,-
6,---,---
7,-54M2,-54M2 ホイ?ル，アルミ（１５Ｘ４１／２Ｊ）（シルバ?）
8,.,.
9,000-0,000-0 不明


In [10]:
# Attach the ``parts`` label built above to every (F_ID, G_FPCR_ID) row, then
# combine G_FPCR_ID and ``parts`` into an ``fpcr`` column ("<G_FPCR_ID>-<parts>").
df_fpcr = (
    df.loc[:, ['F_ID', 'G_FPCR_ID', 'G_CAUSAL_PARTS_NO']]
    # G_CAUSAL_PARTS_NO is the only column the two frames share.
    .merge(df_parts_no, on='G_CAUSAL_PARTS_NO', how='inner')
    .assign(fpcr=lambda merged: merged['G_FPCR_ID'] + '-' + merged['parts'])
)

In [11]:
df_fpcr.head()

Unnamed: 0,F_ID,G_FPCR_ID,G_CAUSAL_PARTS_NO,parts,fpcr
0,JP201705B04583,FPJP201611B00767,95200,95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ,FPJP201611B00767-95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ
1,JP200506B51396,FPJP200407B80019,95200,95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ,FPJP200407B80019-95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ
2,JP200507B50866,FPJP200407B80019,95200,95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ,FPJP200407B80019-95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ
3,JP200508B50961,FPJP200407B80019,95200,95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ,FPJP200407B80019-95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ
4,JP200509B50483,FPJP200407B80019,95200,95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ,FPJP200407B80019-95200 ｺﾝﾌﾟﾚﾂｻｱﾂｼ


In [12]:
# interim/FPCR.csv
df_fpcr.to_csv(f"{INTERIM_PATH}FPCR.csv", index=False)

In [13]:
# Columns to add as model features, grouped by the name of the table
# variable they are read from (the query cell resolves each key via globals()).
col_features = {'FTIRBaseInfo': [
        'F_ID',
        'F_SELLING_MODEL_SIGN',
        'F_FCOK',
        'F_MILEAGE',
        'F_MILEAGE_UNIT',
        'F_INFO_TRANS_NO',
        'F_INFO_FUEL',
        'F_INFO_INJECTION',
        'F_INFO_ENGINE_CHARGER',
        'F_INFO_VVT_FLG',
        'F_INFO_LEAN_BURN_FLG',
        'F_INFO_HYBRID_FLG',
        'F_INFO_DRIVE',
        'F_INFO_GEARBOX',
    ],
    'TYQ0210': [
        # Currently disabled; no TYQ0210 column is used as a feature.
        # 'G_CAUSAL_PARTS_NO'
    ]}

# Trouble (symptom) codes that define the scope of the model
target_trouble_code = ['84', '7V', '7W', '7X', '7Y', '7Z', '8A', '8B']

In [14]:
from itertools import chain
from sqlalchemy.sql import or_

with session_scope() as session:
    # Select every column listed in col_features.
    # Note: chain.from_iterable flattens the per-table column lists into one
    # flat sequence of columns.
    # Note: globals() returns a dict mapping global variable names to their
    # values; it is used to look up each Table object by its variable name.
    query = (
        session
        .query(*chain.from_iterable(
            [getattr(globals()[table_name].c, col_name) for col_name in col_names]
            for table_name, col_names in col_features.items()
        ))
    )

    # Restrict to domestic (JP) four-wheel data only.
    query = query.filter(
        and_(
            FTIRBaseInfo.c.F_REPORT_COUNTRY_CODE == 'JP',
            FTIRBaseInfo.c.F_PRODUCT_SPECIFICATION == '1'
        )
    )

    # Inner-join TYQ0110 through TYQ0210, then keep only the rows whose
    # G_TROUBLE_COMPLAINT_CODE is one of the target trouble codes.
    # ``in_`` expresses the same condition as OR-ing one equality test per
    # code. For an empty code list it matches no rows, whereas an empty OR
    # would apply no filter at all.
    query = (
        query
        .join(TYQ0210, FTIRBaseInfo.c.F_ID == TYQ0210.c.G_ID)
        .join(TYQ0110, TYQ0110.c.G_FPCR_ID == TYQ0210.c.G_FPCR_ID)
        .filter(TYQ0110.c.G_TROUBLE_COMPLAINT_CODE.in_(target_trouble_code))
    )

    # Run the statement on the session's bound engine and load it as a DataFrame.
    df_additional = pd.read_sql(query.statement, query.session.bind)

In [15]:
df_additional.head()

Unnamed: 0,F_ID,F_SELLING_MODEL_SIGN,F_FCOK,F_MILEAGE,F_MILEAGE_UNIT,F_INFO_TRANS_NO,F_INFO_FUEL,F_INFO_INJECTION,F_INFO_ENGINE_CHARGER,F_INFO_VVT_FLG,F_INFO_LEAN_BURN_FLG,F_INFO_HYBRID_FLG,F_INFO_DRIVE,F_INFO_GEARBOX
0,JP201705B02705,MH55S-WFZB-NJ,20170408.0,6,1,74F08009,,,,,,,1.0,3.0
1,JP201705B02694,MB36S-MBXB-J,20160720.0,10779,1,67F19144,,,,,,,1.0,3.0
2,JP200410B50459,DA62V-EPAA-M6,,6314,1,,1.0,1.0,1.0,,,,1.0,2.0
3,JP200410B50462,DA62W-EPSJ-M4,,23264,1,,1.0,1.0,2.0,,,,2.0,2.0
4,JP200411B50140,RB21S-BFXA-S2,,30284,1,,,,,,,,,


In [16]:
df_additional.shape

(30395, 14)

In [17]:
import numpy as np

# Parse F_MILEAGE as numbers; values that cannot be parsed become NaN.
f_mileage = pd.to_numeric(df_additional.F_MILEAGE, errors='coerce')
df_additional = df_additional.assign(
    # Convert F_FCOK to numeric (unparseable values become NaN)
    F_FCOK=pd.to_numeric(df_additional.F_FCOK, errors='coerce'),

    # Convert F_MILEAGE to kilometers
    # NOTE(review): the unit is compared against the string '2'; if the
    # column is stored as an integer this never matches — confirm the dtype.
    F_MILEAGE=np.where(
        # Indicates F_MILEAGE is in Miles
        df_additional.F_MILEAGE_UNIT == '2',
        # Make it in Kilometers
        f_mileage * 1.609,
        f_mileage),

    # Keep only the first five characters (str[0:5]) of F_SELLING_MODEL_SIGN
    F_SELLING_MODEL_SIGN=df_additional.F_SELLING_MODEL_SIGN.str[0:5],
)

# Only applies when G_CAUSAL_PARTS_NO was selected as a feature column.
if 'G_CAUSAL_PARTS_NO' in df_additional.columns:
    df_additional = df_additional.assign(
        # Keep only the first five characters (str[0:5]) of G_CAUSAL_PARTS_NO
        G_CAUSAL_PARTS_NO=df_additional.G_CAUSAL_PARTS_NO.str[0:5],
    )

# F_MILEAGE_UNIT is no longer needed once the mileage is converted, so drop it
df_additional = df_additional.drop('F_MILEAGE_UNIT', axis=1)

In [18]:
df_additional.head()

Unnamed: 0,F_ID,F_SELLING_MODEL_SIGN,F_FCOK,F_MILEAGE,F_INFO_TRANS_NO,F_INFO_FUEL,F_INFO_INJECTION,F_INFO_ENGINE_CHARGER,F_INFO_VVT_FLG,F_INFO_LEAN_BURN_FLG,F_INFO_HYBRID_FLG,F_INFO_DRIVE,F_INFO_GEARBOX
0,JP201705B02705,MH55S,20170408.0,6.0,74F08009,,,,,,,1.0,3.0
1,JP201705B02694,MB36S,20160720.0,10779.0,67F19144,,,,,,,1.0,3.0
2,JP200410B50459,DA62V,,6314.0,,1.0,1.0,1.0,,,,1.0,2.0
3,JP200410B50462,DA62W,,23264.0,,1.0,1.0,2.0,,,,2.0,2.0
4,JP200411B50140,RB21S,,30284.0,,,,,,,,,


In [19]:
!ls -lh ../data/processed

合計 748M
-rw-rw-r-- 1 user01 user01 378M  5月  9 23:48 df_doc_topics.csv
-rw-rw-r-- 1 user01 user01 178M  5月  9 21:55 df_doc_topics.xlsx
-rw-rw-r-- 1 user01 user01  96M  5月  9 23:45 df_topic_ranking.csv
-rw-rw-r-- 1 user01 user01  23M  5月  9 21:50 df_topic_ranking.xlsx
-rw-rw-r-- 1 user01 user01  35M  5月  9 23:49 df_topic_words.csv
-rw-rw-r-- 1 user01 user01  19M  5月  9 21:51 df_topic_words.xlsx
-rw-rw-r-- 1 user01 user01  22M  5月  9 08:14 lda_model.pkl


In [20]:
# LDAモデルで計算した、文章のトピック量一覧
df_topic = pd.read_csv("../data/processed/df_doc_topics.csv")

In [21]:
df_topic.head(2)

Unnamed: 0,id,doc,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_50,topic_51,topic_52,topic_53,topic_54,topic_55,topic_56,topic_57,topic_58,topic_59
0,JP200410B50001,高速走行中(トンネルの中)、車両前部より白煙(あるいは水蒸気)シートを起こしたところ炎を確認...,0.061556,0.008286,0.00634,0.006153,0.008337,0.00558,0.033003,0.029619,...,0.05547,0.031133,0.009908,0.0047,0.010765,0.008374,0.004746,0.005841,0.006615,0.005253
1,JP200410B50002,アイドリングの時にキンキンと高い音がする $ 診断中 $,0.026353,0.014173,0.010844,0.04999,0.014259,0.009544,0.016982,0.050659,...,0.015944,0.013784,0.016946,0.008039,0.018412,0.053788,0.008118,0.00999,0.011315,0.008984


In [22]:
# Name of the column the model should predict.
target_col = "parts"

# Inner-join the per-document topic weights with the additional features and
# with the target column, all keyed on the report id.
df_topic_features = (
    df_topic
    .merge(df_additional, how='inner', left_on='id', right_on='F_ID')
    .merge(
        df_fpcr.loc[:, ['F_ID', target_col]],
        how='inner',
        left_on='id',
        right_on='F_ID',
    )
    # Both merges bring in an F_ID column (suffixed _x/_y); ``id`` already
    # carries the same key, so remove them.
    .drop(['F_ID_x', 'F_ID_y'], axis=1)
)

In [23]:
df_topic_features.head(3)

Unnamed: 0,id,doc,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,F_INFO_TRANS_NO,F_INFO_FUEL,F_INFO_INJECTION,F_INFO_ENGINE_CHARGER,F_INFO_VVT_FLG,F_INFO_LEAN_BURN_FLG,F_INFO_HYBRID_FLG,F_INFO_DRIVE,F_INFO_GEARBOX,parts
0,JP200410B50004,ディスクロータが錆びて走行時キーキー音がする $ ディスクロータが錆びて当たり面が凸凹になっ...,0.021307,0.011459,0.008768,0.008509,0.011529,0.039627,0.013731,0.00905,...,,1,1,1,,,,2,2,
1,JP200410B50010,A/Cアイドルアップ時のM/Tギア打音に対応し#90ギアオイル使用したところ、1stの抜けが...,0.012519,0.006733,0.005151,0.042496,0.006774,0.004534,0.008068,0.024066,...,,1,1,1,,,,1,1,
2,JP200410B50060,"1-2シフト時、3rdギア鳴り $ 6月末2,206km走行時、歯打ち音苦情があり「仕事で使...",0.029602,0.006374,0.004877,0.004733,0.006413,0.004293,0.007638,0.182533,...,,1,1,1,,,,1,1,


In [24]:
df_additional.head()

Unnamed: 0,F_ID,F_SELLING_MODEL_SIGN,F_FCOK,F_MILEAGE,F_INFO_TRANS_NO,F_INFO_FUEL,F_INFO_INJECTION,F_INFO_ENGINE_CHARGER,F_INFO_VVT_FLG,F_INFO_LEAN_BURN_FLG,F_INFO_HYBRID_FLG,F_INFO_DRIVE,F_INFO_GEARBOX
0,JP201705B02705,MH55S,20170408.0,6.0,74F08009,,,,,,,1.0,3.0
1,JP201705B02694,MB36S,20160720.0,10779.0,67F19144,,,,,,,1.0,3.0
2,JP200410B50459,DA62V,,6314.0,,1.0,1.0,1.0,,,,1.0,2.0
3,JP200410B50462,DA62W,,23264.0,,1.0,1.0,2.0,,,,2.0,2.0
4,JP200411B50140,RB21S,,30284.0,,,,,,,,,


In [25]:
# Among the added features, the ones that are categorical variables.
# NOTE(review): F_INFO_TRANS_NO looks like a per-unit serial number (e.g.
# 74F08009 in the preview) — one-hot encoding it may account for most of the
# resulting dummy columns; confirm it is meant to be a feature.
categorical_cols = [
    'F_SELLING_MODEL_SIGN',
    'F_INFO_TRANS_NO',
    'F_INFO_FUEL',
    'F_INFO_INJECTION',
    'F_INFO_ENGINE_CHARGER',
    'F_INFO_VVT_FLG',
    'F_INFO_LEAN_BURN_FLG',
    'F_INFO_HYBRID_FLG',
    'F_INFO_DRIVE',
    'F_INFO_GEARBOX',
]

# One-hot encode the categorical variables; the new columns are named
# "<column>=<value>".
df_dummy_features = pd.get_dummies(df_additional, columns=categorical_cols, prefix_sep='=')

In [26]:
# ダミー化されたDataFrame。
df_dummy_features.head()

Unnamed: 0,F_ID,F_FCOK,F_MILEAGE,F_SELLING_MODEL_SIGN=,F_SELLING_MODEL_SIGN=AARF,F_SELLING_MODEL_SIGN=AAXJ,F_SELLING_MODEL_SIGN=ABEL,F_SELLING_MODEL_SIGN=ABEL-,F_SELLING_MODEL_SIGN=ABGF-,F_SELLING_MODEL_SIGN=ABGL,...,F_INFO_HYBRID_FLG=1,F_INFO_HYBRID_FLG=2,F_INFO_DRIVE=,F_INFO_DRIVE=1,F_INFO_DRIVE=2,F_INFO_GEARBOX=,F_INFO_GEARBOX=1,F_INFO_GEARBOX=2,F_INFO_GEARBOX=3,F_INFO_GEARBOX=4
0,JP201705B02705,20170408.0,6.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,JP201705B02694,20160720.0,10779.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,JP200410B50459,,6314.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,JP200410B50462,,23264.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,JP200411B50140,,30284.0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [27]:
for x in [df_additional, df_dummy_features]: print(x.shape)

(30395, 13)
(30395, 23194)


In [37]:
# Inner-join the topic weights with the one-hot encoded additional features
# and with the target column, all keyed on the report id.
df_topic_dummy_features = (
    df_topic
    .merge(df_dummy_features, how='inner', left_on='id', right_on='F_ID')
    .merge(
        df_fpcr.loc[:, ['F_ID', target_col]],
        how='inner',
        left_on='id',
        right_on='F_ID',
    )
    # ``id`` already carries the key, so the suffixed F_ID columns are redundant.
    .drop(['F_ID_x', 'F_ID_y'], axis=1)
    # Discard every row that still contains a missing value in any column.
    .dropna()
)

In [38]:
df_topic_dummy_features.head(2)

Unnamed: 0,id,doc,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,F_INFO_HYBRID_FLG=2,F_INFO_DRIVE=,F_INFO_DRIVE=1,F_INFO_DRIVE=2,F_INFO_GEARBOX=,F_INFO_GEARBOX=1,F_INFO_GEARBOX=2,F_INFO_GEARBOX=3,F_INFO_GEARBOX=4,parts
103,JP200412B50114,ハンドルを回すとこすれる音がする。 $ コラムより音が出ていた $,0.023563,0.04796,0.009696,0.00941,0.048037,0.008534,0.015185,0.010008,...,0,0,1,0,0,1,0,0,0,
108,JP200412B50177,未舗装路走行時に、左フロントから異音がする。 $ 左フロントストラットアッシから音が出ている...,0.054971,0.011837,0.009057,0.00879,0.044871,0.007971,0.014184,0.009349,...,0,0,1,0,0,0,1,0,0,41601.0


In [39]:
df_topic_dummy_features.parts.value_counts()[:10]

83401 ﾚｷﾞﾕﾚ-ﾀｱﾂｼ ﾌﾛﾝﾄｳｲﾝﾄﾞ　ﾗｲﾄ    1326
17521 ベルト，ウオ?タポンプ                 1151
27610                              929
85103 ﾌﾚ-ﾑｱﾂｼ                      917
84910 ﾊﾞﾂｸﾙｱﾂｼ ﾌﾛﾝﾄﾍﾞﾙﾄ            859
14271 ｽﾁﾌﾅ ｴｷｿﾞ-ｽﾄﾏﾆﾎ-ﾙﾄ           670
21000 ﾄﾗﾝｽﾐﾂｼﾖﾝｱﾂｼ CVT             506
48580 ﾎﾞﾂｸｽｱﾂｼ ｽﾃｱﾘﾝｸﾞｷ            454
86606 ｱｼﾞﾔｽﾀｱﾂｼ                    435
99999                              428
Name: parts, dtype: int64

G_CAUSAL_PARTS_NAME_PLが空で、G_CAUSAL_PARTS_NOのみ値がある(ex. 27610など)ものも多い

上記のようにして、分析用のデータテーブル(df_topic_dummy_features)ができた

In [40]:
df_topic_features.to_csv(f"{INTERIM_PATH}learning.csv", index=False)

In [41]:
df_topic_dummy_features.reset_index(drop=True).to_feather(f"{INTERIM_PATH}learning_dummy.feather")

In [42]:
import pandas as pd
INTERIM_PATH="../data/interim/"

In [43]:
df_topic_dummy_features = pd.read_feather(f"{INTERIM_PATH}learning_dummy.feather")

In [54]:
df_topic_dummy_features.head(2)

Unnamed: 0,id,doc,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,F_INFO_HYBRID_FLG=2,F_INFO_DRIVE=,F_INFO_DRIVE=1,F_INFO_DRIVE=2,F_INFO_GEARBOX=,F_INFO_GEARBOX=1,F_INFO_GEARBOX=2,F_INFO_GEARBOX=3,F_INFO_GEARBOX=4,parts
11565,JP201409B02394,助手席にすわるとキシキシと音が出る。 $ $,0.026353,0.014173,0.010844,0.010524,0.053725,0.009544,0.016982,0.011193,...,0,0,1,0,0,0,0,1,0,86606 ｱｼﾞﾔｽﾀｱﾂｼ
12272,JP201501B00421,左後からゴーと音がする。何故異音が発生してしまったのか、お客様から報告を求められています。 ...,0.022009,0.011837,0.009057,0.00879,0.011909,0.007971,0.014184,0.009349,...,0,1,0,0,1,0,0,0,0,46860


In [48]:
!ls -lh ../data/interim

合計 1.7G
-rw-rw-r-- 1 user01 user01  26M  5月 10 00:42 FPCR.csv
-rw-r--r-- 1 user01 user01  39M  5月  9 21:33 df.feather
-rw-r--r-- 1 user01 user01  36M  5月  9 21:36 df_filterd.feather
-rw-r--r-- 1 user01 user01  40M  5月  9 21:36 df_filterd_joined.feather
-rw-rw-r-- 1 user01 user01 949M  5月  9 21:38 df_parse_filterd.csv
-rw-r--r-- 1 user01 user01  17M  5月 10 00:42 df_parts.feather
-rw-rw-r-- 1 user01 user01  38M  5月 10 00:48 learning.csv
-rw-r--r-- 1 user01 user01 508M  5月 10 00:48 learning_dummy.feather


## モデリング

In [49]:
# Hyperparameters for the random forest model.
# n_estimators: number of decision trees.
# class_weight: "balanced" adjusts the class weights used during training to
#             compensate for the imbalance in sample counts between classes.
# random_state: random seed. Unless this is fixed, results differ slightly on every run.
# max_features: fraction of the features that may be considered at each split of
#             each tree. That fraction of features is re-drawn at random every time.
# max_depth: maximum depth of each decision tree. With 7, up to 7 successive splits are possible.
# oob_score: when True, accuracy is computed from the samples that were not used to
#             build each tree. It can be used much like accuracy on validation data.
model_params = {
    'n_estimators': 500,
    'class_weight': 'balanced',
    'random_state': 0,
    'max_features': .50,
    'max_depth': 7,
    'oob_score': True}

In [56]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Train on a reproducible random half of the rows.
# NOTE(review): this overwrites df_topic_dummy_features in place, so re-running
# the cell without reloading the data halves the sample again.
df_topic_dummy_features = df_topic_dummy_features.sample(frac=0.5, random_state=0)

# Drop the id, doc and target columns so that only feature columns remain
df_X = df_topic_dummy_features.drop(['id', 'doc', target_col], axis=1)

# The target (parts) is categorical, so encode it as integers the model can handle.
# pd.factorize returns (codes, uniques): levels_topics holds the integer code of
# each row and labels_topics the distinct target values those codes index into.
levels_topics, labels_topics = pd.factorize(df_topic_dummy_features[target_col].values)

# n_jobs=-1 trains the trees in parallel on all available CPU cores.
rf = RandomForestClassifier(**model_params, n_jobs=-1)
rf.fit(X=df_X, y=levels_topics)

CPU times: user 9min 28s, sys: 23.6 s, total: 9min 51s
Wall time: 3min 35s


In [58]:
# 推測モデルを保存
import pickle

with open("../data/processed/rf.pkl", "wb") as f:
    pickle.dump(rf, f)

In [59]:
rf.oob_score_

0.029496402877697843

In [67]:
1 / len(np.unique(levels_topics))

0.0027247956403269754