In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### 1. 課題
* 特徴量ベクトルの次元数が多い
* スパース（NULL値が多い）
* 異常・正常のデータ割合が極端に不均衡（異常検知問題）
* 評価関数が特殊(マシューズ相関係数)

### 2. 解決案
* 次元の削減
    * S32のデータのみを用いた特徴量ベクトルの作成
    * 情報量の少ないランダム or 特定の値に偏った属性削除
    * 決定木ベースで重要度の高い特徴量のみ採用
    * PCA
* カテゴリ変数の数値化＋低ビット化(必要最低限の低ビットな型指定での読み込み)
* XGBoost, LightGBMなどの利用（スパースなデータに強い）※Lasso回帰によるスパース推定は？
* 学習データを分割して複数モデルを生成し、アンサンブルモデルを作成
* オンライン学習の検討（特にカテゴリー属性の部分）

### 3. ベースライン開発
* データローダー：正常データ100000件＋異常データ6000件(時系列データは除く)
* モデル：LightGBM
* チューニング：Bayesian Optimization
* 評価指標：マシューズ相関係数

### 4. 課題
* 次元数が多く情報量のあるデータが少ないため、モデルがオーバーフィッティングしてしまう。  
(学習データでは高精度が出てもテストデータでは全く精度が出ない)

### 5. TODO
* 不要な変数の削減による次元削除（S32を中心とした特徴量ベクトルの構成）
* Adversarial Validation
    - 訓練データとテストデータの分布を比較
    - テストデータに類似のバリデーションデータを作ってモデルを構築
* EDAによる不具合データと正常データ間の特徴比較＋特徴量の検討

<br>

In [None]:
# Seaborn and matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# Plotly library
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools

# Sklearn and lightgbm
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgbm

# Set some configurations
import gc
import time
import sys

# Import MCC and F1 score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import LabelEncoder

# Every categorical value that appears in the training and test data:
# 107 distinct values plus the 'NaN' placeholder used for missing cells
# (108 labels in total).
# LabelEncoder sorts its classes, so 'NaN' is encoded as 0 and the codes span
# 0-107, which fits in int8.
all_cats = ['NaN','T-18748192','T-2147481664','T-21474819','T-2147482176','T-2147482432',
            'T-21474825','T-2147482688','T-2147482816','T-2147482944','T-2147483646',
            'T-2147483647','T-2147483648','T-21474872','T1','T11141888','T1132',
            'T113776','T1152','T12','T12582912','T128','T1310','T132','T134217728',
            'T1372','T143','T145','T16','T16384','T16512','T16777216','T16777232',
            'T16777248','T16777472','T16777557','T16779428','T16793941','T178258',
            'T18436','T2','T24','T2516','T25165824','T256','T262144','T262656',
            'T268','T26808','T268435456','T3','T32','T32896','T331648','T33554432',
            'T33554448','T33554944','T36992','T393216','T3942','T4','T41944',
            'T4325376','T43968','T4718592','T48','T48576','T488','T492','T5','T512',
            'T514','T518','T52','T524288', 'T524544','T544','T55424','T56',
            'T589824','T6','T618624','T63616','T64','T6553','T65536','T678864',
            'T7','T748928','T7808','T786432','T786944','T8','T83888','T8389632',
            'T8651776','T86752','T8768','T8912896','T9','T917','T9174','T9175552',
            'T91764','T96','T96112','T97','T98']


# Fit once; this encoder is reused for both the training and the test data.
le = LabelEncoder().fit(all_cats)

def transformCategoricalDF(df, le):
    """Label-encode every cell of a categorical DataFrame and store it as int8.

    Args:
        df: DataFrame whose cells are all labels known to ``le``. Missing
            values must already be replaced by a label (the callers use the
            string 'NaN').
        le: Fitted encoder exposing ``transform(values)``, which maps a 1-D
            sequence of labels to integer codes.

    Returns:
        A new DataFrame with the same column names as ``df``, a fresh default
        RangeIndex, and the encoded values cast to int8.

    Raises:
        Whatever ``le.transform`` raises for a label it was not fitted on.
    """
    # Encode the whole table with ONE transform() call instead of one call per
    # row: the per-call overhead of the encoder dominated the runtime when it
    # was invoked for every row.
    flat_labels = df.to_numpy().ravel()  # row-major, so reshape restores the layout
    codes = np.asarray(le.transform(flat_labels)).reshape(df.shape)
    # NOTE: int8 only holds codes up to 127; the encoder must not have more classes.
    out_df = pd.DataFrame(codes, columns=df.columns).astype('int8')
    return out_df
    

In [None]:
# Columns to read from each training file (L3_S32 features); every list
# includes 'Id', and the numeric list also carries the 'Response' label.
s32_cat_cols = [
    'Id',
    'L3_S32_F3851',
    'L3_S32_F3853',
    'L3_S32_F3854',
]
s32_num_cols = [
    'Id',
    'Response',
    'L3_S32_F3850',
]
s32_date_cols = [
    'Id',
    'L3_S32_D3852',
]

In [None]:
# Memory-size display helper
def getMmSize(obj):
    """Print the size of ``obj`` in MB (10**6 bytes), rounded to two decimals.

    The size is whatever ``sys.getsizeof`` reports for the object.
    """
    size_mb = round(sys.getsizeof(obj) / 1000000, 2)
    print(str(size_mb) + ' MB')

# Compression helper for the numeric DataFrame
def compressNumDF(df):
    """Return a copy of ``df`` with every feature column cast to float16.

    The 'Id' and 'Response' columns keep their dtype and are placed first;
    all remaining columns follow in their original order as float16.
    """
    key_cols = ['Id', 'Response']
    keys = df[key_cols]
    features_f16 = df.drop(columns=key_cols).astype('float16')
    return pd.concat([keys, features_f16], axis=1)

def lgb_mcc_score(y_hat, data):
    """Custom LightGBM evaluation function: Matthews correlation coefficient.

    Args:
        y_hat: Model outputs handed over by LightGBM; they are rounded to the
            nearest integer before scoring.
        data: Dataset object exposing ``get_label()`` for the ground truth.

    Returns:
        The tuple ``('mcc', score, True)``: metric name, metric value, and the
        flag telling LightGBM that higher values are better.
    """
    labels = data.get_label()
    hard_preds = np.round(y_hat)
    score = matthews_corrcoef(labels, hard_preds)
    return 'mcc', score, True

# def lgb_mcc(preds, dtrain):
#     THRESHOLD = 0.5
#     labels = dtrain.get_label()
#     return 'mcc', matthews_corrcoef(labels, preds >= THRESHOLD)

In [None]:
# sys.exit()

In [None]:
#
# Load the training data (restricted to the Station 32 feature columns)
#
start_tm = time.time()

# Only the first 300000 rows of each file are read; categorical cells stay strings.
train_cat = pd.read_csv(
    '../input/bosch-production-line-performance/train_categorical.csv.zip',
    dtype=str,
    usecols=s32_cat_cols,
    nrows=300000,
    low_memory=False,
)
train_num = pd.read_csv(
    '../input/bosch-production-line-performance/train_numeric.csv.zip',
    nrows=300000,
    usecols=s32_num_cols,
    low_memory=False,
)
train_date = pd.read_csv(
    '../input/bosch-production-line-performance/train_date.csv.zip',
    nrows=300000,
    usecols=s32_date_cols,
    low_memory=False,
)
#result_df_reader = pd.read_csv('../input/bosch-production-line-performance/train_numeric.csv.zip',usecols=['Id','Response'], chunksize=100000)

end_tm = time.time()

# Report how long the three reads took.
elapse_time = end_tm - start_tm
print("elapsed_time:{0}".format(elapse_time), "[sec]", sep="")


In [None]:
# Preview the first five rows of the date features.
train_date.head(n=5)

In [None]:
#
# Label-encode and compress the categorical training features
#
# Drop the Id column and replace missing cells with the literal label 'NaN'.
tmp_df = train_cat.drop(columns=['Id']).fillna('NaN')
# train_cat is rebound to the int8-encoded frame (it no longer has an Id column).
train_cat = transformCategoricalDF(tmp_df, le)

In [None]:
# Join the encoded categoricals, the numeric frame (which carries Id and
# Response) and the date features side by side.
train_df = pd.concat(
    [train_cat, train_num, train_date.drop(columns=['Id'])],
    axis=1,
)
train_df.head()

In [None]:
# Show the (rows, columns) shape of the combined training frame.
train_df.shape

# モデル構築ルーチン

In [None]:
# Split the training frame into the label and the feature matrix
y = train_df['Response']
X = train_df.drop(['Id','Response'],axis=1)

lgb_params = {"objective":"binary",
              #"metric":"binary_logloss",
              "metric": "None", # must be "None" so that only the custom MCC metric is evaluated
              # Number of boosting rounds. LightGBM gives this params entry
              # precedence over the num_boost_round argument of train(), so it
              # is specified only here.
              "num_iterations":100,
              "learning_rate":0.1,
              "num_leaves":31,
              "seed":134,
              "max_depth":5,
              "min_data_in_leaf":20,
              "early_stopping_round":100,
              "verbosity": -1}
# Filled by the record_evaluation callback with the per-round metric history
evals_result = {}

# Split into training and validation data. Failures are very rare, so the
# split is stratified to keep the same failure rate in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42, stratify=y)

# Convert to LightGBM's dataset representation
lgb_train = lgbm.Dataset(X_train, y_train)
lgb_eval = lgbm.Dataset(X_test, y_test, reference=lgb_train)

model = lgbm.train(lgb_params,
            lgb_train,
            # Datasets whose metric is tracked
            valid_sets=[lgb_eval, lgb_train],
            # Names of the datasets above
            valid_names=['eval', 'train'],
            # Custom metric function (MCC)
            feval=lgb_mcc_score,
            # Record the metric history through a callback: the evals_result
            # keyword of train() was removed in LightGBM 4.0, while the
            # callback is available in both older and newer releases.
            callbacks=[lgbm.record_evaluation(evals_result)]
          )

In [None]:
from sklearn import metrics

def get_evaluate(y_test, predict, predict_score=None):
    """Compute AUC, precision and recall for a binary classifier.

    Args:
        y_test: Ground-truth binary labels.
        predict: Hard 0/1 predictions, used for precision and recall.
        predict_score: Optional continuous scores (e.g. predicted
            probabilities) used for the ROC curve. When omitted, ``predict``
            is used, which is the previous behaviour; hard labels give the
            ROC curve a single operating point, so pass the scores whenever
            they are available.

    Returns:
        Tuple ``(auc, precision, recall)``.
    """
    roc_input = predict if predict_score is None else predict_score
    fpr, tpr, _thresholds = metrics.roc_curve(y_test, roc_input)

    auc = metrics.auc(fpr, tpr)
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)

    return auc, precision, recall

predict_proba = model.predict(X_test, num_iteration=model.best_iteration)
# Label a row as 1 when its predicted probability is 0.5 or higher
predict = [0 if p < 0.5 else 1 for p in predict_proba]

# AUC is computed from the probabilities; precision/recall from the hard labels.
auc, precision, recall = get_evaluate(y_test, predict, predict_proba)

print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))

In [None]:
# Count how many validation rows were predicted as each class.
predict_df = pd.Series(predict, name='Response').to_frame()
predict_df.value_counts()

In [None]:
# Feature importances, collected into a DataFrame
cols = X.columns.tolist()                            # feature names
f_importance = np.array(model.feature_importance())  # raw importances from the model
f_importance = f_importance / f_importance.sum()     # normalise to sum to 1 (drop this line if not wanted)
df_importance = (
    pd.DataFrame({'feature': cols, 'importance': f_importance})
    .sort_values('importance', ascending=False)      # most important first
)

df_importance.head(20)

In [None]:
# Columns to read from each test file. Only the categorical file carries 'Id';
# the numeric and date files contribute one feature column each. These lists
# have their own names so that the training column lists (s32_cat_cols,
# s32_num_cols, s32_date_cols) are not overwritten, which would break
# re-running the training-data load cell.
s32_test_cat_cols = ['Id','L3_S32_F3851','L3_S32_F3853','L3_S32_F3854']
s32_test_num_cols = ['L3_S32_F3850']
s32_test_date_cols = ['L3_S32_D3852']

# Rows per chunk read from each test file.
TEST_CHUNK_SIZE = 10000
# A row is labelled 1 when its predicted probability is at or above this value.
TEST_THRESHOLD = 0.03

# Read the test data chunk by chunk, classify each chunk with the model and
# collect the results for the submission file.
df_cat_reader = pd.read_csv('../input/bosch-production-line-performance/test_categorical.csv.zip', dtype="str", usecols=s32_test_cat_cols, chunksize=TEST_CHUNK_SIZE)
df_num_reader = pd.read_csv('../input/bosch-production-line-performance/test_numeric.csv.zip',usecols=s32_test_num_cols, chunksize=TEST_CHUNK_SIZE)
df_date_reader = pd.read_csv('../input/bosch-production-line-performance/test_date.csv.zip',usecols=s32_test_date_cols, chunksize=TEST_CHUNK_SIZE)

# Per-chunk results; concatenated once after the loop instead of growing a
# DataFrame inside it (which copies all previous rows on every iteration).
chunk_results = []
start_tm = time.time()  # start of the whole prediction run

# The three files are read in lock-step: the n-th chunk of each reader covers
# the same rows, so the chunks can be joined column-wise.
for i, (df_cat, df_num, df_date) in enumerate(
        zip(df_cat_reader, df_num_reader, df_date_reader), start=1):
    chunk_start = time.time()
    if not (len(df_cat) == len(df_num) == len(df_date)):
        raise ValueError(
            "Chunk {0}: row counts differ between test files ({1}, {2}, {3})".format(
                i, len(df_cat), len(df_num), len(df_date)))

    ids = df_cat['Id'].reset_index(drop=True)
    df_a = df_cat.drop(['Id'],axis=1).fillna('NaN')
    df_b = transformCategoricalDF(df_a, le)

    # Same column order as the training features: categorical, numeric, date.
    # Indexes are reset so that the chunks align with the encoded frame.
    test_df = pd.concat([df_b,
                         df_num.reset_index(drop=True),
                         df_date.reset_index(drop=True)],axis=1)
    predict_proba = model.predict(test_df, num_iteration=model.best_iteration)
    predict = np.where(predict_proba < TEST_THRESHOLD, 0, 1)
    chunk_results.append(pd.DataFrame({'Id': ids, 'Response': predict}))

    print("Iteration {0}: {1} rows, elapsed time:{2}".format(
        i, len(ids), time.time() - chunk_start))

# Build the output frame with a clean 0..n-1 index
if chunk_results:
    df_test_out = pd.concat(chunk_results, ignore_index=True)
else:
    df_test_out = pd.DataFrame(columns=['Id', 'Response'])

# Save the result as the submission CSV file
df_test_out.to_csv("submission.csv", index=False)

#del df_test_out
#gc.collect()

In [None]:
# Report the seconds elapsed since start_tm was last set, then stop execution here.
end_tm = time.time()
elapse_time = end_tm - start_tm
print("elapsed_time:{0}".format(elapse_time), "[sec]", sep="")

sys.exit()

In [None]:
# Count how many test rows were assigned to each Response value.
df_test_out.loc[:, 'Response'].value_counts()

In [None]:
# Report the seconds elapsed since start_tm was last set, then stop execution here.
end_tm = time.time()
elapse_time = end_tm - start_tm
print("elapsed_time:{0}".format(elapse_time), "[sec]", sep="")

sys.exit()

In [None]:
import matplotlib.pyplot as plt

# --- Plot the recorded training history ---
fig, ax = plt.subplots(figsize=(8,6))
train_metric = evals_result['train']['mcc']
eval_metric = evals_result['eval']['mcc']
ax.plot(train_metric, label='train mcc')
ax.plot(eval_metric, label='eval mcc')
ax.grid()
ax.legend()
ax.set_xlabel('rounds')
ax.set_ylabel('mcc')
plt.show()

In [None]:
# Stop execution here. Presumably this keeps a full run from reaching the slow
# test-set encoding cell below -- confirm before removing.
sys.exit()

## テストデータのラベルエンコーディングデータ生成＋保存
テストデータのラベルエンコーディングに時間がかかるため、エンコード処理したデータを利用する形にする。(処理時間：約120分)

In [None]:
# Read the test categorical file 10000 rows at a time and label-encode every
# chunk; the encoded chunks are combined and saved after the loop.
df_reader = pd.read_csv('../input/bosch-production-line-performance/test_categorical.csv.zip', dtype="str", chunksize=10000)
# Encoded chunks; concatenated once after the loop instead of growing a
# DataFrame inside it (which copies all previous rows on every iteration).
encoded_chunks = []
for i, df in enumerate(df_reader, start=1):
    start_tm=time.time()
    tmp_df = df['Id'].reset_index(drop=True)
    # Drop Id and replace missing cells with the literal label 'NaN'
    df_a = df.drop(['Id'],axis=1).fillna('NaN')
    df_b = transformCategoricalDF(df_a, le)
    # Put the Id column back in front of the encoded features
    encoded_chunks.append(pd.concat([tmp_df,df_b],axis=1))
    end_tm=time.time()
    print("Iter{0}: {1} sec".format(i, round(end_tm-start_tm,2)), end=", ")
    del tmp_df, df_a, df_b
    # if i==2: break;

# Build the output frame with a clean 0..n-1 index
if encoded_chunks:
    df_test_out = pd.concat(encoded_chunks, ignore_index=True)
else:
    df_test_out = pd.DataFrame()


# Save the encoded test data as a zipped CSV (about 2.6GB uncompressed)
#df_test_out.to_csv("test_cat_encoded.csv", index=False)
df_test_out.to_csv("test_cat_encoded.csv.zip", index=False, compression="zip")