In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import gc
import sys
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score,precision_score,plot_precision_recall_curve,confusion_matrix,classification_report,matthews_corrcoef

import warnings
warnings.filterwarnings("ignore")

# FEATURE ENGINEERING

### Numeric features are selected from the XGBoost classifier's feature-importance ranking

In [None]:
# Stream the numeric training file in 50,000-row chunks instead of loading it whole.
numeric_chunk = pd.read_csv("../input/bosch-production-line-performance/train_numeric.csv.zip",chunksize=50000)

model = xgb.XGBClassifier(n_estimators=100)
ctr=0
for chunk in numeric_chunk:
    # A plain fit() starts from scratch, so without `xgb_model` every chunk
    # but the last would be thrown away. Handing the current booster back in
    # continues training; each chunk therefore appends another
    # `n_estimators` trees to the ensemble.
    previous_booster = model.get_booster() if ctr else None
    model.fit(chunk.drop(['Id','Response'],axis=1),chunk['Response'],xgb_model=previous_booster)
    ctr+=1
    print(ctr)

fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model,ax =ax,max_num_features=40)
np.set_printoptions(suppress = True)
imp = model.feature_importances_
name =model.get_booster().feature_names
# Cell output: the 100 feature names with the highest importance, best first.
pd.DataFrame({'feat':name,'impo':imp}).sort_values(by='impo',ascending=False)['feat'].head(100).values

In [None]:
# Hard-coded numeric feature subset, bracketed by 'Id' (merge key) and
# 'Response' (training label). Presumably pasted from the output of the
# feature-importance cell — verify if that cell is re-run.
# NOTE: this list object is placed inside `cols` further down, and a later
# cell removes the last element of cols[1] in place before reading the test
# files, so 'Response' must remain the final entry.
num_feats = ['Id',
       'L1_S24_F1632', 'L0_S21_F497', 'L0_S21_F477', 'L1_S24_F1122',
       'L1_S24_F1490', 'L1_S24_F1763', 'L3_S33_F3855', 'L1_S24_F1594',
       'L3_S29_F3360', 'L0_S0_F18', 'L3_S29_F3482', 'L0_S11_F322',
       'L3_S32_F3850', 'L0_S0_F8', 'L3_S29_F3327', 'L0_S6_F122',
       'L0_S2_F48', 'L3_S29_F3336', 'L0_S22_F586', 'L1_S24_F1068',
       'L0_S12_F350', 'L0_S0_F20', 'L0_S15_F406', 'L0_S11_F310',
       'L1_S24_F1647', 'L0_S6_F132', 'L0_S22_F596', 'L0_S11_F314',
       'L3_S35_F3896', 'L3_S29_F3436', 'L1_S24_F839', 'L0_S5_F114',
       'L0_S18_F449', 'L0_S18_F439', 'L0_S11_F294', 'L2_S26_F3077',
       'L3_S30_F3514', 'L3_S30_F3564', 'L3_S29_F3430', 'L3_S38_F3952',
       'L0_S3_F84', 'L3_S29_F3330', 'L0_S9_F200', 'L0_S3_F68',
       'L0_S3_F72', 'L1_S24_F1723', 'L2_S27_F3144', 'L2_S26_F3051',
       'L0_S0_F4', 'L0_S11_F318', 'L3_S30_F3634', 'L0_S2_F36',
       'L3_S43_F4090', 'L0_S1_F28', 'L3_S36_F3918', 'L0_S23_F655',
       'L0_S10_F224', 'L0_S21_F522', 'L0_S3_F96', 'L0_S1_F24',
       'L0_S11_F302', 'L3_S30_F3749', 'L2_S26_F3062', 'L2_S26_F3047',
       'L3_S36_F3920', 'L3_S30_F3809', 'L0_S10_F244', 'L0_S9_F165',
       'L0_S15_F397', 'L0_S16_F426', 'L1_S25_F2126', 'L1_S24_F1798',
       'L3_S30_F3784', 'L3_S30_F3794', 'L0_S2_F64', 'L1_S24_F1818',
       'L3_S50_F4243', 'L0_S12_F334', 'L3_S30_F3799', 'L3_S30_F3764',
       'L3_S30_F3624', 'L0_S22_F581', 'L1_S24_F1381', 'L3_S29_F3382',
       'L3_S29_F3376', 'L3_S29_F3479', 'L0_S12_F340', 'L0_S3_F80',
       'L0_S0_F2', 'L3_S29_F3379', 'L1_S24_F1808', 'L0_S12_F352',
       'L0_S9_F180', 'L0_S21_F527', 'L3_S33_F3857', 'L0_S9_F160',
       'L0_S22_F611', 'L0_S0_F12', 'L3_S30_F3519', 'L0_S11_F326',
 'Response']


In [None]:
# NOTE(review): none of the read_csv calls in this notebook reference this
# name; they pass literal chunksize values (50000 / 100000) instead.
chunksize = 500000

In [None]:
# Use only the first 10,000 rows of the training date file to decide which
# date column represents each station.
date = pd.read_csv('../input/bosch-production-line-performance/train_date.csv.zip', nrows=10000)

# Non-null count per date column ('Id' excluded).
length = date.drop(columns='Id').count()

# One row per date column, most-populated columns first.
date_cols = length.reset_index().sort_values(by=0, ascending=False)

# Station token = second '_'-separated part of the column name.
date_cols['station'] = date_cols['index'].str.split('_').str[1]

# Unique station tokens, sorted as strings (lexicographic, not numeric, order).
stations = sorted(date_cols['station'].unique().tolist())

# Keep the first (most populated) column of every station, so each station
# appears exactly once; the resulting list keeps the count-descending order.
date_cols = date_cols.drop_duplicates('station', keep='first')['index'].tolist()

In [None]:
def date_state(data, location, columns=None):
    """Append station-visit flags plus first/last visited station for one date file.

    Reads ``location`` in 50,000-row chunks. Each selected date column is
    renamed to its station token (second ``_``-separated part of the name)
    and turned into a 0/1 flag that is 1 where the value is ``>= 0``.
    Two columns are added per row:

    * ``start_station`` - number of the lowest-numbered station with flag 1
    * ``end_station``   - number of the highest-numbered station with flag 1

    Both stay at -1 for rows without any flagged station.

    Parameters
    ----------
    data : DataFrame or None
        Previously accumulated result to extend; ``None`` starts a new one.
    location : str
        Path of the date CSV to read.
    columns : list of str, optional
        Date columns to load, expected to hold one column per station.
        Defaults to the notebook-level ``date_cols``.

    Returns
    -------
    DataFrame or None
        ``data`` followed by the processed rows (``data`` itself if the file
        yields no chunks).
    """
    if columns is None:
        columns = date_cols

    frames = [] if data is None else [data]
    reader = pd.read_csv(location, usecols=['Id'] + list(columns), chunksize=50000, low_memory=False)
    for chunk in reader:
        # read_csv hands columns back in file order, not in `usecols` order,
        # so positional relabelling can attach a station name to the wrong
        # column. Derive every label from the column's own name instead.
        chunk = chunk.rename(columns=lambda c: c if c == 'Id' else c.split('_')[1])
        # Walk the stations by number so "first" and "last" mean what they say.
        visit_order = sorted((c for c in chunk.columns if c != 'Id'), key=lambda s: int(s[1:]))

        chunk['start_station'] = -1
        chunk['end_station'] = -1

        for s in visit_order:
            station_no = int(s[1:])
            visited = chunk[s] >= 0
            # binary flag in place of the raw value
            chunk[s] = 1 * visited
            # start: only set while still undecided, so the first hit wins
            chunk.loc[visited & (chunk['start_station'] == -1), 'start_station'] = station_no
            # end: overwritten on every hit, so the last hit wins
            chunk.loc[visited, 'end_station'] = station_no
        frames.append(chunk)

    # Concatenate once instead of re-copying the accumulated frame per chunk.
    return pd.concat(frames) if frames else data

In [None]:
train_date_loc = '../input/bosch-production-line-performance/train_date.csv.zip'
test_date_loc = '../input/bosch-production-line-performance/test_date.csv.zip'

# Process the training file first, then extend the same frame with the test file.
dataset = date_state(None, train_date_loc)
dataset = date_state(dataset, test_date_loc)
gc.collect()

# Only the key and the first/last station are carried forward.
data = dataset.loc[:, ['Id','start_station','end_station']]
# Column list reused by the min/max date pass.
usefuldatefeatures = ['Id', *date_cols]

In [None]:
def minmax(minmaxfeatures, location, useful):
    """Append per-row earliest/latest date features computed from one date file.

    Reads ``location`` in 50,000-row chunks, restricted to ``useful``
    (which must contain ``'Id'``), and produces for every row:

    * ``mindate`` / ``maxdate`` - smallest / largest value across the date
      columns (NaN when the row has no value at all)
    * ``min_time_station`` / ``max_time_station`` - station number parsed
      from the name of the column holding that value, or -1 when the row
      has no value at all

    Parameters
    ----------
    minmaxfeatures : DataFrame or None
        Previously accumulated result to extend; ``None`` starts a new one.
    location : str
        Path of the date CSV to read.
    useful : list of str
        Columns to load: ``'Id'`` plus the date columns.

    Returns
    -------
    DataFrame or None
        ``minmaxfeatures`` followed by the new rows (``minmaxfeatures``
        itself if the file yields no chunks).
    """
    def station_number(column):
        # 'L0_S12_D345' -> 12
        return int(column.split('_')[1][1:])

    frames = [] if minmaxfeatures is None else [minmaxfeatures]
    for chunk in pd.read_csv(location ,usecols=useful,chunksize=50000,low_memory=False):
        dates = chunk.drop(columns='Id')
        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = dates.min(axis=1).values
        df_mindate_chunk['maxdate'] = dates.max(axis=1).values

        # Rows without any date keep -1. idxmin/idxmax are only evaluated on
        # rows that do have a value, which avoids relying on how pandas
        # treats all-NaN rows and on an identity comparison with np.nan.
        df_mindate_chunk['min_time_station'] = -1
        df_mindate_chunk['max_time_station'] = -1
        has_date = dates.notna().any(axis=1)
        if has_date.any():
            observed = dates[has_date]
            df_mindate_chunk.loc[has_date, 'min_time_station'] = observed.idxmin(axis=1).map(station_number).values
            df_mindate_chunk.loc[has_date, 'max_time_station'] = observed.idxmax(axis=1).map(station_number).values
        frames.append(df_mindate_chunk)

    # Concatenate once instead of re-copying the accumulated frame per chunk.
    return pd.concat(frames) if frames else minmaxfeatures

In [None]:
minmax_dataset = None
minmax_dataset = minmax(minmax_dataset, train_date_loc, usefuldatefeatures)
minmaxfeatures = minmax(minmax_dataset, test_date_loc, usefuldatefeatures)

# Order train and test rows together by earliest date (ties by Id), then
# record the Id gap to the next row (sign-flipped) and to the previous row.
minmaxfeatures.sort_values(by=['mindate', 'Id'], inplace=True)
minmaxfeatures['min_Id_rev'] = -minmaxfeatures.Id.diff().shift(-1)
minmaxfeatures['min_Id'] = minmaxfeatures.Id.diff()

# Per-file column selections: [date file columns, numeric file columns].
# num_feats is copied because a later cell edits cols[1] in place; sharing
# the same list object would silently change num_feats as well.
cols = [['Id']+date_cols,list(num_feats)]

In [None]:
def concat_files(files, columns=None, base_dir='../input/bosch-production-line-performance/'):
    """Load selected columns from several CSVs and inner-join them on ``Id``.

    Parameters
    ----------
    files : list of str
        File names, resolved as ``base_dir + name``.
    columns : list of list of str, optional
        ``columns[i]`` is the ``usecols`` selection for ``files[i]``; every
        selection must include ``'Id'``. Defaults to the notebook-level
        ``cols``.
    base_dir : str, optional
        Prefix prepended verbatim to each file name, so it must end with a
        path separator.

    Returns
    -------
    DataFrame or None
        The merged frame, or ``None`` when ``files`` is empty.
    """
    if columns is None:
        columns = cols

    data = None
    for i, f in enumerate(files):
        reader = pd.read_csv(base_dir + f,usecols=columns[i],chunksize=100000,low_memory=False)
        # Gather the chunks and concatenate once; growing the frame chunk by
        # chunk re-copies everything read so far on every iteration.
        subset = pd.concat(list(reader))

        # `subset` is local and pd.merge returns a new frame, so no defensive
        # copies of these large frames are needed.
        data = subset if data is None else pd.merge(data, subset, on="Id")

    return data

In [None]:
trainfiles = ['train_date.csv.zip','train_numeric.csv.zip']
testfiles = ['test_date.csv.zip','test_numeric.csv.zip']

traindata = concat_files(trainfiles)

# 'Response' is not a column of the test files, so it has to be left out of
# the numeric selection while they are read. The selection is swapped
# temporarily and restored afterwards: deleting the last element in place
# would make a second run of this cell load the training data without
# 'Response' and then drop a real feature.
train_numeric_cols = cols[1]
cols[1] = [c for c in train_numeric_cols if c != 'Response']
try:
    testdata = concat_files(testfiles)
finally:
    cols[1] = train_numeric_cols

gc.collect()

In [None]:
# Join the min/max date features and the first/last station onto both splits.
# Both joins use the default inner merge on 'Id'.
traindata = traindata.merge(minmaxfeatures, on='Id').merge(data, on='Id')
testdata = testdata.merge(minmaxfeatures, on='Id').merge(data, on='Id')

In [None]:
# The engineered frames are merged into traindata/testdata; release them.
del minmaxfeatures
del data
gc.collect()

In [None]:
# Replace every remaining NaN with 0, in place, in both splits.
for frame in (traindata, testdata):
    frame.fillna(value=0, inplace=True)

In [None]:
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : number
        True-positive, true-negative, false-positive and false-negative
        counts.

    Returns
    -------
    float
        ``(tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn))``, or 0.0 when
        the denominator is zero.
    """
    # Work in floating point: with fixed-width integer counts (e.g. numpy
    # int64) the four-way product below can overflow and wrap around.
    tp, tn, fp, fn = float(tp), float(tn), float(fp), float(fn)
    num = tp * tn - fp * fn
    den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if den == 0:
        return 0.0
    return num / np.sqrt(den)

In [None]:
def eval_mcc(y_true, y_prob):
    """Best Matthews correlation coefficient over all cut points of a score ranking.

    Samples are ordered by ascending ``y_prob``. For every prefix of that
    ordering the prefix is treated as predicted negative and the rest as
    predicted positive, and the MCC of that split is computed. The largest
    value is returned, never less than 0.0.

    Parameters
    ----------
    y_true : array-like
        Labels; entries equal to 1 count as positive, everything else as
        negative. Lists and pandas Series are accepted and always indexed
        by position.
    y_prob : array-like
        Scores, same length as ``y_true``.

    Returns
    -------
    float
        Best MCC found; 0.0 for empty input.
    """
    # Positional numpy arrays: a Series with a non-default index would
    # otherwise be indexed by label when reordered below.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    n = y_true.shape[0]
    if n == 0:
        return 0.0

    y_true_sort = y_true[np.argsort(y_prob)]
    is_pos = y_true_sort == 1

    nump = 1.0 * np.sum(y_true)
    numn = n - nump

    # Counts after the i+1 lowest-scored samples have been moved to the
    # predicted-negative side.
    fn = np.cumsum(is_pos, dtype=float)
    tn = np.cumsum(~is_pos, dtype=float)
    tp = nump - fn
    fp = numn - tn

    num = tp * tn - fp * fn
    den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)

    # Cut points with a zero denominator score 0.
    scores = np.zeros(n)
    np.divide(num, np.sqrt(den), out=scores, where=den > 0)

    return max(0.0, float(scores.max()))

In [None]:
def mcc_eval(y_prob, dtrain):
    """Return ``('MCC', best_mcc)`` for predictions scored against ``dtrain``'s labels.

    ``dtrain`` only needs a ``get_label()`` method; the labels it returns
    and ``y_prob`` are passed to ``eval_mcc``.
    NOTE(review): the signature looks like an XGBoost custom evaluation
    function; no cell shown here passes it to a training call - confirm.
    """
    labels = dtrain.get_label()
    return 'MCC', eval_mcc(labels, y_prob)

In [None]:
# Print floats in fixed-point rather than scientific notation.
# The feature-importance cell sets the same option; repeated here so this
# part of the notebook does not depend on that cell having run.
np.set_printoptions(suppress=True)

In [None]:
response = traindata['Response']

# Keep a reproducible random half of the Response == 0 rows.
# (Original note: "1176868 rows" - presumably the count before sampling; verify.)
total2 = traindata.loc[response == 0].sample(frac=0.5, random_state=12)

# Add every Response == 1 row back in.
# (Original note: "6879 rows" - presumably the number of positives; verify.)
total = pd.concat([total2, traindata.loc[response == 1]])

In [None]:
# The previous version dropped only 'Response' and 'Id'; the two Id-gap
# columns are now excluded from the features as well.
non_feature_cols = ['Response', 'Id', 'min_Id_rev', 'min_Id']
X = total.drop(columns=non_feature_cols)
y = total['Response']

In [None]:
# 80/20 hold-out split, stratified on the label so both parts keep the same
# class ratio; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# MODELING

In [None]:
# 500-tree random forest on all cores; note this rebinds `model`, which
# earlier held the XGBoost classifier.
model = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    verbose=1,
    random_state=11,
).fit(X_train, y_train)

# Hard class predictions for the hold-out split.
pred = model.predict(X_test)

In [None]:
# classification_report takes (y_true, y_pred). With the arguments swapped,
# precision and recall trade places in the table and the support column
# counts predictions instead of true labels.
print(classification_report(y_test,pred))
print(matthews_corrcoef(y_test,pred))
# Cell output: rows are true classes, columns are predicted classes.
confusion_matrix(y_test,pred)

In [None]:
# Recall first, then precision, for the positive class on the hold-out split.
for score_fn in (recall_score, precision_score):
    print(score_fn(y_test, pred))

In [None]:
# Precision-recall curve of the fitted model on the hold-out split.
# NOTE(review): scikit-learn deprecated plot_precision_recall_curve in 1.0
# and removed it in 1.2 in favour of PrecisionRecallDisplay.from_estimator -
# confirm against the installed version before upgrading.
plot_precision_recall_curve(model,X_test,y_test)

In [None]:
# Predict on the same feature set the model was trained on (an earlier
# version dropped only 'Id').
# 'Response' is absent on a fresh run but is added to `testdata` by the
# submission cell; dropping it when present keeps this cell re-runnable
# instead of feeding the model an extra column.
test = model.predict(
    testdata.drop(['Id', 'min_Id_rev', 'min_Id'],axis=1).drop(columns='Response',errors='ignore')
)

In [None]:
# Store the predictions on the test frame and write the two-column
# submission file, without the index.
testdata['Response'] = test
submission_cols = ['Id','Response']
testdata[submission_cols].to_csv("submit.csv",index=False)