# Imports

In [None]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix

import traceback

import re


In [None]:
# Names of the HDF tables: 52 stations (S0..S51), 4 lines (L0..L3) and "all".
df_names = ["S" + str(i) for i in range(52)]
df_names.extend("L" + str(i) for i in range(4))
df_names.append("all")

# HDF5 file used to cache the processed data.
database_file = "bosch_data.h5"

# Load Data from CSV Files

The idea: the `read_csv` function in pandas is very slow, and it gets even slower if you want to engineer features on every load. The dataset is therefore processed once and stored as HDF5 tables in a file called bosch_data.h5.

The HDF5 file will have 57 tables: one for each of the 52 stations, one for each of the 4 lines and one for all stations together ("all"). The line tables and the "all" table store only the engineered features; the station tables also keep the station's own columns.

To load the data from the CSV files, three helper functions are needed. Each of these functions receives a chunk of the corresponding CSV table and the name of a station or line (the name can also be "all").
Each function then processes only the matching sub-chunk (e.g. only the columns of station S0).

* process_numeric_chunk: adds the min, max and mean value of the numerical data for the station or line. The Response is stored in the "all" table.
* process_date_chunk: adds min, max, first, last, number of NAs and duration for each line (and for "all"). For stations, the time data itself is stored.
* process_categorical_chunk: just removes empty rows and returns the data (stations only).

In [None]:
def process_numeric_chunk(chunk_numeric, name):
    """Select the numeric features of one station/line and add row statistics.

    Args:
        chunk_numeric: DataFrame with a chunk of the numeric table. It must
            contain a "Response" column when ``name == "all"``. The frame is
            not modified.
        name: Station name ("S<n>"), line name ("L<n>") or "all".

    Returns:
        A new DataFrame containing the rows that have at least one selected
        value, with the columns ``<name>_numerical_min``,
        ``<name>_numerical_max`` and ``<name>_numerical_mean`` appended.
        For line names and "all" the raw feature columns are removed; for
        "all" the "Response" column is appended as well.
    """
    # select feature columns
    if name == "all":
        pattern = r'L\d+_S\d+_F\d+.*'
    else:
        pattern = r'.*'+name+'_.*'
    sub_columns = chunk_numeric.columns[chunk_numeric.columns.str.contains(pattern)]

    # select correct subchunk and drop empty rows (new frame, input untouched)
    features = chunk_numeric[sub_columns].dropna(how="all")

    # engineer features: every statistic is computed from the raw feature
    # columns only, before any engineered column exists, so the mean is not
    # skewed by the min/max columns.
    stats = {
        name+"_numerical_min": features.min(axis=1),
        name+"_numerical_max": features.max(axis=1),
        name+"_numerical_mean": features.mean(axis=1),
    }

    # only keep engineered features for lines and all
    if name.startswith("L") or name == "all":
        is_raw = features.columns.str.contains(r'L\d+_S\d+_F\d+.*')
        features = features.loc[:, ~is_raw]

    sub_chunk = features.assign(**stats)

    # add Response Column to all (aligned on the index)
    if name == "all":
        sub_chunk = sub_chunk.assign(Response=chunk_numeric["Response"])

    return sub_chunk


def process_row(row):
    """Summarise one row of date values.

    Args:
        row: pandas Series.

    Returns:
        Tuple ``(first, last, NAs)``: the first non-null value, the last
        non-null value and the number of null entries. If the row has no
        valid value at all, ``first`` and ``last`` are NaN.
    """
    NAs = row.isnull().sum()
    first_label = row.first_valid_index()
    if first_label is None:
        # no valid entry: indexing with None would raise
        return np.nan, np.nan, NAs
    return row[first_label], row[row.last_valid_index()], NAs


def process_date_chunk(chunk_date,name):
    """Select the date columns of one station/line and engineer date features.

    Args:
        chunk_date: DataFrame with a chunk of the date table. The frame is
            not modified.
        name: Station name ("S<n>"), line name ("L<n>") or "all".

    Returns:
        A new DataFrame restricted to rows with at least one selected value.
        For station names (and when nothing is left after dropping empty
        rows) the selected raw columns are returned unchanged. Otherwise the
        raw date columns are replaced by ``<name>_date_min``,
        ``<name>_date_max``, ``<name>_first``, ``<name>_last``,
        ``<name>_NAs`` and ``<name>_time`` (max minus min).
    """
    if name == "all":
        pattern = r'L\d+_S\d+_D\d+.*'
    else:
        pattern = r'.*'+name+'_.*'
    sub_columns = chunk_date.columns[chunk_date.columns.str.contains(pattern)]

    # select correct subchunk and drop empty rows (new frame, input untouched)
    dates = chunk_date[sub_columns].dropna(how="all")
    if dates.empty or name.startswith("S"):
        return dates

    # Every remaining row has at least one valid entry, so argmax over the
    # validity mask finds the first valid column and, on the reversed mask,
    # the last valid column of each row.
    present = dates.notnull().values
    values = dates.values
    n_cols = present.shape[1]
    rows = np.arange(len(dates))
    first_pos = present.argmax(axis=1)
    last_pos = n_cols - 1 - present[:, ::-1].argmax(axis=1)

    date_min = dates.min(axis=1)
    date_max = dates.max(axis=1)
    engineered = {
        name+"_date_min": date_min,
        name+"_date_max": date_max,
        name+"_first": values[rows, first_pos],
        name+"_last": values[rows, last_pos],
        name+"_NAs": n_cols - present.sum(axis=1),
        name+"_time": date_max - date_min,
    }

    # keep only the engineered features
    is_raw = dates.columns.str.contains(r'L\d+_S\d+_D\d+.*')
    return dates.loc[:, ~is_raw].assign(**engineered)


def process_categorical_chunk(chunk_categorical, name):
    """Return the non-empty categorical rows of one station.

    Args:
        chunk_categorical: DataFrame with a chunk of the categorical table.
            The frame is not modified.
        name: Station name ("S<n>"), line name ("L<n>") or "all".

    Returns:
        For station names, a new DataFrame with the station's columns and
        only the rows that have at least one value. ``None`` for any other
        name.
    """
    if not name.startswith("S"):
        return None
    is_station = chunk_categorical.columns.str.contains(r'.*'+name+'_.*')
    sub_columns = chunk_categorical.columns[is_station]
    # dropna returns a new frame, so the caller's chunk is never touched
    return chunk_categorical[sub_columns].dropna(how="all")



In the following block the program repeatedly takes a chunk out of each dataset, processes it, joins the processed data and appends it to the corresponding HDF table. **This will take quite some time, but it makes future experiments much easier, because the data loads much faster.** On my machine it took around half an hour. The resulting file will have a size of approximately 10 GB.

In [None]:
chunksize = 10 ** 5

number_of_lines_to_load = 12 * 10 ** 5

data_dir = "input/"

# Read only the header rows to get the column names of the three tables.
header_numeric = pd.read_csv(data_dir+"train_numeric.csv.zip", nrows=1, engine="c", compression="zip").columns
header_date = pd.read_csv(data_dir+"train_date.csv.zip", nrows=1, engine="c", compression="zip").columns
header_categorical = pd.read_csv(data_dir+"train_categorical.csv.zip", nrows=1, engine="c", compression="zip").columns


def convert(error):
    """Turn a categorical cell into a float by dropping its first character.

    An empty cell becomes NaN.
    """
    return float(error[1:]) if error != "" else np.nan


# one converter per categorical column (every column except the leading "Id")
converter = {cat: convert for cat in header_categorical[1:]}

store = pd.HDFStore(database_file)
try:
    # `loaded` is the number of lines to skip; starting at 1 skips the header.
    for loaded in range(1, number_of_lines_to_load, chunksize):

        # load data
        chunk_numeric = pd.read_csv(data_dir+"train_numeric.csv.zip", skiprows=loaded, nrows=chunksize,
                                    engine="c", names=header_numeric, index_col="Id", compression="zip")
        chunk_date = pd.read_csv(data_dir+"train_date.csv.zip", skiprows=loaded, nrows=chunksize,
                                 engine="c", names=header_date, index_col="Id", compression="zip")
        # No dtype is passed: every non-Id column has a converter, and "Id"
        # must be parsed like in the numeric/date chunks so the joins on
        # "Id" below line up.
        chunk_categorical = pd.read_csv(data_dir+"train_categorical.csv.zip", skiprows=loaded, nrows=chunksize,
                                        names=header_categorical, index_col="Id", engine="c",
                                        converters=converter, compression="zip")

        for name in df_names:

            # process data
            numeric_processed = process_numeric_chunk(chunk_numeric, name)
            date_processed = process_date_chunk(chunk_date, name)
            categorical_processed = process_categorical_chunk(chunk_categorical, name)

            # join processed data (left joins on the "Id" index)
            processed_joined = numeric_processed.join(date_processed, on="Id")
            if categorical_processed is not None:
                processed_joined = processed_joined.join(categorical_processed, on="Id")

            # append to previously loaded data
            # NOTE(review): iloc[:-1] drops the last processed row of every
            # chunk; consecutive chunks do not overlap, so this looks like
            # lost data - confirm whether it is intentional.
            store.append(name, processed_joined.iloc[:-1, :])

        # progress output
        print(loaded)
except Exception:
    traceback.print_exc()
finally:
    # also runs on KeyboardInterrupt, so the HDF file is never left open
    store.close()


Next, some additional features are added. First, the difference of the time feature between each part and the previous part (time_dt) and between each part and the next part (time_idt) is computed. Then the same is done with the number of NAs.
The columns P1, ord and group_len are connected. P1 is a boolean value that describes whether the row is part of a bigger group of parts. For example, if the next three parts have the same starting time, P1 will be True for all of them. How many parts are in this group is represented by the attribute group_len, and ord describes the position of the part within its group.

In [None]:
store = pd.HDFStore(database_file)
try:
    # Differences to the previous (dt) and next (idt) row for the four line
    # tables and the "all" table.
    for name in df_names[-5:]:
        chunk = store.get(name)
        for feature in ("_time", "_NAs"):
            series = chunk[name+feature]
            chunk[name+feature+"_dt"] = series - series.shift(1)
            chunk[name+feature+"_idt"] = series - series.shift(-1)

        store.put(name, value=chunk, format="table")

    table_date = store.get("all")

    # P1: the row shares its "all_first" value with the previous or next row.
    first = table_date["all_first"]
    table_date["P1"] = np.logical_or(first == first.shift(1), first == first.shift(-1))

    # Work on an explicit copy so the assignments below never hit a slice.
    p1 = table_date[table_date["P1"]].copy()
    # ord == 1 marks a row whose predecessor in the full table is not P1.
    p1["ord"] = np.logical_and(table_date["P1"], table_date["P1"].shift(1) == 0)*1

    # Number the remaining rows consecutively: a row still at 0 that follows
    # a row numbered i becomes i + 1.
    i = 1
    while (p1["ord"] == 0).any():
        left = p1[(p1["ord"] == 0) & (p1["ord"].shift(1) == i)].copy()
        if left.empty:
            # No row can be numbered any more (e.g. the very first row of the
            # table is P1 and has no predecessor); without this guard the
            # loop would never terminate.
            break
        left["ord"] = i+1
        p1.update(left)
        i += 1

    # The row right before a row with ord == 1 carries its ord as group
    # length; back-fill spreads it to the preceding rows.
    p1["group_len"] = p1[p1["ord"].shift(-1) == 1]["ord"]
    # NOTE(review): as before, this back-fills every column of p1, not only
    # "group_len" - confirm that is intended.
    p1 = p1.bfill()

    table_date["ord"] = 0
    table_date["group_len"] = 0
    table_date.update(p1)
    table_date["P1"] = table_date["P1"].astype(bool)
    store.put("all", value=table_date, format="table")
except Exception:
    traceback.print_exc()
finally:
    # close on success as well, not only after an error
    store.close()

    

# Loading data

Once all the data is stored in the HDF file, we have to load it again. Because the whole HDF file will probably be too big for memory, we first select which tables to load, then load only the columns of interest from each table and concatenate them into one DataFrame.

In [None]:
# Columns used for training (target "Response" plus the selected features).
columns = ['Response','P1', 'ord', 'group_len', 'all_first','L3_time',
                   #'S32_mean', 'S33_mean', 'S38_mean',
                   'L3_first', 'L3_last', 'L3_time_dt', 'L3_time_idt',
                   'L3_NAs_dt', 'L3_NAs_idt',
                   'L3_S32_F3851', 'L3_S32_F3853', 'L3_S32_F3854',
                   'L1_S24_F1846', 'L3_S32_F3850', 'L1_S24_F1695', 
                   'L1_S24_F1632', 'L3_S33_F3855', 'L1_S24_F1604',
                   'L3_S29_F3407', 'L3_S33_F3865', 'L3_S38_F3952', 
                   'L1_S24_F1723']

# select which table to load from HDF: table name -> columns to request
load = {"all": []}
for col in columns:
    if col in ['Response', 'P1', 'ord', 'group_len']:
        load["all"].append(col)
    for name in df_names:
        # a column is requested from every table whose name prefix it contains
        if name + "_" in col:
            load.setdefault(name, []).append(col)

table = pd.DataFrame()
store = pd.HDFStore(database_file)
try:
    for name, wanted in load.items():
        selected = store.select(name, columns=wanted)
        if table.empty:
            table = selected
        else:
            # column-wise concat, aligned on the index
            table = pd.concat([table, selected], axis=1)
except Exception:
    traceback.print_exc()
finally:
    # also runs on KeyboardInterrupt, so the HDF file is never left open
    store.close()
table


# Training with xgboost

First we need to split our data into training, validation and test data and initialize the DMatrices for xgboost.

In [None]:
# Hold out 20% of the rows as test set ...
X_train, X_test, y_train, y_test = train_test_split(
    table.drop(["Response"], axis=1),
    table["Response"],
    test_size=0.2,
    random_state=4,
    shuffle=True,
)
# ... and 20% of the remaining rows as validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=4,
    shuffle=True,
)

# Wrap each split (features + label) in an xgboost DMatrix.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

Next we need to set the hyperparameters for xgboost and start the training.

In [None]:
# Upper bound on boosting rounds; training stops earlier when the last entry
# of the evaluation list has not improved for 5 rounds.
num_round = 65
evallist = [(dtrain, 'train'), (dval, 'eval')]
param = {
    "nthread": 8,
    "max_depth": 10,
    "eta": 0.1,
    "subsample": 0.9,
    "colsample_bytree": 0.5,
    "objective": "binary:hinge",
    "booster": "gbtree",
}
bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=5,
)

Last, the prediction is calculated, as well as the Matthews correlation coefficient and the confusion matrix.

In [None]:
# Predict on the held-out test set.
y_pred = bst.predict(dtest)
#prob = np.sort(y_pred)[-int(len(y_pred)*0.006)]
#y_pred = y_pred>prob

# Score the predictions against the true test labels.
mcc = matthews_corrcoef(y_test, y_pred)
print("Matthews Correlation Coeffinciant:", mcc)

c = confusion_matrix(y_test, y_pred)
c

In [None]:
# Plot the feature importances of the trained booster (at most 20 features).
xgb.plot_importance(bst, max_num_features=20)