In [5]:
import numpy as np
import pandas as pd
import gc
import lightgbm as lgb
from sklearn.model_selection import train_test_split


#print("Read Done")
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df):
    """Downcast the columns of *df* in place to smaller dtypes and return it.

    Object columns are converted to ``category``.  Signed and unsigned
    integer columns are cast to the narrowest integer type of the same
    signedness that holds the column's min and max.  Float columns are cast
    to float16 or float32 when their range fits, else float64.  Every other
    dtype (bool, datetime, categorical, pandas extension types, ...) is left
    untouched.

    Memory usage before and after is printed in MB.

    Args:
        df: the DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    # NOTE(review): float16 keeps only about 3 significant decimal digits, so
    # this trades precision for memory -- confirm that is acceptable for
    # every float feature fed to the model.
    narrow_float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int/uint/float columns are downcast.  Previously
        # uint and bool columns fell through to the float branch and were
        # silently turned into float16.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in narrow_float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df
 
def featureModify(isTrain, numRows):
    """Read the train or test CSV, shrink its dtypes, and encode the label.

    Args:
        isTrain: when truthy, './input/fatalities/train.csv' is read and its
            'event' column is mapped from the letters A/B/C/D to 0/1/2/3;
            otherwise './input/fatalities/test.csv' is read and returned
            without any label handling.
        numRows: forwarded to ``pd.read_csv`` as ``nrows`` (None reads all).

    Returns:
        The loaded DataFrame after ``reduce_mem_usage`` has been applied.
    """
    csv_path = './input/fatalities/train.csv' if isTrain else './input/fatalities/test.csv'
    frame = reduce_mem_usage(pd.read_csv(csv_path, nrows=numRows))

    if not isTrain:
        return frame

    # The label is encoded only after the dtype reduction, as before.
    event_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    frame['event'] = frame['event'].map(event_codes)
    return frame
   
# Load every training row (numRows=None), then separate the 'event' label
# from the feature columns and report what is left.
train = featureModify(True, None)
y = train['event']
train = train.drop(columns='event')
print(train.shape, train.columns, sep='\n')

# Hold out the final 25% of rows for validation; shuffle=False keeps the
# original row order, so the split is a plain head/tail cut.
train, train_test, y, y_test = train_test_split(
    train, y, test_size=0.25, shuffle=False
)

# Keep handles on the raw validation frame and both label series before the
# working names are rebound to lgb.Dataset objects or deleted below.
train_test_df, y_df, y_test_df = train_test, y, y_test

# Wrap both splits for LightGBM; column index 1 is declared categorical.
train = lgb.Dataset(train, label=y.astype('int32'), categorical_feature=[1])
train_test = lgb.Dataset(
    train_test, label=y_test.astype('int32'), categorical_feature=[1]
)

# Only the short names go away here: y_df / y_test_df still reference the
# label series, so this does not release their memory.
del y, y_test
gc.collect()

# LightGBM hyper-parameters for the 4-class 'event' classifier.
params = {
        "objective" : "multiclass", 
        "metric" : "multi_error", 
        'num_class':4,
        "num_leaves" : 30, 
        "learning_rate" : 0.01, 
        # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
        # takes effect when bagging_freq is set to a non-zero value; as
        # written no bagging happens. Add "bagging_freq": 1 if row
        # subsampling was intended (this will change the trained model).
        "bagging_fraction" : 0.9,
        "bagging_seed" : 0, 
        "num_threads" : 4,
        "colsample_bytree" : 0.5,
        'min_data_in_leaf':100, 
        'min_split_gain':0.00019
}

# Train for up to 2000 rounds, evaluating on both the training and the
# held-out set. Early stopping (200 rounds without improvement) and the
# every-100-rounds log line are supplied as callbacks: the former
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from lgb.train in LightGBM 4.0.
model = lgb.train(
    params,
    train_set=train,
    num_boost_round=2000,
    valid_sets=[train, train_test],
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=100),
    ],
)


# Load the full test set and keep its real ids aside; 'id' is not a feature.
test = featureModify(False, None)
print("Done test read")
test_ids = test['id'].to_numpy().astype(int)
test = test.drop('id', axis=1)

# One probability per class and row, using the best early-stopped iteration.
y_pred = model.predict(test, num_iteration=model.best_iteration)

# Build the submission from the ids actually read from the file. The previous
# code overwrote them with np.arange(len(test)), which is only correct when
# the file's ids happen to be 0..n-1 in row order.
df_sub = pd.DataFrame(y_pred, columns=['A', 'B', 'C', 'D'])
df_sub.insert(0, 'id', test_ids)

print(df_sub)
df_sub.to_csv("./submission/fatalities_submission3.csv", index=False)



Memory usage of dataframe is 1039.79 MB
Memory usage after optimization is: 241.38 MB
Decreased by 76.8%
(4867421, 27)
Index(['crew', 'experiment', 'time', 'seat', 'eeg_fp1', 'eeg_f7', 'eeg_f8',
       'eeg_t4', 'eeg_t6', 'eeg_t5', 'eeg_t3', 'eeg_fp2', 'eeg_o1', 'eeg_p3',
       'eeg_pz', 'eeg_f3', 'eeg_fz', 'eeg_f4', 'eeg_c4', 'eeg_p4', 'eeg_poz',
       'eeg_c3', 'eeg_cz', 'eeg_o2', 'ecg', 'r', 'gsr'],
      dtype='object')




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6123
[LightGBM] [Info] Number of data points in the train set: 3650565, number of used features: 27




[LightGBM] [Info] Start training from score -0.552255
[LightGBM] [Info] Start training from score -3.766158
[LightGBM] [Info] Start training from score -1.044389
[LightGBM] [Info] Start training from score -3.009791
Training until validation scores don't improve for 200 rounds
[100]	training's multi_error: 0.0572889	valid_1's multi_error: 0.0838522
[200]	training's multi_error: 0.0408605	valid_1's multi_error: 0.0809833
[300]	training's multi_error: 0.0338331	valid_1's multi_error: 0.0813243
Early stopping, best iteration is:
[198]	training's multi_error: 0.0410476	valid_1's multi_error: 0.0809208
Memory usage of dataframe is 3837.77 MB
Memory usage after optimization is: 942.31 MB
Decreased by 75.4%
Done test read
                id         A         B         C         D
0                0  0.951522  0.003137  0.039142  0.006199
1                1  0.948906  0.004076  0.040003  0.007014
2                2  0.952323  0.003158  0.038301  0.006218
3                3  0.948901  0.004082 