In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

import random
random.seed(42) # The answer
 
import os
import sys
import gc
import re

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import lightgbm as lgb

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedKFold
from sklearn import metrics

In [2]:
meta_train = pd.read_csv('../input/metadata_train.csv')
len(meta_train)

8712

In [3]:
# meta_train["target"].value_counts()

In [4]:
meta_train.tail()
# np.range(8712,20336)

Unnamed: 0,signal_id,id_measurement,phase,target
8707,8707,2902,1,0
8708,8708,2902,2,0
8709,8709,2903,0,0
8710,8710,2903,1,0
8711,8711,2903,2,0


In [5]:
len(meta_train)

8712

In [6]:
%%time
subset_train = pq.read_pandas('../input/train.parquet', columns=[str(i) for i in range(len(meta_train))]).to_pandas()

CPU times: user 40.6 s, sys: 16 s, total: 56.6 s
Wall time: 57.2 s


In [7]:
subset_train.shape

(800000, 8712)

In [8]:
%%time
train_length = 8712 
positive_length = len(meta_train[meta_train['target']==1])
train_df = pd.DataFrame()
row_index = 0

for i in range(train_length):
    # downsampling
    if meta_train.loc[i,'target'] == 1 or random.random() < positive_length / train_length:
        subset_train_row = subset_train[str(i)]
        train_df.loc[row_index, 'signal_min'] = np.min(subset_train_row)
        train_df.loc[row_index, 'signal_max'] = np.max(subset_train_row)
        train_df.loc[row_index, 'signal_mean'] = np.mean(subset_train_row)
        train_df.loc[row_index, 'signal_mean_sq'] = np.mean(subset_train_row)**2
        train_df.loc[row_index, 'signal_max_min_diff'] = np.subtract(np.max(subset_train_row),np.min(subset_train_row))
#         train_df.loc[row_index, 'signal_median'] = np.median(subset_train_row)
#         train_df.loc[row_index, 'signal_ptp'] = np.ptp(subset_train_row)
        
        train_df.loc[row_index, 'signal_id'] = i
        row_index += 1
        
print("positive length: " + str(positive_length))

print("train length: " + str(len(train_df)))

positive length: 525
train length: 976
CPU times: user 36.6 s, sys: 4 ms, total: 36.6 s
Wall time: 36.6 s


In [9]:
train_df = pd.merge(train_df, meta_train, on='signal_id')
train_df.to_csv("train.csv", index=False)
train_df.head()

Unnamed: 0,signal_min,signal_max,signal_mean,signal_mean_sq,signal_max_min_diff,signal_id,id_measurement,phase,target
0,-30.0,26.0,-0.194125,0.037685,56.0,1.0,0,1,0
1,-77.0,52.0,-0.997401,0.994809,-127.0,3.0,1,0,1
2,-40.0,33.0,-0.175586,0.030831,73.0,4.0,1,1,1
3,-43.0,58.0,-0.036004,0.001296,101.0,5.0,1,2,1
4,-98.0,127.0,-1.011068,1.022257,-31.0,12.0,4,0,0


In [10]:
%%time
meta_test = pd.read_csv('../input/metadata_test.csv')

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 11.7 ms


In [11]:
%%time
test_df = pd.DataFrame()
row_index = 0
for i in range(10):
    subset_test = pq.read_pandas('../input/test.parquet', columns=[str(i*2000 + j + 8712) for j in range(2000)]).to_pandas()
    for j in range(2000):
        subset_test_row = subset_test[str(i*2000 + j + 8712)]
        test_df.loc[row_index, 'signal_min'] = np.mean(subset_test_row)
        test_df.loc[row_index, 'signal_max'] = np.max(subset_test_row)
        test_df.loc[row_index, 'signal_mean'] = np.mean(subset_test_row)
        test_df.loc[row_index, 'signal_mean_sq'] = np.mean(subset_test_row)**2
        test_df.loc[row_index, 'signal_max_min_diff'] = np.subtract(np.max(subset_test_row),np.min(subset_test_row))
#         test_df.loc[row_index, 'signal_median'] = np.median(subset_test_row)
#         test_df.loc[row_index, 'signal_ptp'] = np.ptp(subset_test_row)
        test_df.loc[row_index, 'signal_id'] = i*2000 + j + 8712
        row_index += 1
subset_test = pq.read_pandas('../input/test.parquet', columns=[str(i + 28712) for i in range(337)]).to_pandas()
for i in tqdm(range(337)):
    subset_test_row = subset_test[str(i + 28712)]
    test_df.loc[row_index, 'signal_min'] = np.min(subset_test_row)
    test_df.loc[row_index, 'signal_max'] = np.max(subset_test_row)
    test_df.loc[row_index, 'signal_mean'] = np.mean(subset_test_row)
    test_df.loc[row_index, 'signal_mean_sq'] = np.mean(subset_test_row)**2
    test_df.loc[row_index, 'signal_max_min_diff'] = np.subtract(np.max(subset_test_row),np.min(subset_test_row))
#     test_df.loc[row_index, 'signal_median'] = np.median(subset_test_row)
#     test_df.loc[row_index, 'signal_ptp'] = np.ptp(subset_test_row)
    test_df.loc[row_index, 'signal_id'] = i + 28712
    row_index += 1
test_df = pd.merge(test_df, meta_test, on='signal_id')
test_df.to_csv("test.csv", index=False)
test_df.head()

100%|██████████| 337/337 [00:12<00:00, 26.09it/s]


CPU times: user 12min 2s, sys: 22.8 s, total: 12min 25s
Wall time: 12min 26s


In [12]:
test_df.head()

Unnamed: 0,signal_min,signal_max,signal_mean,signal_mean_sq,signal_max_min_diff,signal_id,id_measurement,phase
0,-1.007639,28.0,-1.007639,1.015336,74.0,8712.0,2904,0
1,-0.178762,30.0,-0.178762,0.031956,55.0,8713.0,2904,1
2,-0.031567,31.0,-0.031567,0.000997,58.0,8714.0,2904,2
3,-0.021024,28.0,-0.021024,0.000442,57.0,8715.0,2905,0
4,-1.479971,29.0,-1.479971,2.190315,64.0,8716.0,2905,1
