In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import os

import random

import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.precision = 15

import gc
import warnings
warnings.filterwarnings("ignore")

from fastai.tabular import * 
from tqdm import tqdm_notebook
from fastai.callbacks import *

In [2]:
%%time
# Read the first 600,000,000 rows of the training signal.  The narrow dtypes
# (int16 / float32) keep the in-memory frame much smaller than the defaults.
train = pd.read_csv(
    '../input/train.csv',
    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32},
    nrows=600_000_000,
)

CPU times: user 2min 1s, sys: 10.1 s, total: 2min 11s
Wall time: 2min 12s


In [3]:
# Clipping bounds and number of bins used by get_counts.
# The bounds used to be called `min` / `max`, which replaced Python's built-in
# functions of the same name for every cell executed afterwards.
min_val = -100
max_val = 100
spread = 110


def get_counts(sequence):
    """Bin a signal by amplitude and return the summed sample value per bin.

    Despite the name, each bin holds the *sum* of the sample values that fall
    into it (value * number of occurrences), not the number of samples.

    Bin layout (``spread`` bins in total):
      * bin 0 collects every value ``<= min_val``;
      * the last bin collects every value ``>= max_val``;
      * a value strictly in between goes to ``int(val / 2) + spread // 2``.
        ``int()`` truncates toward zero, so -1, 0 and 1 share the centre bin.

    Args:
        sequence: array-like of numeric samples (may be empty).

    Returns:
        A list of length ``spread`` with the per-bin sums.

    Raises:
        ValueError: if ``spread`` is too small to hold the in-range bins.
            Without this check a small ``spread`` would silently wrap around
            through negative list indices.
    """
    half = spread // 2
    # Extreme bin indices reachable by values strictly inside (min_val, max_val).
    lowest_bin = int((min_val + 1) / 2) + half
    highest_bin = int((max_val - 1) / 2) + half
    if lowest_bin < 0 or highest_bin >= spread:
        raise ValueError(
            f"spread={spread} is too small for the range ({min_val}, {max_val})"
        )

    counts = [0] * spread
    values, freqs = np.unique(sequence, return_counts=True)
    for val, freq in zip(values, freqs):
        total = freq * val
        if val <= min_val:
            counts[0] += total
        elif val >= max_val:
            counts[-1] += total
        else:
            counts[int(val / 2) + half] += total

    return counts

In [4]:
# Build the training feature table with a sliding window over the signal.
segment_len = 150000  # samples per window
interval = 75000      # stride between window starts (windows overlap by half)

signal = train.acoustic_data.values
targets = train.time_to_failure.values

# Only start windows that are a full `segment_len` long.  Iterating all the way
# to len(train) produced trailing windows with fewer samples, and because the
# features are sums their scale differed from every other row.
starts = range(0, len(train) - segment_len + 1, interval)

counts = [get_counts(signal[s:s + segment_len]) for s in tqdm_notebook(starts)]
# NOTE(review): the target is read at the window's first sample; confirm
# whether it should instead be the value at the window's last sample.
ttfs = [targets[s] for s in starts]
# Drop every reference to the raw data so its memory can be reclaimed.
del train, signal, targets

labels = [f"D{k}" for k in range(len(counts[0]))]

df = pd.DataFrame(counts, columns=labels)
ttf_df = pd.DataFrame(ttfs, columns=["expected"])
df = df.join(ttf_df)

HBox(children=(IntProgress(value=0, max=8000), HTML(value='')))




In [5]:
# Preview the first three rows: the D* feature columns plus the `expected` target.
df.head(3)

Unnamed: 0,D0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,D16,D17,D18,D19,D20,D21,D22,D23,D24,D25,D26,D27,D28,D29,D30,D31,D32,D33,D34,D35,D36,D37,D38,D39,...,D71,D72,D73,D74,D75,D76,D77,D78,D79,D80,D81,D82,D83,D84,D85,D86,D87,D88,D89,D90,D91,D92,D93,D94,D95,D96,D97,D98,D99,D100,D101,D102,D103,D104,D105,D106,D107,D108,D109,expected
0,0,0,0,0,0,0,-98,0,-95,-92,-91,0,0,-85,-82,-161,-78,0,0,-146,0,-138,-67,-194,-63,-60,-233,-170,-271,-527,-100,-242,-373,-400,-213,-443,-271,-511,-553,-618,...,1399,1343,1059,1273,688,594,1021,560,437,455,525,489,396,176,422,252,65,265,137,141,218,75,76,156,80,82,85,344,89,0,93,0,0,98,0,0,0,0,206,1.469099998474121
1,-3169,0,0,0,0,0,-296,-194,-95,-93,-90,0,-348,-85,-411,-240,-158,-230,-223,-653,-425,-273,-201,-64,-250,-180,-235,-282,-326,-578,-759,-96,-93,-445,-550,-812,-536,-584,-725,-717,...,1855,1623,1604,1310,1177,723,666,697,630,407,473,219,621,352,362,375,321,133,274,842,437,672,228,314,481,83,0,87,265,543,371,379,193,98,0,0,0,0,5113,1.449998617172241
2,-3703,0,0,0,0,0,-296,-291,-190,-93,-90,-88,-348,-85,-411,-320,-158,-230,-223,-653,-495,-273,-201,-64,-376,-180,-293,-282,-326,-683,-809,-290,-233,-490,-719,-892,-688,-619,-966,-1303,...,2439,2101,1859,1730,1665,934,1288,1116,1068,356,473,438,792,587,602,375,321,200,479,913,437,672,228,548,481,83,84,87,354,543,371,568,386,196,0,0,0,0,5326,1.430797219276428


In [6]:
# Working directory handed to fastai further down.
path = "../tmp"
# exist_ok=True tolerates an existing directory but still reports real
# failures (e.g. permissions), which the former bare `except: pass` hid.
os.makedirs(path, exist_ok=True)

# Test Data

In [7]:
# Build the test feature table: one row per segment file in `tpath`.
tpath = "../input/test"
files = os.listdir(tpath)

# `test_id[k]` is the segment id of row k in `test_df`; both are filled in the
# same pass so they cannot get out of step.
test_id = []
test_rows = []
for f in tqdm_notebook(files):
    seg = pd.read_csv(os.path.join(tpath, f))
    test_rows.append(get_counts(seg.acoustic_data.values))
    test_id.append(f.replace(".csv", ""))

# Create the frame once from the collected rows; assigning `test_df.loc[i]`
# inside the loop re-allocated the frame on every iteration.
test_df = pd.DataFrame(test_rows, columns=df.columns.values[:-1], dtype=np.float64)

HBox(children=(IntProgress(value=0, max=2624), HTML(value='')))




In [8]:
# Validation split: starting every `interval` rows, take a short run of
# consecutive rows (`values - 1` of them) as validation samples.
num = len(df)
interval = num // 100
values = num // (5 * 100)
valid_idx = [
    block_start + offset
    for block_start in range(0, len(df) - values, interval)
    for offset in range(values - 1)
]

In [9]:
# Targets of the validation rows, selected by position.  The float64 cast
# matches what the row-by-row `df.iloc[i].expected` lookup produced.
valid_ttfs = df["expected"].values[valid_idx].astype(np.float64)

In [10]:
# Bundle train / validation / test frames for fastai; `expected` is the target
# and `Normalize` is the only preprocessing step requested.
data = TabularDataBunch.from_df(
    path,
    df,
    "expected",
    valid_idx=valid_idx,
    test_df=test_df,
    procs=[Normalize],
)
# Variant without a test set:
# data = TabularDataBunch.from_df(path, df, "expected", valid_idx=valid_idx, procs=[Normalize])

Results by `spread` setting (presumably best validation MAE; confirm):

* spread 200 - 2.02
* spread 300 - (not recorded)

In [11]:
%%time

# Random-restart training: fit the same architecture repeatedly and keep the
# learner with the lowest mean absolute error on the validation rows.
best_learn = None
best_mae = 9999

for i in range(99):
    learn = tabular_learner(
        data=data,
        layers=[200, 100],
        metrics=mae,
        ps=0.5,
        y_range=(-1, 15),
    )
    # Callback configured to save on improvement (mode='min') under the name 'best'.
    checkpoint = SaveModelCallback(learn, every='improvement', mode='min', name='best')
    learn.callbacks = [checkpoint]
    learn.fit_one_cycle(20, 1e-2)
    gc.collect()

    # Score this restart by hand against the stored validation targets.
    valid_out = learn.get_preds(DatasetType.Valid)
    preds = valid_out[0].numpy().flatten()
    new_mae = np.mean(np.abs(valid_ttfs - preds))

    # The first restart is always kept; later ones only when they score lower.
    if not best_learn or new_mae < best_mae:
        best_learn = learn
        best_mae = new_mae
    print(f'Run {i} - Best MAE: {best_mae}')

Run 19 - Best MAE: 2.0077382553915184
CPU times: user 5min 6s, sys: 2min 33s, total: 7min 39s
Wall time: 8min 30s


# Submission

In [12]:
# Run the selected learner on the test set; the first element of the result is
# used as the prediction tensor and flattened into a 1-D array.
test_out = best_learn.get_preds(DatasetType.Test)
preds = test_out[0].numpy().flatten()

In [13]:
tpath = "../input/test"
# Reuse the ids recorded while the test features were built.  os.listdir()
# returns entries in arbitrary order, so listing the directory a second time
# is not guaranteed to match the row order of `test_df` (and hence of `preds`).
files = list(test_id)
files[:3]

['seg_0b082e', 'seg_9e7dff', 'seg_b6c10d']

In [14]:
# Pair every segment id with its predicted time to failure.
submission_columns = {"seg_id": files, "time_to_failure": preds}
results = pd.DataFrame(submission_columns)
results.head()

Unnamed: 0,seg_id,time_to_failure
0,seg_0b082e,9.671259880065918
1,seg_9e7dff,3.582855701446533
2,seg_b6c10d,7.680974960327148
3,seg_4435bd,4.013928413391113
4,seg_c09a41,3.563020706176758


In [15]:
# Write the predictions to submission.csv, leaving out the DataFrame index.
results.to_csv('submission.csv',index=False)