# Preprocess/CleanUp

In [1]:
#imported python packages
import os
import math
import numpy as np 
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
import glob
import functools
import datetime

In [2]:
# import dataset about train dataset and sample_submission
train = pd.read_csv("train.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [3]:
train

Unnamed: 0,segment_id,time_to_eruption
0,1136037770,12262005
1,1969647810,32739612
2,1895879680,14965999
3,2068207140,26469720
4,192955606,31072429
...,...,...
4426,873340274,15695097
4427,1297437712,35659379
4428,694853998,31206935
4429,1886987043,9598270


In [4]:
sample_submission

Unnamed: 0,segment_id,time_to_eruption
0,1000213997,0
1,100023368,0
2,1000488999,0
3,1001028887,0
4,1001857862,0
...,...,...
4515,996704281,0
4516,997630809,0
4517,998072137,0
4518,998136924,0


In [5]:
# created dataframe about the testing volcanos id
test_data = pd.DataFrame([os.path.basename(f)[:-4] for f in glob.glob('test/*')], columns=["segment_id"])
test_data

Unnamed: 0,segment_id
0,19158067
1,146639315
2,53465103
3,638074752
4,106558646
...,...
725,1265147
726,115078097
727,577108378
728,108980580


In [6]:
# created dataframe about the training volcanos id
train_data = pd.DataFrame([os.path.basename(f)[:-4] for f in glob.glob('train/*')], columns=["segment_id"])
train_data

Unnamed: 0,segment_id
0,117864846
1,198187683
2,19334151
3,164402661
4,224267098
...,...
754,1912467905
755,140348256
756,442836719
757,88594602


In [7]:
# sorted values depending on the time_to_eruption
train = train.sort_values("time_to_eruption",ascending=False)
train

Unnamed: 0,segment_id,time_to_eruption
1139,1923243961,49046087
3509,1552761888,48814294
3589,356854390,48727004
3739,1131527270,48522449
4132,1162128945,48387290
...,...,...
1724,1626437563,40492
413,442994108,28696
1145,1957235969,26929
2709,1658693785,25730


In [8]:
# imported one of the training volcano dataset
df = pd.read_csv(f'train/{19334151}.csv')

In [9]:
df

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10
0,-110.0,,-41.0,13.0,149.0,392.0,-34.0,,95.0,264.0
1,-52.0,,-59.0,-17.0,79.0,972.0,-51.0,,59.0,280.0
2,25.0,,-89.0,-69.0,-2.0,912.0,-75.0,,14.0,362.0
3,81.0,,-103.0,-86.0,-47.0,328.0,-85.0,,-14.0,428.0
4,113.0,,-106.0,-86.0,-21.0,-91.0,-70.0,,-5.0,386.0
...,...,...,...,...,...,...,...,...,...,...
59996,163.0,,-134.0,185.0,80.0,-75.0,36.0,,574.0,-845.0
59997,102.0,,-98.0,162.0,56.0,-68.0,22.0,,559.0,-1022.0
59998,38.0,,-48.0,164.0,60.0,-62.0,-15.0,,535.0,-1228.0
59999,0.0,,-13.0,148.0,82.0,-56.0,-52.0,,509.0,-1324.0


### There are some NAN values for sensors 2 and 8. This might be true for some other volcano ids too.

In [10]:
# Converted time_to_eruption to comprehensible time
train['h:m:s'] = (train['time_to_eruption']
                  .apply(lambda x:datetime.timedelta(seconds = x/100)))
train

Unnamed: 0,segment_id,time_to_eruption,h:m:s
1139,1923243961,49046087,5 days 16:14:20.870000
3509,1552761888,48814294,5 days 15:35:42.940000
3589,356854390,48727004,5 days 15:21:10.040000
3739,1131527270,48522449,5 days 14:47:04.490000
4132,1162128945,48387290,5 days 14:24:32.900000
...,...,...,...
1724,1626437563,40492,0 days 00:06:44.920000
413,442994108,28696,0 days 00:04:46.960000
1145,1957235969,26929,0 days 00:04:29.290000
2709,1658693785,25730,0 days 00:04:17.300000


## Filling the NAN values and extracting features from senosor data.

In [11]:
fs = ["_mad","_skew","_kurt","_nunique",
      "_quantile_05","_quantile_10","_quantile_30","_quantile_70","_quantile_90","_quantile_95",
      "_fft_power_mean","_fft_power_std","_fft_power_min","_fft_power_max",
      "_fft_power_sum_low","_fft_power_sum_middle","_fft_power_sum_high",
      "_fft_power_mad","_fft_power_skew","_fft_power_kurt","_fft_power_nunique",
      "_fft_power_quantile_05","_fft_power_quantile_10","_fft_power_quantile_30","_fft_power_quantile_70",
      "_fft_power_quantile_90","_fft_power_quantile_95",
      "_roll_mean_min","_roll_mean_max","_roll_dist_min","_roll_dist_max","_roll_dist_diff_min","_roll_dist_diff_max"
     ]

def extract(segment_id):
    segment_id = str(segment_id)

    try:
        f = pd.read_csv(f'train/{segment_id}.csv')
    except FileNotFoundError:
        f = pd.read_csv(f'test/{segment_id}.csv')
    
    # Fill NaN
    f.interpolate(axis=0,inplace=True)
    
    # Quantile
    q005 = f.quantile(0.05,axis=0)
    q010 = f.quantile(0.1 ,axis=0)
    q030 = f.quantile(0.3 ,axis=0)
    q070 = f.quantile(0.7 ,axis=0)
    q090 = f.quantile(0.9 ,axis=0)
    q095 = f.quantile(0.95,axis=0)
    
    # Rolling
    roll = f.rolling(500)
    roll_mean = roll.mean()
    roll_max = roll.max()
    roll_min = roll.min()
    roll_dist = roll_max - roll_min
    roll_dist_diff = roll_dist.diff()

    # FFT 
    # Remove under flowed 0 frequency and mirrored higher half.
    fft = pd.DataFrame(np.fft.fft(f.fillna(0)),columns=f.columns).abs().iloc[1:30001,:]
    fft[f.iloc[1:30001,:].isnull()] = np.nan
        
    # Timing information inside 10 minute.
    f005 = f.where(f < q005)
    f010 = f.where(f < q010)
    f030 = f.where(f < q030)

    f070 = f.where(f > q070)
    f090 = f.where(f > q090)
    f095 = f.where(f > q095)
    
    f_abs = f.abs()
    f_abs_0250 = f_abs.where(f_abs >  250)
    f_abs_0500 = f_abs.where(f_abs >  500)
    f_abs_0750 = f_abs.where(f_abs >  750)
    f_abs_1000 = f_abs.where(f_abs > 1000)
    f_abs_1250 = f_abs.where(f_abs > 1250)
    f_abs_1500 = f_abs.where(f_abs > 1500)


    return pd.concat((f.mad(axis=0).add_suffix("_mad"),
                      f.skew(axis=0).add_suffix("_skew"),
                      f.kurt(axis=0).add_suffix("_kurt"),
                      f.nunique(axis=0).add_suffix("_nunique"),
                      q005.add_suffix("_quantile_05"),
                      q010.add_suffix("_quantile_10"),
                      q030.add_suffix("_quantile_30"),
                      q070.add_suffix("_quantile_70"),
                      q090.add_suffix("_quantile_90"),
                      q095.add_suffix("_quantile_95"),
                      fft.mean(axis=0).add_suffix("_fft_power_mean"),
                      fft.std(axis=0).add_suffix("_fft_power_std"),
                      fft.min(axis=0).add_suffix("_fft_power_min"),
                      fft.max(axis=0).add_suffix("_fft_power_max"),
                      fft.iloc[:10000,:].sum(axis=0).add_suffix("_fft_power_sum_low"),
                      fft.iloc[10000:20000,:].sum(axis=0).add_suffix("_fft_power_sum_middle"),
                      fft.iloc[20000:,:].sum(axis=0).add_suffix("_fft_power_sum_high"),
                      fft.mad(axis=0).add_suffix("_fft_power_mad"),
                      fft.skew(axis=0).add_suffix("_fft_power_skew"),
                      fft.kurt(axis=0).add_suffix("_fft_power_kurt"),
                      fft.nunique(axis=0).add_suffix("_fft_power_nunique"),
                      fft.quantile(0.05,axis=0).add_suffix("_fft_power_quantile_05"),
                      fft.quantile(0.1,axis=0).add_suffix("_fft_power_quantile_10"),
                      fft.quantile(0.3,axis=0).add_suffix("_fft_power_quantile_30"),
                      fft.quantile(0.7,axis=0).add_suffix("_fft_power_quantile_70"),
                      fft.quantile(0.9,axis=0).add_suffix("_fft_power_quantile_90"),
                      fft.quantile(0.95,axis=0).add_suffix("_fft_power_quantile_95"),
                      roll_mean.min(axis=0).add_suffix("_roll_mean_min"),
                      roll_mean.max(axis=0).add_suffix("_roll_mean_max"),
                      roll_dist.min(axis=0).add_suffix("_roll_dist_min"),
                      roll_dist.max(axis=0).add_suffix("_roll_dist_max"),
                      roll_dist_diff.min(axis=0).add_suffix("_roll_dist_diff_min"),
                      roll_dist_diff.max(axis=0).add_suffix("_roll_dist_diff_max"),
                      f005.idxmin().add_suffix("_first_005"),
                      f005.idxmax().add_suffix("_last_005"),
                      f010.idxmin().add_suffix("_first_010"),
                      f010.idxmax().add_suffix("_last_010"),
                      f030.idxmin().add_suffix("_first_030"),
                      f030.idxmax().add_suffix("_last_030"),
                      f070.idxmin().add_suffix("_first_070"),
                      f070.idxmax().add_suffix("_last_070"),
                      f090.idxmin().add_suffix("_first_090"),
                      f090.idxmax().add_suffix("_last_090"),
                      f095.idxmin().add_suffix("_first_095"),
                      f095.idxmax().add_suffix("_last_095"),
                      f_abs_0250.idxmin().add_suffix("_abs_0250_min"),
                      f_abs_0250.idxmax().add_suffix("_abs_0250_max"),
                      f_abs_0500.idxmin().add_suffix("_abs_0500_min"),
                      f_abs_0500.idxmax().add_suffix("_abs_0500_max"),
                      f_abs_0750.idxmin().add_suffix("_abs_0750_min"),
                      f_abs_0750.idxmax().add_suffix("_abs_0750_max"),
                      f_abs_1000.idxmin().add_suffix("_abs_1000_min"),
                      f_abs_1000.idxmax().add_suffix("_abs_1000_max"),
                      f_abs_1250.idxmin().add_suffix("_abs_1250_min"),
                      f_abs_1250.idxmax().add_suffix("_abs_1250_max"),
                      f_abs_1500.idxmin().add_suffix("_abs_1500_min"),
                      f_abs_1500.idxmax().add_suffix("_abs_1500_max"),
                     ),
                     axis=0)

In [12]:
# created a dataframe with the features of each volcano's sensor from training dataset using extract()
train_df = train_data['segment_id'].apply(extract)

In [13]:
train_df

Unnamed: 0,sensor_1_mad,sensor_2_mad,sensor_3_mad,sensor_4_mad,sensor_5_mad,sensor_6_mad,sensor_7_mad,sensor_8_mad,sensor_9_mad,sensor_10_mad,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
0,275.540219,281.290265,221.498216,216.719092,122.339457,469.572370,308.980466,431.047467,197.874719,486.458430,...,44085.0,32657.0,,33247.0,,45873.0,34962.0,36066.0,34085.0,44047.0
1,87.948116,310.816312,94.864267,171.857227,72.131218,409.403108,364.553265,221.288958,172.511835,184.137370,...,,39980.0,,,40086.0,44112.0,15332.0,,32422.0,40553.0
2,187.959902,,165.611696,212.479589,133.480412,227.752056,167.136098,,208.342955,415.565470,...,40764.0,,40699.0,40593.0,39862.0,558.0,44166.0,,40844.0,40929.0
3,264.168968,538.217302,199.940517,215.423867,175.932721,176.858434,192.017868,187.606417,246.006008,653.111162,...,8876.0,7956.0,,8590.0,8342.0,10997.0,,23810.0,8976.0,34230.0
4,197.459941,285.487212,205.923826,217.431964,114.416705,294.826514,355.885244,407.028378,236.599986,388.154537,...,59133.0,57619.0,58006.0,51749.0,57814.0,18321.0,13542.0,48850.0,58723.0,58045.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,298.485925,331.871027,,254.193576,131.244670,1216.254666,422.137610,589.684275,,520.040115,...,27874.0,12878.0,,13927.0,,14894.0,14807.0,31126.0,,2358.0
755,222.753393,434.957884,183.545977,207.295698,194.108824,175.420124,205.749167,187.805127,244.169657,575.004226,...,,49145.0,49809.0,49758.0,49134.0,,,,50008.0,50293.0
756,288.273109,203.501855,587.952974,197.239601,60.200193,490.085364,412.458957,492.627907,176.865123,274.909356,...,56713.0,34753.0,29072.0,,,35531.0,5399.0,28622.0,,30165.0
757,214.769684,313.800384,,170.838270,113.415398,369.350318,221.382195,233.942957,195.371928,388.218695,...,17173.0,59997.0,,17690.0,56646.0,58479.0,44385.0,,17117.0,16997.0


In [14]:
train_df.interpolate(axis=0,inplace=True)

In [15]:
train_df

Unnamed: 0,sensor_1_mad,sensor_2_mad,sensor_3_mad,sensor_4_mad,sensor_5_mad,sensor_6_mad,sensor_7_mad,sensor_8_mad,sensor_9_mad,sensor_10_mad,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
0,275.540219,281.290265,221.498216,216.719092,122.339457,469.572370,308.980466,431.047467,197.874719,486.458430,...,44085.0,32657.0,,33247.0,,45873.0,34962.0,36066.000000,34085.0,44047.0
1,87.948116,310.816312,94.864267,171.857227,72.131218,409.403108,364.553265,221.288958,172.511835,184.137370,...,42424.5,39980.0,,36920.0,40086.0,44112.0,15332.0,31980.666667,32422.0,40553.0
2,187.959902,424.516807,165.611696,212.479589,133.480412,227.752056,167.136098,204.447688,208.342955,415.565470,...,40764.0,23968.0,40699.0,40593.0,39862.0,558.0,44166.0,27895.333333,40844.0,40929.0
3,264.168968,538.217302,199.940517,215.423867,175.932721,176.858434,192.017868,187.606417,246.006008,653.111162,...,8876.0,7956.0,49352.5,8590.0,8342.0,10997.0,28854.0,23810.000000,8976.0,34230.0
4,197.459941,285.487212,205.923826,217.431964,114.416705,294.826514,355.885244,407.028378,236.599986,388.154537,...,59133.0,57619.0,58006.0,51749.0,57814.0,18321.0,13542.0,48850.000000,58723.0,58045.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,298.485925,331.871027,198.614252,254.193576,131.244670,1216.254666,422.137610,589.684275,227.382512,520.040115,...,27874.0,12878.0,46336.2,13927.0,45682.2,14894.0,14807.0,31126.000000,36254.5,2358.0
755,222.753393,434.957884,183.545977,207.295698,194.108824,175.420124,205.749167,187.805127,244.169657,575.004226,...,42293.5,49145.0,49809.0,49758.0,49134.0,25212.5,10103.0,29874.000000,50008.0,50293.0
756,288.273109,203.501855,587.952974,197.239601,60.200193,490.085364,412.458957,492.627907,176.865123,274.909356,...,56713.0,34753.0,29072.0,33724.0,52890.0,35531.0,5399.0,28622.000000,33562.5,30165.0
757,214.769684,313.800384,428.644739,170.838270,113.415398,369.350318,221.382195,233.942957,195.371928,388.218695,...,17173.0,59997.0,30625.5,17690.0,56646.0,58479.0,44385.0,19542.000000,17117.0,16997.0


In [16]:
# joined train_df with the train dataset
train_df = train.join(train_df)

In [17]:
train_df

Unnamed: 0,segment_id,time_to_eruption,h:m:s,sensor_1_mad,sensor_2_mad,sensor_3_mad,sensor_4_mad,sensor_5_mad,sensor_6_mad,sensor_7_mad,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
1139,1923243961,49046087,5 days 16:14:20.870000,,,,,,,,...,,,,,,,,,,
3509,1552761888,48814294,5 days 15:35:42.940000,,,,,,,,...,,,,,,,,,,
3589,356854390,48727004,5 days 15:21:10.040000,,,,,,,,...,,,,,,,,,,
3739,1131527270,48522449,5 days 14:47:04.490000,,,,,,,,...,,,,,,,,,,
4132,1162128945,48387290,5 days 14:24:32.900000,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,1626437563,40492,0 days 00:06:44.920000,,,,,,,,...,,,,,,,,,,
413,442994108,28696,0 days 00:04:46.960000,297.945463,366.545544,273.04305,267.871366,82.775953,377.133225,497.667999,...,28322.5,12372.0,18943.666667,6870.5,12725.0,57079.0,49155.0,36264.0,13074.0,13146.0
1145,1957235969,26929,0 days 00:04:29.290000,,,,,,,,...,,,,,,,,,,
2709,1658693785,25730,0 days 00:04:17.300000,,,,,,,,...,,,,,,,,,,


In [18]:
# drop NAN because of limited data available due to storage issues. These NAN values belong to the dataset not downloaded from Kaggle
train_df = train_df.dropna()

In [19]:
train_df.reset_index(drop=True,inplace=True)
train_df

Unnamed: 0,segment_id,time_to_eruption,h:m:s,sensor_1_mad,sensor_2_mad,sensor_3_mad,sensor_4_mad,sensor_5_mad,sensor_6_mad,sensor_7_mad,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
0,1064935255,47950543,5 days 13:11:45.430000,427.776378,623.697210,341.073573,433.849701,289.211797,764.413905,44.183687,...,22922.0,21691.0,10264.000000,9915.0,46270.0,23693.0,47421.0,1260.0,9606.0,22251.0
1,1323724621,47861879,5 days 12:56:58.790000,257.130834,471.174757,215.762815,219.545632,141.919199,640.298186,261.960275,...,27299.0,25259.0,26185.000000,25994.0,25354.0,50804.0,19320.0,55973.0,25736.0,25739.0
2,1581535482,47810569,5 days 12:48:25.690000,493.354956,207.232992,171.991627,284.738940,79.524836,1770.305228,467.666062,...,8187.0,56897.0,35346.400000,8595.0,12648.0,57834.0,8640.0,9252.0,8909.0,38587.0
3,594908465,47775708,5 days 12:42:37.080000,518.765458,705.686148,432.852295,452.531715,179.587543,1270.945471,44.843274,...,42362.0,40367.0,41805.000000,2081.0,50579.0,11868.0,38611.0,49984.0,41854.0,51499.0
4,995669347,47715187,5 days 12:32:31.870000,177.028725,224.951322,282.077704,211.051018,79.563309,623.316076,388.258728,...,45312.0,57382.0,22462.000000,16902.0,17995.6,29837.0,39494.0,36672.0,22371.5,53085.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,82371649,273119,0 days 00:45:31.190000,632.813205,874.526694,675.669753,646.072554,237.954403,580.591656,553.386559,...,24752.0,46245.0,49642.000000,49571.0,18261.0,31259.0,50917.0,36587.0,52567.0,18859.0
753,1301215104,223909,0 days 00:37:19.090000,431.525120,703.002077,466.305727,330.091944,275.156617,283.346580,506.428935,...,39574.0,59996.0,31919.000000,13284.0,18709.0,1800.0,58188.0,9677.0,39845.0,38291.0
754,372844561,179818,0 days 00:29:58.180000,579.081600,842.940727,555.487845,549.747417,177.285175,1649.068075,799.161800,...,53022.0,54602.0,10317.000000,56921.0,22012.2,46310.0,57185.0,32599.0,55057.0,10479.0
755,442994108,28696,0 days 00:04:46.960000,297.945463,366.545544,273.043050,267.871366,82.775953,377.133225,497.667999,...,28322.5,12372.0,18943.666667,6870.5,12725.0,57079.0,49155.0,36264.0,13074.0,13146.0


In [20]:
# dropped the h:m:s column for better model
train_df = train_df.drop(columns=["h:m:s"])

In [21]:
train_df

Unnamed: 0,segment_id,time_to_eruption,sensor_1_mad,sensor_2_mad,sensor_3_mad,sensor_4_mad,sensor_5_mad,sensor_6_mad,sensor_7_mad,sensor_8_mad,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
0,1064935255,47950543,427.776378,623.697210,341.073573,433.849701,289.211797,764.413905,44.183687,68.358305,...,22922.0,21691.0,10264.000000,9915.0,46270.0,23693.0,47421.0,1260.0,9606.0,22251.0
1,1323724621,47861879,257.130834,471.174757,215.762815,219.545632,141.919199,640.298186,261.960275,482.226406,...,27299.0,25259.0,26185.000000,25994.0,25354.0,50804.0,19320.0,55973.0,25736.0,25739.0
2,1581535482,47810569,493.354956,207.232992,171.991627,284.738940,79.524836,1770.305228,467.666062,688.694534,...,8187.0,56897.0,35346.400000,8595.0,12648.0,57834.0,8640.0,9252.0,8909.0,38587.0
3,594908465,47775708,518.765458,705.686148,432.852295,452.531715,179.587543,1270.945471,44.843274,66.518934,...,42362.0,40367.0,41805.000000,2081.0,50579.0,11868.0,38611.0,49984.0,41854.0,51499.0
4,995669347,47715187,177.028725,224.951322,282.077704,211.051018,79.563309,623.316076,388.258728,484.273435,...,45312.0,57382.0,22462.000000,16902.0,17995.6,29837.0,39494.0,36672.0,22371.5,53085.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,82371649,273119,632.813205,874.526694,675.669753,646.072554,237.954403,580.591656,553.386559,555.799454,...,24752.0,46245.0,49642.000000,49571.0,18261.0,31259.0,50917.0,36587.0,52567.0,18859.0
753,1301215104,223909,431.525120,703.002077,466.305727,330.091944,275.156617,283.346580,506.428935,316.107505,...,39574.0,59996.0,31919.000000,13284.0,18709.0,1800.0,58188.0,9677.0,39845.0,38291.0
754,372844561,179818,579.081600,842.940727,555.487845,549.747417,177.285175,1649.068075,799.161800,1121.988671,...,53022.0,54602.0,10317.000000,56921.0,22012.2,46310.0,57185.0,32599.0,55057.0,10479.0
755,442994108,28696,297.945463,366.545544,273.043050,267.871366,82.775953,377.133225,497.667999,607.939568,...,28322.5,12372.0,18943.666667,6870.5,12725.0,57079.0,49155.0,36264.0,13074.0,13146.0


In [22]:
# created a dataframe with the features of each volcano's sensor from test dataset using extract()
test_df = test_data['segment_id'].apply(extract)

In [23]:
test_df

Unnamed: 0,sensor_1_mad,sensor_2_mad,sensor_3_mad,sensor_4_mad,sensor_5_mad,sensor_6_mad,sensor_7_mad,sensor_8_mad,sensor_9_mad,sensor_10_mad,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
0,423.425102,1323.638698,386.641295,498.451395,,530.542895,331.684697,288.366121,483.131258,818.184432,...,18268.0,44667.0,45455.0,4432.0,,24308.0,15208.0,4573.0,45085.0,44999.0
1,185.211646,,153.577141,190.283720,,367.303473,261.497773,306.381147,179.474451,0.000000,...,1239.0,,,,,48908.0,,,,
2,268.929478,845.652027,249.931899,307.519773,208.797820,412.618668,295.700384,445.645529,329.119838,562.626086,...,14641.0,13749.0,12852.0,12974.0,14099.0,2082.0,2377.0,58944.0,14338.0,14424.0
3,,470.681350,2796.724210,753.785456,,1859.525892,954.842023,1269.317627,,,...,,7457.0,59386.0,24476.0,,4031.0,36807.0,39762.0,,
4,5271.735466,7949.787591,5199.150380,5336.574388,1950.105024,2370.015884,2569.982972,2335.088533,6442.976936,6974.975428,...,44547.0,265.0,6206.0,38681.0,32700.0,161.0,33822.0,44454.0,41125.0,4620.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,,382.821071,1316.548666,284.283004,,373.101000,255.202518,273.068444,,,...,,40177.0,11203.0,21320.0,,41686.0,45020.0,41715.0,,
726,476.088899,575.937473,390.164140,408.668268,138.693181,1323.645391,42.084348,51.654795,407.284503,660.883861,...,53321.0,51729.0,53249.0,53046.0,,12000.0,49137.0,49137.0,53258.0,52472.0
727,,355.749551,1079.560395,330.765043,,427.875003,238.444549,282.552092,,,...,,44448.0,33790.0,37123.0,,532.0,,,,
728,190.056032,,164.081338,222.079556,155.374275,204.583304,169.741851,,224.059314,436.934665,...,6574.0,,6325.0,6402.0,5803.0,,7510.0,,6158.0,5885.0


In [24]:
# joined test_df with the test_data dataset
test_df = test_data.join(test_df)

In [25]:
test_df.interpolate(axis=0,inplace=True)

In [26]:
test_df.dropna(inplace=True)

In [27]:
test_df

Unnamed: 0,segment_id,sensor_1_mad,sensor_2_mad,sensor_3_mad,sensor_4_mad,sensor_5_mad,sensor_6_mad,sensor_7_mad,sensor_8_mad,sensor_9_mad,...,sensor_1_abs_1500_max,sensor_2_abs_1500_max,sensor_3_abs_1500_max,sensor_4_abs_1500_max,sensor_5_abs_1500_max,sensor_6_abs_1500_max,sensor_7_abs_1500_max,sensor_8_abs_1500_max,sensor_9_abs_1500_max,sensor_10_abs_1500_max
2,53465103,268.929478,845.652027,249.931899,307.519773,208.797820,412.618668,295.700384,445.645529,329.119838,...,14641.0,13749.0,12852.0,12974.0,14099.000000,2082.0,2377.0,58944.000000,14338.0,14424.0
3,638074752,2770.332472,470.681350,2796.724210,753.785456,1079.451422,1859.525892,954.842023,1269.317627,3386.048387,...,29594.0,7457.0,59386.0,24476.0,23399.500000,4031.0,36807.0,39762.000000,27731.5,9522.0
4,106558646,5271.735466,7949.787591,5199.150380,5336.574388,1950.105024,2370.015884,2569.982972,2335.088533,6442.976936,...,44547.0,265.0,6206.0,38681.0,32700.000000,161.0,33822.0,44454.000000,41125.0,4620.0
5,12479934,367.796127,301.395992,2732.669633,277.582783,112.783708,1469.755578,396.360013,646.696491,289.958741,...,26626.0,38826.0,19095.0,13702.0,21122.000000,38912.0,30490.0,21142.000000,59707.0,26512.0
6,580521026,274.803808,349.466766,266.188887,257.445832,138.111164,564.599431,380.530281,619.309145,268.149731,...,9967.0,9451.0,31984.0,10376.0,9544.000000,12314.0,33097.0,33731.000000,10111.0,31950.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,1265147,316.318664,382.821071,1316.548666,284.283004,146.391247,373.101000,255.202518,273.068444,345.852474,...,40059.5,40177.0,11203.0,21320.0,3346.500000,41686.0,45020.0,41715.000000,40238.5,36948.0
726,115078097,476.088899,575.937473,390.164140,408.668268,138.693181,1323.645391,42.084348,51.654795,407.284503,...,53321.0,51729.0,53249.0,53046.0,4165.333333,12000.0,49137.0,49137.000000,53258.0,52472.0
727,577108378,333.072465,355.749551,1079.560395,330.765043,147.033728,427.875003,238.444549,282.552092,315.671909,...,29947.5,44448.0,33790.0,37123.0,4984.166667,532.0,28323.5,37714.333333,29708.0,29178.5
728,108980580,190.056032,355.749551,164.081338,222.079556,155.374275,204.583304,169.741851,325.657581,224.059314,...,6574.0,44448.0,6325.0,6402.0,5803.000000,6441.5,7510.0,26291.666667,6158.0,5885.0


In [28]:
# Saved the dataframe as a csv file
train_df.to_csv('train_df.csv',index=False)

In [29]:
# Saved the dataframe as a csv file
test_df.to_csv('test_df.csv',index=False)