In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Synthetic dataset: 10,000 rows x 6 columns ('a'..'f') of random integers in [0, 100)
# (randint's upper bound is exclusive, so the largest possible value is 99), plus a
# seventh column 'g' holding a random integer that is either 0 or 1.
# The global NumPy RNG is seeded first so that Restart & Run All reproduces the same data.
np.random.seed(42)
df = pd.DataFrame(np.random.randint(0, 100, size=(10000, 6)), columns=['a', 'b', 'c', 'd', 'e', 'f'])
df['g'] = np.random.randint(0, 2, size=10000)

In [4]:
# Rename the seven columns to var_1 through var_6 followed by 'failure'.
df.columns = [*(f'var_{n}' for n in range(1, 7)), 'failure']

In [5]:
# Overwrite 'failure' with single-trial binomial draws: 1 with probability 0.1, otherwise 0.
df['failure'] = np.random.binomial(n=1, p=0.1, size=10000)

In [6]:
# For each row where failure is 1, collect a window spanning the 2 rows before it
# through the 1 row after it, and stack all windows into a new dataframe.
#
# The windows are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and re-building the frame on every iteration is quadratic.
# The window start is clamped at 0 because a negative start makes iloc count from the
# end of the frame, which yields an empty window for failures in the first two rows.
# Overlapping windows still contribute their shared rows more than once.
failure_positions = np.flatnonzero(df['failure'].to_numpy() == 1)
failure_windows = [df.iloc[max(pos - 2, 0):pos + 2] for pos in failure_positions]
# With no failures at all, fall back to an empty frame that keeps df's columns.
df_failure = pd.concat(failure_windows) if failure_windows else df.iloc[0:0]


In [7]:
# Preview the first 20 rows of the synthetic dataset.
df.head(n=20)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,failure
0,55,69,89,67,8,82,0
1,11,19,18,83,38,47,0
2,66,35,72,44,44,30,0
3,96,60,36,77,8,12,0
4,0,37,99,81,17,42,1
5,67,10,8,63,24,32,0
6,56,7,37,10,90,32,0
7,75,43,69,78,47,55,0
8,28,79,42,6,67,21,0
9,39,34,73,63,6,97,0


In [8]:
# Set difference by index label: keep the rows of df whose labels do not appear in df_failure.
df_diff = df.drop(index=df_failure.index)
df_diff.head(n=5)


Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,failure
0,55,69,89,67,8,82,0
1,11,19,18,83,38,47,0
6,56,7,37,10,90,32,0
7,75,43,69,78,47,55,0
8,28,79,42,6,67,21,0


In [9]:
# Preview the first 50 rows collected around the failures.
df_failure.head(n=50)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,failure
2,66,35,72,44,44,30,0
3,96,60,36,77,8,12,0
4,0,37,99,81,17,42,1
5,67,10,8,63,24,32,0
15,47,29,21,4,9,62,0
16,62,37,31,99,39,17,0
17,35,39,16,84,30,34,1
18,21,80,19,1,94,94,0
25,58,57,97,14,68,83,0
26,81,72,5,33,85,52,0


In [19]:
# Load the per-minute CSV from the data folder and index it by its 'date' column.
# The timestamps are parsed after loading with an explicit format rather than through
# read_csv's `date_parser` argument, which is deprecated since pandas 2.0.
df_time = pd.read_csv('data/fb_week_of_may_20_per_minute.csv')
df_time['date'] = pd.to_datetime(df_time['date'], format='%Y-%m-%d %H-%M')
df_time = df_time.set_index('date')

In [52]:
# Take the first 50 rows and move the index back into an ordinary column.
df_sample_time = df_time.head(50).reset_index()
df_sample_time

Unnamed: 0,date,open,high,low,close,volume
0,2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0
1,2019-05-20 09:31:00,182.61,182.61,182.61,182.61,468017.0
2,2019-05-20 09:32:00,182.7458,182.7458,182.7458,182.7458,97258.0
3,2019-05-20 09:33:00,182.95,182.95,182.95,182.95,43961.0
4,2019-05-20 09:34:00,183.06,183.06,183.06,183.06,79562.0
5,2019-05-20 09:35:00,183.05,183.05,183.05,183.05,68116.0
6,2019-05-20 09:36:00,182.6,182.6,182.6,182.6,62710.0
7,2019-05-20 09:37:00,182.4,182.4,182.4,182.4,49433.0
8,2019-05-20 09:38:00,182.25,182.25,182.25,182.25,52004.0
9,2019-05-20 09:39:00,182.3,182.3,182.3,182.3,88804.0


In [53]:
# Draw 10 random rows from df_sample_time and discard their last four columns.
df_merge = (
    df_sample_time
    .sample(n=10)
    .reset_index(drop=True)
    .pipe(lambda frame: frame.drop(columns=frame.columns[-4:]))
)
# Overwrite the remaining 'open' column with a random 0/1 flag.
df_merge['open'] = np.random.randint(low=0, high=2, size=10)
# Relabel the two remaining columns.
df_merge.columns = ['date', 'failure']

In [55]:
# Move every timestamp forward by 31 seconds. The cell reads and rewrites the same
# column, so running it again shifts the timestamps a further 31 seconds.
df_merge['date'] = df_merge['date'].add(pd.Timedelta(seconds=31))
# Order the rows chronologically.
df_merge = df_merge.sort_values(by='date')

In [57]:
# Display df_merge with a fresh 0..n-1 index. The result is not assigned,
# so df_merge itself keeps the index it had before this cell.
df_merge.reset_index(drop=True)

Unnamed: 0,date,failure
0,2019-05-20 09:38:02,1
1,2019-05-20 09:48:02,0
2,2019-05-20 09:57:02,0
3,2019-05-20 09:58:02,0
4,2019-05-20 10:00:02,1
5,2019-05-20 10:02:02,1
6,2019-05-20 10:06:02,0
7,2019-05-20 10:09:02,0
8,2019-05-20 10:17:02,0
9,2019-05-20 10:19:02,1


In [63]:
# As-of join on 'date': each row of df_sample_time takes the nearest df_merge row,
# but only when that row lies within 1 second; otherwise 'failure' is left missing.
pd.merge_asof(
    left=df_sample_time,
    right=df_merge,
    on='date',
    tolerance=pd.Timedelta('1s'),
    direction='nearest',
)


Unnamed: 0,date,open,high,low,close,volume,failure
0,2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0,
1,2019-05-20 09:31:00,182.61,182.61,182.61,182.61,468017.0,
2,2019-05-20 09:32:00,182.7458,182.7458,182.7458,182.7458,97258.0,
3,2019-05-20 09:33:00,182.95,182.95,182.95,182.95,43961.0,
4,2019-05-20 09:34:00,183.06,183.06,183.06,183.06,79562.0,
5,2019-05-20 09:35:00,183.05,183.05,183.05,183.05,68116.0,
6,2019-05-20 09:36:00,182.6,182.6,182.6,182.6,62710.0,
7,2019-05-20 09:37:00,182.4,182.4,182.4,182.4,49433.0,
8,2019-05-20 09:38:00,182.25,182.25,182.25,182.25,52004.0,
9,2019-05-20 09:39:00,182.3,182.3,182.3,182.3,88804.0,


In [67]:
# Ordered outer merge on the shared column(s); rows from both frames are interleaved in key order.
pd.merge_ordered(left=df_sample_time, right=df_merge, how='outer')

Unnamed: 0,date,open,high,low,close,volume,failure
0,2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0,
1,2019-05-20 09:31:00,182.61,182.61,182.61,182.61,468017.0,
2,2019-05-20 09:32:00,182.7458,182.7458,182.7458,182.7458,97258.0,
3,2019-05-20 09:33:00,182.95,182.95,182.95,182.95,43961.0,
4,2019-05-20 09:34:00,183.06,183.06,183.06,183.06,79562.0,
5,2019-05-20 09:35:00,183.05,183.05,183.05,183.05,68116.0,
6,2019-05-20 09:36:00,182.6,182.6,182.6,182.6,62710.0,
7,2019-05-20 09:37:00,182.4,182.4,182.4,182.4,49433.0,
8,2019-05-20 09:38:00,182.25,182.25,182.25,182.25,52004.0,
9,2019-05-20 09:38:02,,,,,,1.0


In [73]:
# Keep the ordered outer merge of the price rows and the failure rows for further processing.
df_test = pd.merge_ordered(left=df_sample_time, right=df_merge, how='outer')

In [74]:
# Move every 'failure' value up by one row, so each row receives the value of the row
# that followed it; the last row becomes NaN. Re-running this cell shifts again.
df_test['failure'] = df_test['failure'].shift(periods=-1)

In [77]:
# Keep only the rows that have an 'open' value, then replace the remaining
# missing 'failure' values with 0. The original index labels are kept.
df_test = (
    df_test
    .dropna(subset=['open'])
    .assign(failure=lambda frame: frame['failure'].fillna(0))
)
df_test


Unnamed: 0,date,open,high,low,close,volume,failure
0,2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0,0.0
1,2019-05-20 09:31:00,182.61,182.61,182.61,182.61,468017.0,0.0
2,2019-05-20 09:32:00,182.7458,182.7458,182.7458,182.7458,97258.0,0.0
3,2019-05-20 09:33:00,182.95,182.95,182.95,182.95,43961.0,0.0
4,2019-05-20 09:34:00,183.06,183.06,183.06,183.06,79562.0,0.0
5,2019-05-20 09:35:00,183.05,183.05,183.05,183.05,68116.0,0.0
6,2019-05-20 09:36:00,182.6,182.6,182.6,182.6,62710.0,0.0
7,2019-05-20 09:37:00,182.4,182.4,182.4,182.4,49433.0,0.0
8,2019-05-20 09:38:00,182.25,182.25,182.25,182.25,52004.0,1.0
10,2019-05-20 09:39:00,182.3,182.3,182.3,182.3,88804.0,0.0
