In [1]:
import pandas as pd

In [2]:
# Load the raw portal export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/all_portal_data.csv')

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,id,firsttime,lasttime,response,ip,as,domain,country,source,inetnum,netname
0,198137,2011-09-27 13:10:18,2012-03-25 08:17:41,dead,222.165.135.195,AS9329,wpc.gov.lk,LK,APNIC,222.165.135.0 - 222.165.135.255,CUS-LAN-SLT-LK
1,198193,2011-09-27 13:10:19,2013-06-24 17:22:49,dead,74.53.224.194,AS32613,hospitalsanbernabe.gov.co,US,ARIN,198.50.100.48 - 198.50.100.55,IWEB-CL-T118-490CL-1414
2,198221,2011-09-27 13:10:19,2014-01-26 19:16:56,dead,210.56.24.2,AS7590,un.org.pk,PK,APNIC,210.56.0.0 - 210.56.31.255,COMSATS
3,198341,2011-09-27 13:10:20,2013-02-22 04:40:33,dead,84.51.33.86,AS34104,espiyemeb.gov.tr,TR,RIPE,84.51.33.0 - 84.51.33.255,MARKUM_BILISIM_3
4,198382,2011-09-27 13:10:21,2012-03-03 12:38:32,dead,74.53.224.194,AS21844,hospitalsanbernabe.gov.co,US,ARIN,74.52.0.0 - 74.53.255.255,NETBLK-THEPLANET-BLK-14


In [4]:
# Parse both timestamp columns from their 'YYYY-MM-DD HH:MM:SS' text form.
for column in ('firsttime', 'lasttime'):
    df[column] = pd.to_datetime(df[column], format="%Y-%m-%d %H:%M:%S")

# Unfiltered snapshot; later cells compare its length with df to report
# how many rows have been removed so far.
static_df = df.copy()

# The final month does not hold enough data to draw conclusions from,
# so rows first seen on or after this moment are discarded later on.
break_time = pd.Timestamp('2017-04-01 00:00:00')

# Sanity threshold for the cleaning steps: all rows except 2 start in 2011,
# so any timestamp before the year 2000 is treated as invalid.
split_time = pd.Timestamp('2000-01-01 00:00:00')

# The latest day in the dataset is 2017-04-07, so 2017-04-08 serves as the
# lasttime of connections that are still alive.
latest_time = pd.Timestamp('2017-04-08 00:00:00')

In [5]:
# April 2017 holds too little data, so keep only the rows first seen
# before break_time (i.e. up to and including March 2017).
df = df[df['firsttime'].lt(break_time)]

In [6]:
# Number of rows removed so far, relative to the unfiltered snapshot in static_df.
print(static_df.shape[0] - df.shape[0])

7000


In [7]:
# Two clean-ups in one chain:
#   1. keep only rows whose firsttime lies after split_time (removes the
#      2 rows that start before 2000);
#   2. drop the 2 rows, addressed by their index labels, that have an endtime
#      before 2000 but a response of closed.
# NOTE(review): the index labels are hard-coded, so this raises KeyError when
# the cell is re-run or when the input file changes.
df = df[df['firsttime'] > split_time].drop(index=[881412, 881413])

In [8]:
# Number of rows removed so far, relative to the unfiltered snapshot in static_df.
print(static_df.shape[0] - df.shape[0])

7004


In [9]:
def fix_lasttime(x, cutoff=None, replacement=None):
    """Replace an invalid ``lasttime`` on a single row.

    Rows of alive connections do not carry a proper ``lasttime``; any value
    earlier than ``cutoff`` is treated as such a placeholder and is replaced
    by ``replacement``.

    Parameters
    ----------
    x : pandas.Series
        One row of the frame; must contain a ``'lasttime'`` entry.
    cutoff : timestamp-like, optional
        Values strictly before this moment count as invalid. Defaults to the
        notebook-level ``split_time``.
    replacement : timestamp-like, optional
        Value written in place of an invalid ``lasttime``. Defaults to the
        notebook-level ``latest_time``.

    Returns
    -------
    pandas.Series
        The row itself when nothing had to change, otherwise a corrected copy.
    """
    if cutoff is None:
        cutoff = split_time
    if replacement is None:
        replacement = latest_time

    if x['lasttime'] < cutoff:
        # Work on a copy: DataFrame.apply does not support functions that
        # mutate the object passed to them.
        x = x.copy()
        x['lasttime'] = replacement
    return x

In [10]:
# Give every row whose lasttime lies before split_time (alive connections
# without a proper lasttime) the value latest_time instead.
# This is the vectorised equivalent of applying fix_lasttime to each row:
# it avoids a Python-level call per row, and building a new frame with
# assign() keeps the cell safe to re-run.
df = df.assign(lasttime=df['lasttime'].mask(df['lasttime'] < split_time, latest_time))

In [11]:
# After the lasttime fix, a valid row has a lasttime that is not earlier than
# its firsttime; rows that violate this are dropped.
df = df[df['lasttime'].ge(df['firsttime'])]

In [12]:
# Number of rows removed so far, relative to the unfiltered snapshot in static_df.
print(static_df.shape[0] - df.shape[0])

67695


In [13]:
# Persist the cleaned frame (the row index is written as the first column).
df.to_csv(path_or_buf='./data/all_portal_data_cleaned.csv')

In [14]:
# Display the cleaned frame; pandas abbreviates the rendering of a long frame
# to its first and last rows.
df

Unnamed: 0,id,firsttime,lasttime,response,ip,as,domain,country,source,inetnum,netname
0,198137,2011-09-27 13:10:18,2012-03-25 08:17:41,dead,222.165.135.195,AS9329,wpc.gov.lk,LK,APNIC,222.165.135.0 - 222.165.135.255,CUS-LAN-SLT-LK
1,198193,2011-09-27 13:10:19,2013-06-24 17:22:49,dead,74.53.224.194,AS32613,hospitalsanbernabe.gov.co,US,ARIN,198.50.100.48 - 198.50.100.55,IWEB-CL-T118-490CL-1414
2,198221,2011-09-27 13:10:19,2014-01-26 19:16:56,dead,210.56.24.2,AS7590,un.org.pk,PK,APNIC,210.56.0.0 - 210.56.31.255,COMSATS
3,198341,2011-09-27 13:10:20,2013-02-22 04:40:33,dead,84.51.33.86,AS34104,espiyemeb.gov.tr,TR,RIPE,84.51.33.0 - 84.51.33.255,MARKUM_BILISIM_3
4,198382,2011-09-27 13:10:21,2012-03-03 12:38:32,dead,74.53.224.194,AS21844,hospitalsanbernabe.gov.co,US,ARIN,74.52.0.0 - 74.53.255.255,NETBLK-THEPLANET-BLK-14
...,...,...,...,...,...,...,...,...,...,...,...
940671,37300284,2017-03-16 19:46:09,2017-04-08 00:00:00,alive,198.136.54.34,AS33182,jelanifranklyndesign.com,US,ARIN,198.136.48.0 - 198.136.63.255,DIMENOC
940728,37311774,2017-02-22 17:24:25,2017-04-08 00:00:00,alive,209.135.140.125,AS6428,hbs.ma,US,ARIN,209.135.140.0 - 209.135.140.255,ROSEHOSTING-209-135-140
940743,37318154,2017-02-15 15:13:01,2017-04-08 00:00:00,alive,172.217.18.179,AS15169,almohtarifdz.com,US,ARIN,216.58.192.0 - 216.58.223.255,GOOGLE
940956,37370908,2016-12-23 15:34:29,2017-04-08 00:00:00,alive,198.136.54.34,AS33182,anansipublications.com,US,ARIN,198.136.48.0 - 198.136.63.255,DIMENOC
