In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import math

In [2]:
# Getting the number of unique IP addresses per country appearing in the dataset df and adding the `country` column...
def duplicate_filter(df):
    """Add a ``country`` column derived from ``geo`` and drop repeated IPs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ip`` column and a ``geo`` column. A ``geo`` value
        that is a dict supplies the country through its ``'c'`` key; any
        other value (``None``, NaN, an empty dict, a dict without ``'c'``)
        yields an empty-string country.

    Returns
    -------
    tuple
        ``(df_unique, per_country_counts)``: ``df_unique`` keeps the first
        row for each distinct ``ip`` and ``per_country_counts`` is
        ``df_unique.country.value_counts()``.

    Notes
    -----
    The ``country`` column is written onto the frame that was passed in.
    """
    # Build the whole column positionally in a single pass. A row loop that
    # reads with ``iloc`` but writes with label-based ``at`` only lines up on
    # a default RangeIndex, and a bare truthiness test lets NaN through
    # (NaN is truthy), which then fails on ``nan['c']``.
    df['country'] = [
        geo.get('c', '') if isinstance(geo, dict) else ''
        for geo in df['geo']
    ]
    df_unique = df.drop_duplicates(subset='ip')
    per_country_counts = df_unique.country.value_counts()
    return (df_unique, per_country_counts)

In [3]:
# Cleaning the banner of unwanted characters...
def clean_banner(df):
    """Strip escape residue and leading whitespace from every banner.

    Two kinds of text are removed from each string in the ``banner`` column:
    a literal backslash followed by ``x`` and two characters drawn from
    ``a-z`` / ``0-9``, and carriage-return + line-feed pairs. The result is
    then left-stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``banner`` column. Values that are not strings
        (e.g. NaN for a missing banner) are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``banner`` rewritten in place.
    """
    unwanted = re.compile('(\\\\x(([a-z][a-z])|([0-9][0-9])|([0-9][a-z])|([a-z][0-9])))|\\r\\n')
    # Assign the column positionally in one go: mixing positional ``iloc``
    # reads with label-based ``at`` writes touches the wrong rows (or adds
    # new ones) whenever the index is not 0..n-1, e.g. after drop_duplicates.
    df['banner'] = [
        unwanted.sub('', banner).lstrip() if isinstance(banner, str) else banner
        for banner in df['banner']
    ]
    return df

In [4]:
# Counting routers...
def count_routers(df):
    """Split the rows of ``df`` into routers and everything else.

    A row counts as a router when the lower-cased text of its ``banner``
    contains the substring ``"router"``; every other row counts as a server.

    Returns
    -------
    tuple
        ``(count_router, count_server)``.
    """
    banners = df["banner"]
    router_hits = sum(1 for text in banners if "router" in text.lower())
    # Whatever is not a router is tallied as a server.
    return (router_hits, len(banners) - router_hits)

In [5]:
# Searching banners for the implementations listed in `keywords` and collecting the matching IPs for each country...
def country_banner_filter(df, countries=('DEU', 'NLD', 'GBR'),
                          keywords=('cisco', 'd-link', 'freebsd', 'jetdirect', 'netgear', 'solaris')):
    """Collect, per country, the IPs whose banner mentions a known keyword.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``country``, ``banner`` and ``ip`` columns.
    countries : iterable of str, optional
        Country codes to report on. Rows from other countries are ignored.
    keywords : iterable of str, optional
        Lower-case substrings to look for in the lower-cased banner.

    Returns
    -------
    dict
        Maps every requested country code to the list of ``ip`` values (in
        row order) whose banner contains at least one keyword. Countries
        with no match map to an empty list.
    """
    potential_ips = {country: [] for country in countries}
    selected = df[df['country'].isin(list(potential_ips))]

    for country, banner, ip in zip(selected['country'], selected['banner'], selected['ip']):
        if not isinstance(banner, str):
            # A missing banner cannot contain any keyword.
            continue
        banner_info = banner.lower()
        # ``any`` stops at the first hit, so each row contributes its IP at
        # most once. Reassigning the index of a ``for ... in range`` loop
        # does not end that loop, which would record an IP once per keyword.
        if any(keyword in banner_info for keyword in keywords):
            potential_ips[country].append(ip)

    return potential_ips

In [6]:
def extract_banner_wise_data(filename):
    """Load a line-delimited JSON file and run the banner analysis on it.

    The file is read with ``pd.read_json(..., lines=True)``, passed through
    ``duplicate_filter`` and the resulting frame is then scanned by
    ``country_banner_filter``. Progress messages are printed along the way.

    Returns
    -------
    tuple
        ``(df, country_stats, potential_ips)`` where the first two items are
        the outputs of ``duplicate_filter`` and the third is the output of
        ``country_banner_filter``.
    """
    print("Extracting banner data for", filename)
    records = pd.read_json(filename, lines=True)

    print('Filtering countries and duplicates...')
    unique_records, country_stats = duplicate_filter(records)

    print('Scanning banners...')
    potential_ips = country_banner_filter(unique_records)

    return (unique_records, country_stats, potential_ips)

In [7]:
def extract_ips(filename):
    """Load a line-delimited JSON file and de-duplicate it by IP.

    The file is read with ``pd.read_json(..., lines=True)``; once it has
    loaded, a ``Processing`` message is printed and the frame is handed to
    ``duplicate_filter``.

    Returns
    -------
    tuple
        The ``(frame, per_country_counts)`` pair produced by
        ``duplicate_filter``.
    """
    loaded = pd.read_json(filename, lines=True)
    print('Processing ', filename)
    unique_rows, per_country = duplicate_filter(loaded)
    return unique_rows, per_country

In [8]:
# Load the scan results one month at a time. Each `extract_ips` call returns
# the de-duplicated frame and the per-country counts for that file.
# Files ending in `_23` are bound to `*_telnet` names and files ending in `_22`
# to `*_ssh` names (presumably the scanned port numbers -- TODO confirm).
# Only August 2012 is active; the other months are left commented out so they
# can be re-enabled by hand.
# df_mar13_telnet, country_stats_mar13_telnet = extract_ips("../Data files/critical_201303_23.json")
# df_mar13_ssh, country_stats_mar13_ssh = extract_ips("../Data files/critical_201303_22.json")

# df_jan13_telnet, country_stats_jan13_telnet = extract_ips("../Data files/critical_201301_23.json")
# df_jan13_ssh, country_stats_jan13_ssh = extract_ips("../Data files/critical_201301_22.json")

# df_feb13_telnet, country_stats_feb13_telnet = extract_ips("../Data files/critical_201302_23.json")
# df_feb13_ssh, country_stats_feb13_ssh = extract_ips("../Data files/critical_201302_22.json")

# df_dec12_telnet, country_stats_dec12_telnet = extract_ips("../Data files/critical_201212_23.json")
# df_dec12_ssh, country_stats_dec12_ssh = extract_ips("../Data files/critical_201212_22.json")

# df_nov12_telnet, country_stats_nov12_telnet = extract_ips("../Data files/critical_201211_23.json")
# df_nov12_ssh, country_stats_nov12_ssh = extract_ips("../Data files/critical_201211_22.json")

# df_may12_telnet, country_stats_may12_telnet = extract_ips("../Data files/critical_201205_23.json")
# df_may12_ssh, country_stats_may12_ssh = extract_ips("../Data files/critical_201205_22.json")


df_aug12_telnet, country_stats_aug12_telnet = extract_ips("../Data files/critical_201208_23.json")
df_aug12_ssh, country_stats_aug12_ssh = extract_ips("../Data files/critical_201208_22.json")

Processing  ../Data files/critical_201208_23.json
Processing  ../Data files/critical_201208_22.json


In [9]:
def get_ratios(country_stats_telnet, country_stats_ssh, countries=['NLD', 'DEU', 'GBR', 'USA', 'AUS', 'CAN', 'FIN', 'FRA', 'DNK']):
    """Print, for each country, telnet / (telnet + ssh).

    Parameters
    ----------
    country_stats_telnet, country_stats_ssh : mapping with ``.get``
        Per-country counts, e.g. the ``value_counts()`` Series returned by
        ``duplicate_filter`` (a plain dict works too).
    countries : iterable of str, optional
        Country codes to report, printed in the given order.

    Notes
    -----
    A country absent from one of the inputs is counted as 0 there. When both
    counts are 0 the ratio is undefined and ``nan`` is printed.
    """
    for country in countries:
        # ``value_counts`` has no entry for a country with no rows, so plain
        # ``[]`` indexing would raise KeyError instead of reporting zero.
        telnet = country_stats_telnet.get(country, 0)
        ssh = country_stats_ssh.get(country, 0)
        total = telnet + ssh
        ratio = telnet / total if total else float('nan')
        print(country, ratio)

In [10]:
def get_nos(country_stats_telnet, countries=['NLD', 'DEU', 'GBR', 'USA', 'AUS', 'CAN', 'FIN', 'FRA', 'DNK']):
    """Print the count recorded for each requested country.

    Parameters
    ----------
    country_stats_telnet : mapping with ``.get``
        Per-country counts, e.g. the ``value_counts()`` Series returned by
        ``duplicate_filter`` (a plain dict works too).
    countries : iterable of str, optional
        Country codes to report, printed in the given order.

    Notes
    -----
    A country that does not appear in the counts is printed as 0.
    """
    for country in countries:
        # ``value_counts`` has no entry for a country with no rows, so plain
        # ``[]`` indexing would raise KeyError instead of reporting zero.
        print(country, country_stats_telnet.get(country, 0))

In [11]:
# Print telnet / (telnet + ssh) per country for the August 2012 counts loaded above.
get_ratios(country_stats_aug12_telnet, country_stats_aug12_ssh)
# get_nos(country_stats_may12_telnet)

NLD 0.16277154981316105
DEU 0.1453117467009003
GBR 0.28449654315590883
USA 0.24275285516377446
AUS 0.3889538661468486
CAN 0.3428277007423415
FIN 0.19080906148867313
FRA 0.3022993178840855
DNK 0.25638482519216466
