In [1]:
import pandas as pd
import pprint 
import re
from IPython.display import clear_output,display,Markdown

# Helper used for headings: renders Markdown (e.g. **bold** text) in cell output.
def printmd(string):
    """Show ``string`` in the notebook output area as rendered Markdown."""
    rendered = Markdown(string)
    display(rendered)


In [2]:
# Most expensive step of the notebook: re-run this cell only when the input
# file has changed.
# Because of limited processing power, only a sample of the data is used
# (about 65k entries according to the original author's note).
# NOTE(review): sep="\r" is presumably chosen so that every line stays intact
# in the single 'Full_text' column -- confirm against the input file.
log = (
    pd.read_csv('sample_log', sep="\r", names=['Full_text'], encoding='latin1')
    # Trim surrounding whitespace from each raw line.
    .assign(Full_text=lambda frame: frame['Full_text'].str.strip())
)


In [3]:
# Break every raw line into six fields using one regex of alternative delimiters.
# * The pattern is a raw string so that \[ \] and \d reach the regex engine
#   without triggering Python's invalid-escape-sequence warning.
# * The last alternative is ' (?=\d)' (a space *followed by* a digit) instead of
#   ' \d': a delimiter is discarded by split, so ' \d' silently dropped the first
#   digit of the field that follows it (the response size). The lookahead splits
#   at exactly the same positions but keeps that digit.
log[['Source_ip', 'Time_stamp', 'Request_type', 'Resource_path', 'Status_code', 'Response_size_in_bytes']] = (
    log['Full_text'].str.split(r'; | - - \[|\] "| /|" | (?=\d)', expand=True)
)

# Replace empty-string cells with 0. Note that the mask covers the whole frame,
# not only Response_size_in_bytes.
log[log.eq('')] = 0

# When no ' <digit>' follows the status code (presumably a '-' response size --
# verify against the data) the remainder stays attached to Status_code, e.g.
# '200 -'. Keep only the part before the first '-'.
log['Status_code'] = log['Status_code'].str.split('-', n=1, expand=True)[0]

# Convert Status_code to a numeric dtype so it can be compared below.
log['Status_code'] = pd.to_numeric(log['Status_code'])

# Partition the frame by response status: >= 400 is treated as unsuccessful,
# <= 399 as successful. Rows whose status is NaN fall into neither frame.
not_success_log = log[log['Status_code'] >= 400]
success_log = log[log['Status_code'] <= 399]

#log.head()

In [4]:
# Function to get the top N (default 10) most requested resources / pages
def top_10_most_requested_resources(n=10):
    """Print the ``n`` most frequently requested resource paths.

    Counts the values of ``log['Resource_path']`` (module-level frame), prints
    the ``n`` largest counts and then calls ``cli()`` to show the menu again.

    Args:
        n: Number of rows to show. Defaults to 10, so the argument-less call
            made by the menu dispatcher behaves exactly as before.

    Returns:
        None.
    """
    printmd("**Top {} accessed resources:**".format(n))
    # head(n) is purely positional, whereas [:n] depends on how the index
    # labels are interpreted.
    pprint.pprint(log['Resource_path'].value_counts().head(n))
    cli()

In [5]:
# Function to calculate the percentage of successful and unsuccessful requests
def Percentage_of_successful_and_unsuccessful_requests():
    """Print the share of successful vs. unsuccessful requests, in percent.

    The shares are derived from the row counts of the module-level frames
    ``success_log`` and ``not_success_log``. When both frames are empty the
    shares are reported as 0.0 instead of raising ``ZeroDivisionError``.
    After printing, ``cli()`` is called to show the menu again.

    Returns:
        None.
    """
    ok_count = len(success_log.index)
    failed_count = len(not_success_log.index)
    total = ok_count + failed_count

    if total:
        success = (ok_count / total) * 100
        not_success = (failed_count / total) * 100
    else:
        # Nothing was classified (e.g. empty input file): avoid dividing by zero.
        success = not_success = 0.0

    printmd("**Percentage of successful and unsuccessful requests:**")
    print("Successful requests in % : ", success)
    print("Unsuccessful requests in % : ", not_success)

    cli()


In [6]:
# Function to get the top N (default 10) unsuccessful requests
def top_10_unsuccessful_requests(n=10):
    """Print the ``n`` resource paths with the most unsuccessful requests.

    Counts the values of ``not_success_log['Resource_path']`` (module-level
    frame), prints the ``n`` largest counts and then calls ``cli()`` to show
    the menu again.

    Args:
        n: Number of rows to show. Defaults to 10, so the argument-less call
            made by the menu dispatcher behaves exactly as before.

    Returns:
        None.
    """
    printmd("**Top {} Unsuccessful requests:**".format(n))
    # head(n) is purely positional, whereas [:n] depends on how the index
    # labels are interpreted.
    pprint.pprint(not_success_log['Resource_path'].value_counts().head(n))
    cli()

In [7]:
# Function to show how requests are distributed over the status codes
def Percentile_of_requests():
    """Print the relative frequency of every status code, then reopen the menu.

    The figures come from ``value_counts(normalize=True)`` on
    ``log['Status_code']``, i.e. they are fractions that sum to 1 rather than
    values scaled to 100.
    """
    printmd ("**Normalised percentages of requests:**")
    status_share = log['Status_code'].value_counts(normalize=True)
    pprint.pprint(status_share)
    cli()

In [8]:
# Function to get the top N (default 10) hosts
def top_10_hosts(n=10):
    """Print the ``n`` hosts that issued the most requests.

    Counts the values of ``log['Source_ip']`` (module-level frame), prints the
    ``n`` largest counts and then calls ``cli()`` to show the menu again.

    Args:
        n: Number of rows to show. Defaults to 10, so the argument-less call
            made by the menu dispatcher behaves exactly as before.

    Returns:
        None.
    """
    printmd("**Top {} hosts interacting with the system:**".format(n))
    # head(n) is purely positional, whereas [:n] depends on how the index
    # labels are interpreted.
    pprint.pprint(log['Source_ip'].value_counts().head(n))
    cli()

In [9]:
# Function to get the most frequent (host, resource) combinations
def detail_top_10_hosts(n=22):
    """Print the ``n`` most frequent (Source_ip, Resource_path) pairs.

    Groups the module-level ``log`` frame by host and resource, sorts the
    group sizes in descending order, prints the first ``n`` rows and then
    calls ``cli()`` to show the menu again.

    Args:
        n: Number of rows to show. Defaults to 22, the value that used to be
            hard-coded, so the argument-less call from the menu is unchanged.

    Returns:
        None.
    """
    # NOTE(review): the heading promises the "top 10 hosts", but the rows are
    # the top pairs overall and are not restricted to the ten busiest hosts.
    # Confirm which of the two is intended.
    printmd("**Top 10 hosts with their top page hits:**")
    pair_counts = log.groupby(['Source_ip', 'Resource_path']).size()
    pprint.pprint(pair_counts.sort_values(ascending=False).head(n))
    cli()

In [10]:
def cli():
    """Show the report menu, read one choice from the user and act on it.

    Behaviour per input (surrounding whitespace is ignored):
      * ``'1'`` .. ``'6'``: run the matching report function once, then return.
      * ``'x'`` (either case): print the goodbye message and return.
      * anything else: print the invalid-input message and prompt again. This
        is done in a loop, so repeated bad input no longer adds a stack frame
        per attempt.

    Returns:
        None.
    """
    # A single table drives both the printed menu and the dispatch, so the
    # labels and the handlers cannot drift apart.
    menu = (
        ('1', 'Top 10 most requested resources', top_10_most_requested_resources),
        ('2', 'Percentage of Successful and Unsuccessful requests',
         Percentage_of_successful_and_unsuccessful_requests),
        ('3', 'Top 10 Unsuccessful requests', top_10_unsuccessful_requests),
        ('4', 'Top 10 hosts interacting with the system', top_10_hosts),
        ('5', 'Top 10 hosts with their top page hits', detail_top_10_hosts),
        ('6', 'Percentile of requests', Percentile_of_requests),
    )
    handlers = {key: handler for key, _label, handler in menu}

    while True:
        printmd("**Select from the below options:**")
        for key, label, _handler in menu:
            print('{}:{}'.format(key, label))
        print('x:To close')

        user_input = input().strip()
        # wait=True defers the clearing until new output arrives, which keeps
        # the cell from flickering.
        clear_output(wait=True)

        if user_input in handlers:
            # Dispatch exactly once: the report is responsible for whatever
            # happens after it has printed its result.
            handlers[user_input]()
            return
        if user_input.lower() == 'x':
            printmd("**Thank you; You have successfully exited.**")
            return

        # Unknown option (random text, out-of-range numbers, empty input):
        # report it and show the menu again.
        printmd("**Invalid input;Try again!.**")


In [None]:
# Interactive entry point: run this cell to display the options menu and
# pick a report; enter x at the prompt to close the menu.
cli()

**Select from the below options:**

1:Top 10 most requested resources
2:Percentage of Successful and Unsuccessful requests
3:Top 10 Unsuccessful requests
4:Top 10 hosts interacting with the system
5:Top 10 hosts with their top page hits
6:Percentile of requests
x:To close
