In [10]:
import re
#re library being used 
# code snippet that will parse an Apache access log line into a bunch of fields 
format_pat= re.compile(
    r"(?P<host>[\d\.]+)\s"
    r"(?P<identity>\S*)\s"
    r"(?P<user>\S*)\s"
    r"\[(?P<time>.*?)\]\s"
    r'"(?P<request>.*?)"\s'
    r"(?P<status>\d+)\s"
    r"(?P<bytes>\S*)\s"
    r'"(?P<referer>.*?)"\s'
    r'"(?P<user_agent>.*?)"\s*'
)
#the above fields are used by browser to view the page, together they make a regular expression. 
#using this regular expression by applying to each line of our access log , the information bits are grouped together into the different fields.
#above is a very powerful language for doing pattern matching on a large string. 

In [2]:
#analyzing a real log file- a real http access log from Apache 
logPath = "access_log.txt"

In [3]:
#the script below counts up each URL encountered that was requested and no. of times it was requested
#the idea is to sort the list containing URL and its count and then get the results.
URLCounts = {} #creating a dictionary

with open(logPath, "r") as f:
#to open a file in reading mode in python so as to extract contents and store the file in an object
    for line in (l.rstrip() for l in f):
    #extracting each line from the f object containing text file 
        match= format_pat.match(line)
        #applying our regular expression on each line of file 
        if match:
        #if any line matches the format of our regular expression i.e contains the fields 
            access = match.groupdict()

            request = access['request']
            #we are accessing the request field out of it i.e actual http request (page being requested out of browser)
            (action, URL, protocol) = request.split() #splitting the request into the fields of "action", "URL" and "protocol" 
            if URL in URLCounts: #checking if the URL of the current weblog already exists in our dictionary 
                URLCounts[URL] = URLCounts[URL] + 1 #the page visit count is increased
            else:
                URLCounts[URL] = 1 #the URL added as index in dictionary and visit count initiated as 1
    #the URL extracted from everyline of the weblog file with the above operation being done
results = sorted(URLCounts, key=lambda i: int(URLCounts[i]), reverse=True)
#the URLCounts dict is stored in results object and then sorted in reverse manner 
for result in results[:20]: #iterating through the first 20 elememnts of results 
    print(result + ": " + str(URLCounts[result]))

#the error we experience due to above code snippet is "we need more than one value to unpack"
#it means some request fields are not having all three "action", "URL" and "protocol" 

ValueError: not enough values to unpack (expected 3, got 1)

In [4]:
URLCounts = {}
#creating URLCounts dictionary to hold the URL in weblogs
with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line)
        if match:
            access = match.groupdict()
            request = access['request']
            fields = request.split()
            #splitting the request line in fields as per the parameters present 
            if (len(fields) != 3):
            #the length of the fields of request line is not equal to 3 then printing those fields 
                print(fields)
            #we get empty fields 

['_\\xb0ZP\\x07tR\\xe5']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [11]:
#code snippet to check for garbage values
URLCounts = {}

with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line)
        if match:
            access = match.groupdict()
            request = access['request']
            fields = request.split()
            if (len(fields) == 3):#taking entry of those fields only whose no. of fields = 3
                URL = fields[1]
                if URL in URLCounts:
                    URLCounts[URL] = URLCounts[URL] + 1
                else:
                    URLCounts[URL] = 1

results = sorted(URLCounts, key=lambda i: int(URLCounts[i]), reverse=True)

for result in results[:20]:
    #print(result + ": " + str(URLCounts[result]))

#we will get couple of PHP files these are pearl scripts,XML files and so on 
#not a useful result 
#the "/xmlrpc.php: 68494" denotes the site was under malicious attack and this perl script was being used to guess password. 
#the "/wp-login.php: 1923" was the login script for malicious log in trials
#The attacks attempted to execute stuff as well which is logged on weblog
# result - wanted : GET requests; unwanted : perl scripts, execution logs

/xmlrpc.php: 68494
/wp-login.php: 1923
/: 440
/blog/: 138
/robots.txt: 123
/sitemap_index.xml: 118
/post-sitemap.xml: 118
/page-sitemap.xml: 117
/category-sitemap.xml: 117
/orlando-headlines/: 95
/san-jose-headlines/: 85
http://51.254.206.142/httptest.php: 81
/comics-2/: 76
/travel/: 74
/entertainment/: 72
/business/: 70
/national/: 70
/national-headlines/: 70
/world/: 70
/weather/: 70


In [12]:
#code snippet to process only GET requests
URLCounts = {}

with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line) #boolean variable 
        if match:
            access = match.groupdict()
            request = access['request']
            fields = request.split()
            if (len(fields) == 3):#checking the length of the fields to filter out empty or incomplete requests 
                (action, URL, protocol) = fields
                if (action == 'GET'):#extracting only GET requests for webpages by assigning the action parameter as GET 
                    if URL in URLCounts:
                        URLCounts[URL] = URLCounts[URL] + 1
                    else:
                        URLCounts[URL] = 1

results = sorted(URLCounts, key=lambda i: int(URLCounts[i]), reverse=True)

for result in results[:20]:
    print(result + ": " + str(URLCounts[result]))
#the blog is getting more hits than the news page on the news website which is abnormal 
#Problem - many blog requests do not have user agent in them ; Cause - malicious traffic or scraper 
#humans with real browser - user agent : Mozilla or Firefox or Explorer 

/: 434
/blog/: 138
/robots.txt: 123
/sitemap_index.xml: 118
/post-sitemap.xml: 118
/page-sitemap.xml: 117
/category-sitemap.xml: 117
/orlando-headlines/: 95
/san-jose-headlines/: 85
http://51.254.206.142/httptest.php: 81
/comics-2/: 76
/travel/: 74
/entertainment/: 72
/business/: 70
/national/: 70
/national-headlines/: 70
/world/: 70
/weather/: 70
/about/: 69
/defense-sticking-head-sand/: 69


In [1]:
#code snippet to display different user agent and sort them
UserAgents = {}
#creating useragent dictionary 
with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line)
        if match:
            access = match.groupdict()
            agent = access['user_agent'] #accessing user_agent field from the access 
            if agent in UserAgents:
                UserAgents[agent] = UserAgents[agent] + 1
            else:
                UserAgents[agent] = 1

results = sorted(UserAgents, key=lambda i: int(UserAgents[i]), reverse=True)

for result in results:
    #print(result + ": " + str(UserAgents[result]))

#in result we see "Mozilla/4.0 (compatible: MSIE 7.0; Windows NT 6.0): 68484" - which is a malicious scraper pretending to be a legitimitate browser
# the dash "-" user agent : dont know the source but definitely not an actual browser 
# the useragents dict is polluted by web crawlers i.e search engine - Baidu (SE in China), Yandex - (SE in Russia), Google bot and Bing bot. These web crawlers or spiders mine the website for search engine purposes. 
# web crawlers and bot traffic doesnt count for analysis as it required the traffic by humans.(they are automated scripts)

SyntaxError: incomplete input (4221330639.py, line 23)

In [14]:
#Problem - Identifying spiders or robots user agents from user string alone
#code snippet to identify the spiders or robots or web crawlers 
URLCounts = {}

with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line)
        if match:
            access = match.groupdict()
            agent = access['user_agent']
            if (not('bot' in agent or 'spider' in agent or 
                    'Bot' in agent or 'Spider' in agent or
                    'W3 Total Cache' in agent or agent =='-')):
            #filtering out spiders and "-" user agent by filtering out the user agents containing word bot or spider or 'W3 Total Cache' anything in caching plugin 
                request = access['request']
                fields = request.split()
                if (len(fields) == 3):
                    (action, URL, protocol) = fields
                    if (action == 'GET'):
                        if URL in URLCounts:
                            URLCounts[URL] = URLCounts[URL] + 1
                        else:
                            URLCounts[URL] = 1

results = sorted(URLCounts, key=lambda i: int(URLCounts[i]), reverse=True)

for result in results[:20]:
    print(result + ": " + str(URLCounts[result]))
#the results show that apart from webpages there are some scripts and bunch of css files 
#as we only require webpages and given: "the pages in site end with a slash in url"

/: 77
/orlando-headlines/: 36
/?page_id=34248: 28
/wp-content/cache/minify/000000/M9AvyUjVzUstLy7PLErVz8lMKkosqtTPKtYvTi7KLCgpBgA.js: 27
/wp-content/cache/minify/000000/M9bPKixNLarUy00szs8D0Zl5AA.js: 27
/wp-content/cache/minify/000000/lY7dDoIwDIVfiG0KxkfxfnbdKO4HuxICTy-it8Zw15PzfSftzPCckJem-x4qUWArqBPl5mygZLEgyhdOaoxToGyGaiALiOfUnIz0qDLOdSZGE-nOlpc3kopDzrSyavVVt_veb5qSDVhjsQ6dHh_B_eE_z2pYIGJ7iBWKeEio_eT9UQe4xHhDll27mGRryVu_pRc.js: 27
/wp-content/cache/minify/000000/fY45DoAwDAQ_FMvkRQgFA5ZyWLajiN9zNHR0O83MRkyt-pIctqYFJPedKyYzfHg2PzOFiENAzaD07AxcpKmTolORvDjZt8KEfhBUGjZYCf8Fb0fvA1TXCw.css: 25
/?author=1: 21
/wp-content/cache/minify/000000/hcrRCYAwDAXAhXyEjiQ1YKAh4SVSx3cE7_uG7ASr4M9qg3kGWyk1adklK84LHtRj_My6Y0Pfqcz-AA.js: 20
/wp-content/uploads/2014/11/nhn1.png: 19
/wp-includes/js/wp-emoji-release.min.js?ver=4.3.1: 17
/wp-content/cache/minify/000000/BcGBCQAgCATAiUSaKYSERPk3avzuht4SkBJnt4tHJdqgnPBqKldesTcN1R8.js: 17
/wp-login.php: 16
/comics-2/: 12
/world/: 12
/favicon.ico: 10
/wp-content/up

In [15]:
#code snippet that refines the search and removes any URL entry that doesnt end with a slash
URLCounts = {}

with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line)
        if match:
            access = match.groupdict()
            agent = access['user_agent']
            if (not('bot' in agent or 'spider' in agent or 
                    'Bot' in agent or 'Spider' in agent or
                    'W3 Total Cache' in agent or agent =='-')):
                request = access['request']
                fields = request.split()
                if (len(fields) == 3):
                    (action, URL, protocol) = fields
                    if (URL.endswith("/")):
                        if (action == 'GET'):
                            if URL in URLCounts:
                                URLCounts[URL] = URLCounts[URL] + 1
                            else:
                                URLCounts[URL] = 1

results = sorted(URLCounts, key=lambda i: int(URLCounts[i]), reverse=True)

for result in results[:20]:
    print(result + ": " + str(URLCounts[result]))
#the results show that feed pages but in actuality many are because of robots trying to to get RSS data from website 

/: 77
/orlando-headlines/: 36
/comics-2/: 12
/world/: 12
/weather/: 4
/australia/: 4
/about/: 4
/national-headlines/: 3
/feed/: 2
/sample-page/feed/: 2
/science/: 2
/technology/: 2
/entertainment/: 1
/san-jose-headlines/: 1
/business/: 1
/travel/feed/: 1


In [16]:
URLCounts = {}
#code snippet to filter out the feed pages from frequently visited pages of the site 
with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line)
        if match:
            access = match.groupdict()
            agent = access['user_agent']
            if (not('bot' in agent or 'spider' in agent or 
                    'Bot' in agent or 'Spider' in agent or
                    'W3 Total Cache' in agent or agent =='-')):
                request = access['request']
                fields = request.split()
                if (len(fields) == 3):
                    (action, URL, protocol) = fields
                    if (URL.endswith("/")):
                        if (action == 'GET'):
                            if (not('feed' in URL)):
                                if URL in URLCounts:
                                    URLCounts[URL] = URLCounts[URL] + 1
                                else:
                                    URLCounts[URL] = 1

results = sorted(URLCounts, key=lambda i: int(URLCounts[i]), reverse=True)

for result in results[:20]:
    print(result + ": " + str(URLCounts[result]))

/: 77
/orlando-headlines/: 36
/comics-2/: 12
/world/: 12
/weather/: 4
/australia/: 4
/about/: 4
/national-headlines/: 3
/science/: 2
/technology/: 2
/entertainment/: 1
/san-jose-headlines/: 1
/business/: 1


In [17]:
FeedAgents = {}
#code snippet to visualize the user agents accessing the feed pages 
with open(logPath, "r") as f:
    for line in (l.rstrip() for l in f):
        match= format_pat.match(line)
        if match:
            access = match.groupdict()
            agent = access['user_agent']
            if (not('bot' in agent or 'spider' in agent or 
                    'Bot' in agent or 'Spider' in agent or
                    'W3 Total Cache' in agent or agent =='-')):
                request = access['request']
                fields = request.split()
                if (len(fields) == 3):
                    (action, URL, protocol) = fields
                    if (URL.endswith("/")):
                        if (action == 'GET'):
                            if ('feed' in URL):
                                if agent in FeedAgents:
                                    FeedAgents[agent]+=1
                                else:
                                    FeedAgents[agent]=1


results = sorted(FeedAgents, key=lambda i: int(FeedAgents[i]), reverse=True)

for result in results[:20]:
    print(result + ": " + str(FeedAgents[result]))

Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.66 Safari/537.36: 4
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0): 1
