In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import *

In [2]:
sc = SparkContext()

In [3]:
sqlContext = SQLContext(sc)

In [9]:
logs = sqlContext.read.text('access_clean.log')

In [10]:
logs.show()

+--------------------+
|               value|
+--------------------+
|109.169.248.247 -...|
|109.169.248.247 -...|
|46.72.177.4 - - [...|
|46.72.177.4 - - [...|
|83.167.113.100 - ...|
|83.167.113.100 - ...|
|95.29.198.15 - - ...|
|95.29.198.15 - - ...|
|109.184.11.34 - -...|
|109.184.11.34 - -...|
|91.227.29.79 - - ...|
|91.227.29.79 - - ...|
|90.154.66.233 - -...|
|90.154.66.233 - -...|
|95.140.24.131 - -...|
|95.140.24.131 - -...|
|95.188.245.16 - -...|
|95.188.245.16 - -...|
|46.72.213.133 - -...|
|46.72.213.133 - -...|
+--------------------+
only showing top 20 rows



In [11]:
log_data=logs.rdd.map(lambda x:x[0].split(" ")).map(lambda arr:Row(remote_host=arr[0],timestamp=arr[3].replace("[",""),request_type=arr[5],url=arr[6],status_code=arr[8]))

In [12]:
log_data.collect()

[Row(remote_host='109.169.248.247', timestamp='12/Dec/2015:18:25:11', request_type='GET', url='/administrator/', status_code='200'),
 Row(remote_host='109.169.248.247', timestamp='12/Dec/2015:18:25:11', request_type='POST', url='/administrator/index.php', status_code='200'),
 Row(remote_host='46.72.177.4', timestamp='12/Dec/2015:18:31:08', request_type='GET', url='/administrator/', status_code='200'),
 Row(remote_host='46.72.177.4', timestamp='12/Dec/2015:18:31:08', request_type='POST', url='/administrator/index.php', status_code='200'),
 Row(remote_host='83.167.113.100', timestamp='12/Dec/2015:18:31:25', request_type='GET', url='/administrator/', status_code='200'),
 Row(remote_host='83.167.113.100', timestamp='12/Dec/2015:18:31:25', request_type='POST', url='/administrator/index.php', status_code='200'),
 Row(remote_host='95.29.198.15', timestamp='12/Dec/2015:18:32:10', request_type='GET', url='/administrator/', status_code='200'),
 Row(remote_host='95.29.198.15', timestamp='12/Dec/2

In [13]:
dataframe = log_data.toDF()

In [15]:
dataframe.show()

+---------------+--------------------+------------+--------------------+-----------+
|    remote_host|           timestamp|request_type|                 url|status_code|
+---------------+--------------------+------------+--------------------+-----------+
|109.169.248.247|12/Dec/2015:18:25:11|         GET|     /administrator/|        200|
|109.169.248.247|12/Dec/2015:18:25:11|        POST|/administrator/in...|        200|
|    46.72.177.4|12/Dec/2015:18:31:08|         GET|     /administrator/|        200|
|    46.72.177.4|12/Dec/2015:18:31:08|        POST|/administrator/in...|        200|
| 83.167.113.100|12/Dec/2015:18:31:25|         GET|     /administrator/|        200|
| 83.167.113.100|12/Dec/2015:18:31:25|        POST|/administrator/in...|        200|
|   95.29.198.15|12/Dec/2015:18:32:10|         GET|     /administrator/|        200|
|   95.29.198.15|12/Dec/2015:18:32:11|        POST|/administrator/in...|        200|
|  109.184.11.34|12/Dec/2015:18:32:56|         GET|     /administ

In [19]:
#find out how many 404 HTTP logs are in access logs

In [18]:
dataframe.filter(dataframe.status_code=="404").count()

227089

In [21]:
df2=dataframe.na.fill("404",["status_code"])

In [37]:
from pyspark.sql.functions import col

In [40]:
#Describe which HTTP status values appear in data and how many

http_df = df2.groupby(df2['status_code']).count().sort(col('count').desc()).show()

+-----------+-------+
|status_code|  count|
+-----------+-------+
|        200|1157831|
|        206| 939929|
|        404| 227089|
|        304|   6330|
|        500|   3252|
|        403|   2222|
|        301|    619|
|        303|    247|
|        401|    153|
|        501|    143|
|        405|     83|
|        406|     53|
|        400|     23|
|        412|     19|
|   HTTP/1.1|     13|
+-----------+-------+



In [41]:
#unique host and their average request
df2.select('remote_host').distinct().count()

40836

In [44]:
df2.groupby('remote_host').count().sort(col('count').desc()).show()

+--------------+------+
|   remote_host| count|
+--------------+------+
|198.50.156.189|167812|
| 5.112.235.245|166722|
| 5.114.231.216|158258|
|  5.113.18.208|157674|
| 91.218.225.68|134376|
| 79.62.229.212|114799|
|  149.56.83.40| 97533|
|  5.114.64.184| 94043|
| 5.113.216.211| 89125|
|  158.69.5.181| 88875|
|   5.113.35.73| 50224|
| 88.202.188.67| 42729|
| 5.114.237.218| 38755|
|  80.84.55.206| 37521|
|  5.112.66.178| 36564|
|205.167.170.15| 33302|
| 198.27.83.122| 22854|
|134.249.53.185| 17904|
| 178.159.37.81| 16358|
| 158.69.252.83| 14189|
+--------------+------+
only showing top 20 rows

