In [40]:
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession
    
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at /var/folders/q8/5zm40l013b957d609xmckgn00000gp/T/ipykernel_25086/3539456616.py:5 

In [41]:
import re
from datetime import datetime

In [42]:
import glob

raw_data_files = glob.glob('data/*.gz')
raw_data_files

['data/NASA_access_log_Jul95.gz', 'data/NASA_access_log_Aug95.gz']

In [43]:
raw_logs_df = spark.read.text(raw_data_files)
raw_logs_df

DataFrame[value: string]

### Viewing sample data in our dataframe

In [44]:
raw_logs_df.take(10)

[Row(value='199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245'),
 Row(value='unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985'),
 Row(value='199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085'),
 Row(value='burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0'),
 Row(value='199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179'),
 Row(value='burger.letters.com - - [01/Jul/1995:00:00:12 -0400] "GET /images/NASA-logosmall.gif HTTP/1.0" 304 0'),
 Row(value='burger.letters.com - - [01/Jul/1995:00:00:12 -0400] "GET /shuttle/countdown/video/livevideo.gif HTTP/1.0" 200 0'),
 Row(value='205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] "GET /shuttle/countdown/countdown.html HTTP/1.0" 200 3985'),
 Row(value='d104.aa.net - - [01/Ju

In [6]:
print((raw_logs_df.count(), len(raw_logs_df.columns)))

[Stage 1:>                                                          (0 + 2) / 2]

(3461613, 1)


                                                                                

In [45]:
a_log = raw_logs_df.first()
a_log

Row(value='199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245')

In [46]:
re_host = '(^\S+\.[\S+\.]+\S+2)'
re_time = '\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
re_method_uri_protocol = '\"(\S+)\s(\S+)\s*(\S*)\"'
re_status = '\s(\d{3})\s'
re_content_size = '(\d+)$'
log_pattern = f'{re_host}\s-\s-\s{re_time}\s{re_method_uri_protocol}{re_status}{re_content_size}'


In [47]:
match = re.search(log_pattern, a_log.value)
print(match)

None


In [48]:
from pyspark.sql import functions as F
from pyspark.sql import types as SparkTypes

parsed_logs_df = raw_logs_df.select(F.regexp_extract('value', re_host, 1).alias('host'),
                        F.regexp_extract('value', re_time, 1).alias('timestamp'),
                        F.regexp_extract('value', re_method_uri_protocol, 1).alias('method'),
                        F.regexp_extract('value', re_method_uri_protocol, 2).alias('endpoint'),
                        F.regexp_extract('value', re_method_uri_protocol, 3).alias('protocol'),
                        F.regexp_extract('value', re_status, 1).alias('status'),
                        F.regexp_extract('value', re_content_size, 1).alias('content_size'),
                        raw_logs_df.value.alias('raw')
                        )
parsed_logs_df.cache()

DataFrame[host: string, timestamp: string, method: string, endpoint: string, protocol: string, status: string, content_size: string, raw: string]

In [49]:
parsed_logs_df.take(1)

[Stage 23:>                                                         (0 + 1) / 1]

23/05/27 18:29:09 WARN MemoryStore: Not enough space to cache rdd_113_0 in memory! (computed 244.4 MiB so far)
23/05/27 18:29:09 WARN BlockManager: Persisting block rdd_113_0 to disk instead.
23/05/27 18:29:12 WARN MemoryStore: Not enough space to cache rdd_113_0 in memory! (computed 244.4 MiB so far)


                                                                                

[Row(host='', timestamp='01/Jul/1995:00:00:01 -0400', method='GET', endpoint='/history/apollo/', protocol='HTTP/1.0', status='200', content_size='6245', raw='199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245')]

In [64]:
parsed_logs_df = parsed_logs_df.filter(
        (F.col('host') != '') &
        (F.col('timestamp') != '') &
        (F.col('method') != '') &
        (F.col('endpoint') != '') &
        (F.col('protocol') != '') &
        (F.col('status') != '') &
        (F.col('content_size') != '')
    ) 
normalized_logs_df.count()

23/05/27 18:33:48 WARN MemoryStore: Not enough space to cache rdd_113_0 in memory! (computed 155.2 MiB so far)


3461613

In [75]:
# normalized_logs_df = parsed_logs_df.withColumn('timestamp', F.udf(lambda s: datetime.strptime(s, "%d/%b/%Y:%H:%M:%S %z").date())("timestamp"))
# normalized_logs_df = normalized_logs_df.withColumn('status', F.udf(lambda s: int(s))('status'))
# normalized_logs_df = normalized_logs_df.withColumn('content_size', F.udf(lambda s: int(s))('content_size'))
normalized_logs_df = parsed_logs_df.withColumns({
    "timestamp": F.udf(lambda s: datetime.strptime(s, "%d/%b/%Y:%H:%M:%S %z").strftime("%Y-%m-%dT%H:%M:%S%z"), SparkTypes.StringType())("timestamp"),
    "status": F.udf(lambda s: int(s), SparkTypes.IntegerType())('status'),
    "content_size": F.udf(lambda s: int(s), SparkTypes.IntegerType())('content_size'),
})
# normalized_logs_df = parsed_logs_df.withColumns({
#     "timestamp": F.to_timestamp(F.col('timestamp').cast(SparkTypes.TimestampType()), '%d/%b/%Y:%H:%M:%S %z'),
# }).

In [76]:
normalized_logs_df.show(10)
normalized_logs_df.printSchema()

23/05/27 18:51:03 WARN MemoryStore: Not enough space to cache rdd_113_0 in memory! (computed 155.2 MiB so far)


[Stage 40:>                                                         (0 + 1) / 1]

+--------------+--------------------+------+--------------------+--------+------+------------+--------------------+
|          host|           timestamp|method|            endpoint|protocol|status|content_size|                 raw|
+--------------+--------------------+------+--------------------+--------+------+------------+--------------------+
| 199.120.110.2|1995-07-01T00:00:...|   GET|/shuttle/missions...|HTTP/1.0|   200|        4085|199.120.110.21 - ...|
| 199.120.110.2|1995-07-01T00:00:...|   GET|/shuttle/missions...|HTTP/1.0|   200|        4179|199.120.110.21 - ...|
|       205.212|1995-07-01T00:00:...|   GET|/shuttle/countdow...|HTTP/1.0|   200|        3985|205.212.115.106 -...|
|129.94.144.152|1995-07-01T00:00:...|   GET|                   /|HTTP/1.0|   200|        7074|129.94.144.152 - ...|
|129.94.144.152|1995-07-01T00:00:...|   GET|/images/ksclogo-m...|HTTP/1.0|   304|           0|129.94.144.152 - ...|
| 199.120.110.2|1995-07-01T00:00:...|   GET|/images/launch-lo...|HTTP/1.

                                                                                

In [81]:
bad_rows_df = parsed_logs_df.filter(parsed_logs_df['host'].isNull()| 
                                    parsed_logs_df['timestamp'].isNull() | 
                                    parsed_logs_df['method'].isNull() |
                                    parsed_logs_df['endpoint'].isNull() |
                                    parsed_logs_df['status'].isNull() |
                                    parsed_logs_df['content_size'].isNull()|
                                    parsed_logs_df['protocol'].isNull())
bad_rows_df.count()

0

In [15]:
good_rows_df = parsed_logs_df.filter(
        F.col('host').isNotNull() &
        F.col('timestamp').isNotNull() &
        F.col('method').isNotNull() &
        F.col('endpoint').isNotNull() &
        F.col('protocol').isNotNull() &
        F.col('status').isNotNull() &
        F.col('content_size').isNotNull()
    )
good_rows_df.count()

                                                                                

3461613

In [82]:
parsed_logs_df.printSchema()

root
 |-- host: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- method: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- status: string (nullable = true)
 |-- content_size: string (nullable = true)



In [None]:
s = "01/Jul/1995:00:00:01 -0400"
datetime.strptime(s, "%d/%b/%Y:%H:%M:%S %z")

In [170]:
parsed_logs_df.write.csv("parsed_logs")

                                                                                