### Objective:
- perform EDA on AWS Web Crawl Data

In [1]:
from pyspark.sql import functions as F
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql import types

import pandas as pd
import numpy as np

#### Configuration

In [2]:
config = SparkConf()\
    .setAppName('AWS_Crawl')\
    .setMaster('local[*]')

#### Spark Session

In [3]:
spark = SparkSession \
    .builder \
    .config(conf = config)\
    .getOrCreate()

#### Read file from web crawl

In [4]:
# df_web = spark.read\
#     .option('header', True)\
#     .csv("../data_sources/AWS_web_crawl/cdx-00000")

df_web = spark.read\
    .csv("../data_sources/AWS_web_crawl/cdx-00000")

In [26]:
df_web.show(5)

+---+---+---+--------------------+--------------------+--------------------+----------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|_c0|_c1|_c2|                 _c3|                 _c4|                 _c5|             _c6|                 _c7|              _c8|                 _c9|                _c10|                _c11|                _c12|
+---+---+---+--------------------+--------------------+--------------------+----------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|  0| 11|248|207)/ 20231209181...| "mime": "text/html"| "mime-detected":...| "status": "200"| "digest": "V33MY...|  "length": "985"|  "offset": "246822"| "filename": "cra...| "charset": "ISO-...| "languages": "eng"}|
|  0| 11|248|207)/robots.txt 2...| "mime": "text/html"| "mime-detected":...| "status": "404"| "digest": "AS23R...| "length": "1112"|

In [20]:
df_web[["_c0", "_c1", "_c2","_c3"]].show(5)

+---+---+---+--------------------+
|_c0|_c1|_c2|                 _c3|
+---+---+---+--------------------+
|  0| 11|248|207)/ 20231209181...|
|  0| 11|248|207)/robots.txt 2...|
|  0| 11|248|207:5661)/robots....|
|  0| 11|248|207:5661)/sipweb/...|
|  0|111|128|178)/ 20231210132...|
+---+---+---+--------------------+
only showing top 5 rows



In [28]:
df_web[["_c10"]]

DataFrame[_c10: string]

In [40]:
df_web_sample= df_web.sample(0.05)

In [43]:
# %%time
# len(df_web_sample.count())

## Schema design

In [None]:
schema_= types.StructType(
    types.StructField()
)

#### Total Lenght of the file

In [21]:
len(df_web.columns)

13

In [6]:
# len(df_web.collect())

In [9]:
df_web.registerTempTable("web_database")

#### Perform SQL Querries

In [10]:
spark.sql(
    """
    SELECT COUNT(*) Total_Rows
    FROM web_database;
    """
).show()

+----------+
|Total_Rows|
+----------+
|  12554677|
+----------+



#### Read parquet with pandas

In [11]:
df_parq= pd.read_parquet("data_sources/first_AWS_web_Crawl_partitioned/part-00000-e767fe13-c5f2-4f7f-b7f4-4730bb02c5d3-c000.snappy.parquet")

In [12]:
df_parq.columns

Index(['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9',
       '_c10', '_c11', '_c12'],
      dtype='object')

In [13]:
df_parq["_c0"]

0         234
1         247
2          22
3         229
4         163
         ... 
627732    app
627733    app
627734    app
627735    app
627736    app
Name: _c0, Length: 627737, dtype: object

In [49]:
df_parq.head(2)

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12
0,234,26,116,"188)/ninjaqq 20231211145053 {""url"": ""https://1...","""mime"": ""warc/revisit""","""status"": ""304""","""length"": ""530""","""offset"": ""3746601""","""filename"": ""crawl-data/CC-MAIN-2023-50/segme...",,,,
1,247,175,1,3)/post/naga321-agen-mega-slot-99 202312100823...,"""mime"": ""text/html""","""mime-detected"": ""text/html""","""status"": ""200""","""digest"": ""WWYHNELFKRNADYS5VKQAO5LS2FNQNZMX""","""length"": ""18571""","""offset"": ""98699237""","""filename"": ""crawl-data/CC-MAIN-2023-50/segme...","""charset"": ""UTF-8""","""languages"": ""ind"


#### Repartition the DataFrame
- Repartioning the `SINGLE` data, so every executors can run the process without actually waiting for the first `Executor` to complete
- File `SUCESS` will be included on the same directory after sucessful completion

In [42]:
df_web.repartition(20)\
    .write.parquet("data_sources/first_AWS_web_Crawl_partitioned", mode= "overwrite")

#### Read the Partitioned parquet data

In [43]:
df_web= spark.read.parquet("data_sources/first_AWS_web_Crawl_partitioned/")

In [44]:
df_web.show(1)

+---+---+---+--------------------+--------------------+--------------------+----------------+--------------------+----------------+-----------------+--------------------+----+----+
|_c0|_c1|_c2|                 _c3|                 _c4|                 _c5|             _c6|                 _c7|             _c8|              _c9|                _c10|_c11|_c12|
+---+---+---+--------------------+--------------------+--------------------+----------------+--------------------+----------------+-----------------+--------------------+----+----+
|  2|205| 26|163)/robots.txt 2...| "mime": "text/html"| "mime-detected":...| "status": "404"| "digest": "C3T3E...| "length": "680"| "offset": "4892"| "filename": "cra...|NULL|NULL|
+---+---+---+--------------------+--------------------+--------------------+----------------+--------------------+----------------+-----------------+--------------------+----+----+
only showing top 1 row



In [45]:
df_web.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)



In [None]:
df_web

In [53]:
spark.sql(
    """
    SELECT _c0, _c1, 
    FROM web_database
    LIMIT 5;
    """
).show()

AnalysisException: [UNRESOLVED_COLUMN.WITHOUT_SUGGESTION] A column or function parameter with name `_c0` cannot be resolved. ; line 2 pos 11;
'GlobalLimit 5
+- 'LocalLimit 5
   +- 'Project ['_c0, '_c1, 'FROM AS web_database#685]
      +- OneRowRelation
