In [1]:
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Set the random seed for notebook reproducibility.
# np.random.seed must be *called*; assigning to it (np.random.seed = ...) would
# replace the function with an int and leave the generator unseeded.
# np.random.set_state expects a full generator-state tuple, not a seed, so it is
# not used here.
rnd_seed = 23
np.random.seed(rnd_seed)

In [3]:

# Build (or reuse) the Spark session for the "Scalability_B" benchmark and
# connect it to the standalone cluster master.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.46:7077")
    .appName("Scalability_B")
    # NOTE(review): dynamic allocation is disabled on the next line, so the
    # other spark.dynamicAllocation.* settings below should have no effect
    # until it is enabled -- confirm which behaviour the benchmark wants.
    .config("spark.dynamicAllocation.enabled", False)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.dynamicAllocation.minExecutors", 1)
    .config("spark.dynamicAllocation.maxExecutors", 10)
    # Per-executor and driver resources.
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "4G")
    .config("spark.driver.memory", "2G")
    # Fixed driver-side ports.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Keep a handle on the SparkContext and silence everything below ERROR.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 07:55:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# SQLContext has been deprecated since Spark 3.0 in favour of SparkSession.
# The session exposes the same `.read` entry point, so the existing name is kept
# as an alias and the cells that call `sqlContext.read...` work unchanged.
sqlContext = spark_session



In [13]:
# Build the 10% sample DataFrame and report how long the construction takes.
# NOTE(review): sample() is a lazy transformation, so no sampling job runs inside
# the timed region. With no explicit schema, read.json has to scan the input to
# infer one, which is presumably what dominates this measurement -- confirm
# before reading the number as a "10% workload" cost.
# perf_counter() is used because it is monotonic and intended for elapsed-time
# measurement, unlike the wall clock returned by time.time().
start_time = time.perf_counter()

df = sqlContext.read.json("hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json").sample(fraction=0.1, seed=42)

end_time = time.perf_counter()
print(f"10% : Execution Time = {end_time - start_time:.2f} seconds")




10% : Execution Time = 335.01 seconds


                                                                                

In [6]:
# Preview the first ten rows of the 10% sample.
df.show(n=10)

                                                                                

+-------------------+--------------------+--------------------+-----------+-------+--------------------+-------------------+------------+--------------------+-----------+-----+
|             author|                body|             content|content_len|     id|      normalizedBody|          subreddit|subreddit_id|             summary|summary_len|title|
+-------------------+--------------------+--------------------+-----------+-------+--------------------+-------------------+------------+--------------------+-----------+-----+
|           chrom_ed|So you're saying ...|So you're saying ...|        134|c6agxtv|So you're saying ...|              apple|    t5_2qh1f|you don't seem to...|          9| NULL|
|        fallsuspect|You probably won'...|You probably won'...|         79|c6bncqn|You probably won'...|          AskReddit|    t5_2qh1i|just get both of ...|         11| NULL|
|     Buck_Speedjunk|This picture does...|This picture does...|         18|c6c4uks|This picture does...|           

In [14]:
# Build the 25% sample DataFrame and report how long the construction takes.
# NOTE(review): sample() is a lazy transformation, so no sampling job runs inside
# the timed region. With no explicit schema, read.json has to scan the input to
# infer one, which is presumably what dominates this measurement -- confirm
# before reading the number as a "25% workload" cost.
# perf_counter() is used because it is monotonic and intended for elapsed-time
# measurement, unlike the wall clock returned by time.time().
start_time = time.perf_counter()

df_25 = sqlContext.read.json("hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json").sample(fraction=0.25, seed=42)

end_time = time.perf_counter()
print(f"25% : Execution Time = {end_time - start_time:.2f} seconds")



25% : Execution Time = 151.84 seconds


                                                                                

In [15]:
# Build the 50% sample DataFrame and report how long the construction takes.
# The variable keeps its original name `df_5`, but fraction=0.5 is a 50% sample,
# so the printed label says "50%" (it previously said "5%").
# NOTE(review): sample() is a lazy transformation, so no sampling job runs inside
# the timed region. With no explicit schema, read.json has to scan the input to
# infer one, which is presumably what dominates this measurement -- confirm
# before reading the number as a "50% workload" cost.
# perf_counter() is used because it is monotonic and intended for elapsed-time
# measurement, unlike the wall clock returned by time.time().
start_time = time.perf_counter()

df_5 = sqlContext.read.json("hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json").sample(fraction=0.5, seed=42)

end_time = time.perf_counter()
print(f"50% : Execution Time = {end_time - start_time:.2f} seconds")



5% : Execution Time = 172.34 seconds


                                                                                

In [16]:
# Build the 75% sample DataFrame and report how long the construction takes.
# NOTE(review): sample() is a lazy transformation, so no sampling job runs inside
# the timed region. With no explicit schema, read.json has to scan the input to
# infer one, which is presumably what dominates this measurement -- confirm
# before reading the number as a "75% workload" cost.
# perf_counter() is used because it is monotonic and intended for elapsed-time
# measurement, unlike the wall clock returned by time.time().
start_time = time.perf_counter()

df_75 = sqlContext.read.json("hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json").sample(fraction=0.75, seed=42)

end_time = time.perf_counter()
print(f"75% : Execution Time = {end_time - start_time:.2f} seconds")



75% : Execution Time = 178.96 seconds


                                                                                

In [17]:
# Build the 95% sample DataFrame and report how long the construction takes.
# NOTE(review): sample() is a lazy transformation, so no sampling job runs inside
# the timed region. With no explicit schema, read.json has to scan the input to
# infer one, which is presumably what dominates this measurement -- confirm
# before reading the number as a "95% workload" cost.
# perf_counter() is used because it is monotonic and intended for elapsed-time
# measurement, unlike the wall clock returned by time.time().
start_time = time.perf_counter()

df_95 = sqlContext.read.json("hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json").sample(fraction=0.95, seed=42)

end_time = time.perf_counter()
print(f"95% : Execution Time = {end_time - start_time:.2f} seconds")



95% : Execution Time = 112.08 seconds


                                                                                

In [18]:
# Build the 1% sample DataFrame and report how long the construction takes.
# NOTE(review): sample() is a lazy transformation, so no sampling job runs inside
# the timed region. With no explicit schema, read.json has to scan the input to
# infer one, which is presumably what dominates this measurement -- confirm
# before reading the number as a "1% workload" cost.
# perf_counter() is used because it is monotonic and intended for elapsed-time
# measurement, unlike the wall clock returned by time.time().
start_time = time.perf_counter()

df_1 = sqlContext.read.json("hdfs://192.168.2.46:9000/data/corpus-webis-tldr-17.json").sample(fraction=0.01, seed=42)

end_time = time.perf_counter()
print(f"1% : Execution Time = {end_time - start_time:.2f} seconds")



1% : Execution Time = 178.84 seconds


                                                                                

In [11]:
import time

def test_scalability(df, label):
    start_time = time.time()
    
    # Example transformation and action
    result = df.groupBy("content_len").count().collect()
    
    end_time = time.time()
    print(f"{label}: Execution Time = {end_time - start_time:.2f} seconds")

# Benchmark the sampled DataFrames one after another, in the listed order.
# df_25 and df_5 are not part of this run.
for sampled_df, dataset_label in (
    (df, "10% Dataset"),
    (df_75, "75% Dataset"),
    (df_95, "95% Dataset"),
    (df_1, "1% Dataset"),
):
    test_scalability(sampled_df, dataset_label)

                                                                                

10% Dataset: Execution Time = 107.35 seconds


                                                                                

75% Dataset: Execution Time = 97.86 seconds


                                                                                

95% Dataset: Execution Time = 99.05 seconds




1% Dataset: Execution Time = 89.07 seconds


                                                                                

In [19]:
# Stop the Spark session now that the benchmark cells have run.
spark_session.stop()