In [1]:
import pandas as pd
from pyspark import SparkContext
import plotly.graph_objects as go
from pyspark.sql import SQLContext

# Spark Data Analysis
Spark outputs is aggregated data into parquet files. I will use this file to analyze the business classification findings.

In [2]:
sc = SparkContext(appName='JSON Schema')
sqlContext = SQLContext(sc)

In [3]:
df = sqlContext.read.parquet("./cc-pyspark/spark-warehouse/business_classification/spark-warehouse")

In [4]:
df.show()

+--------------------+-----+
|               topic|count|
+--------------------+-----+
|  talk.politics.guns|  978|
|comp.os.ms-window...| 1483|
|           tiny.page| 8419|
|         alt.atheism|  534|
|  talk.politics.misc| 4047|
|  rec.sport.baseball| 1752|
|     sci.electronics| 3171|
|talk.politics.mid...| 1581|
|        misc.forsale|14206|
|               adult| 1111|
|soc.religion.chri...| 3786|
|error.unknown.rec...|    3|
|      comp.windows.x| 2570|
|   error.lang.detect|   18|
|           sci.space| 2888|
|           sci.crypt| 1918|
|    rec.sport.hockey| 1866|
|comp.sys.mac.hard...|  776|
|comp.sys.ibm.pc.h...|  932|
|             sci.med| 6019|
+--------------------+-----+
only showing top 20 rows



In [5]:
cc_web_page_count=2600000000
crawled_percent=web_page_count/cc_web_page_count
crawled_percent

NameError: name 'web_page_count' is not defined

In [None]:
137952-72938

In [None]:
labels = ['non.english.page', 'english.page']
values = [72938, 65014]

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0,0,0,0,0,0,0,0,0,0,0,0,0,0.2])])
fig.show()

In [None]:
#Need to remove a few categories: 
labels = ['talk.religion.misc', 'talk.politics.misc', 'talk.politics.mideast', 'talk.politics.guns', 'soc.religion.christian', 'sci.space', 'sci.med', 'sci.electronics', 'sci.crypt', 'rec.sport.hockey', 'rec.sport.baseball', 'rec.motorcycles', 'rec.autos', 'misc.forsale', 'comp.windows.x', 'comp.sys.mac.hardware', 'comp.sys.ibm.pc.hardware', 'comp.os.ms-windows.misc', 'comp.graphics', 'alt.atheism', 'adult']
values = [300, 4047, 1581, 978, 3786, 2888, 6019, 3171, 1918, 1866, 1752, 1658, 2253, 14206, 2570, 776, 932, 1483, 2745, 534, 1111]

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0,0,0,0,0,0,0,0,0,0,0,0,0,0.2])])
fig.show()

In [None]:
#Load in the Parquet file
parq_df = sqlContext.read.parquet("./cc-pyspark/spark-warehouse/business_classifier3")

In [None]:
web_page_count = parq_df.groupBy().sum().show()

In [None]:
for row in parq_df.sort(parq_df.topic.desc()).take(25): print(row)

In [None]:
#convert to a Dataframe because it's easier to work with
new_df = pd.DataFrame(columns=['topic', 'count'])

topic_list=[]
count_list=[]

for row in parq_df.sort(parq_df.topic.desc()).take(25):
    topic_list.append(row[0])
    count_list.append(row[1])
    
new_df['topic'] = topic_list
new_df['count'] = count_list

In [None]:
new_df.head(50)

In [None]:
#total number of records
new_df["count"].sum()

In [None]:
#English versus non-english web pages
labels = ['non.english.page', 'english.page']
non_eng = new_df[(new_df['topic'] == "non.english.page")]["count"].values[0]
eng = new_df[(new_df['topic'] != "non.english.page")]["count"].sum()
values = [non_eng, eng]

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0,0])])
fig.update_traces(textposition='inside', textinfo='percent+label+value')
fig.show()

In [None]:
#Remove topic counts that will not be included
mask = ((new_df['topic'] != "tiny.page") & 
        (new_df['topic'] != "non.english.page") &
        (new_df['topic'] != "error.unknown.record.type") &
        (new_df['topic'] != "error.lang.detect"))
final_df = new_df[mask]
final_df

In [None]:
#Need to remove a few categories: 
labels = final_df["topic"]
values = final_df["count"]

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0,0,0,0,0,0,0,0,0,0,0,0,0,0.2])])
fig.update_traces(textposition='inside', textinfo='percent+label+value')
fig.show()