In [1]:
# Import main packages
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create the Spark session for this notebook (application name "wiki")
# and keep a handle on its underlying SparkContext for RDD-level work.
sparkConf = SparkConf().setAppName("wiki")
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

22/12/01 11:34:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Load the article/category table: a tab-separated file with a header row.
# mode=DROPMALFORMED asks the CSV reader to drop rows it cannot parse.
df_cat = (
    spark.read.format("csv")
    .options(header="true", delimiter="\t", mode="DROPMALFORMED")
    .load("hdfs://localhost:9000/user/bigdata2022/datasets/wiki/categories.tsv")
)
df_cat.show(5)

+--------------------+--------------------+
|             article|            category|
+--------------------+--------------------+
|%C3%81ed%C3%A1n_m...|subject.History.B...|
|%C3%81ed%C3%A1n_m...|subject.People.Hi...|
|          %C3%85land|   subject.Countries|
|          %C3%85land|subject.Geography...|
|  %C3%89douard_Manet|subject.People.Ar...|
+--------------------+--------------------+
only showing top 5 rows



In [4]:
from urllib.parse import unquote
# URL-decode both columns, producing (article, category) string tuples.
rdd_cat = df_cat.rdd.map(
    lambda row: (unquote(row.article), unquote(row.category))
)
# NOTE: collect() returns every element of the RDD to the driver for display.
rdd_cat.collect()

[Stage 2:>                                                          (0 + 1) / 1]                                                                                

[('Áedán_mac_Gabráin',
  'subject.History.British_History.British_History_1500_and_before_including_Roman_Britain'),
 ('Áedán_mac_Gabráin', 'subject.People.Historical_figures'),
 ('Åland', 'subject.Countries'),
 ('Åland', 'subject.Geography.European_Geography.European_Countries'),
 ('Édouard_Manet', 'subject.People.Artists'),
 ('Éire', 'subject.Countries'),
 ('Éire', 'subject.Geography.European_Geography.European_Countries'),
 ('Óengus_I_of_the_Picts',
  'subject.History.British_History.British_History_1500_and_before_including_Roman_Britain'),
 ('Óengus_I_of_the_Picts', 'subject.People.Historical_figures'),
 ('€2_commemorative_coins', 'subject.Business_Studies.Currency'),
 ('10th_century', 'subject.History.General_history'),
 ('11th_century', 'subject.History.General_history'),
 ('12th_century', 'subject.History.General_history'),
 ('13th_century', 'subject.History.General_history'),
 ('14th_century', 'subject.History.General_history'),
 ('15th_Marine_Expeditionary_Unit',
  'subject.H

In [5]:
#output_cat = rdd_cat.map(lambda x: (x[0],x[1])) \
#            .reduceByKey(lambda a, b: a+";"+b)
#output_cat.collect()

In [6]:
# Load the source/destination link table: a tab-separated file with a header row.
# mode=DROPMALFORMED asks the CSV reader to drop rows it cannot parse.
df_link = (
    spark.read.format("csv")
    .options(header="true", delimiter="\t", mode="DROPMALFORMED")
    .load("hdfs://localhost:9000/user/bigdata2022/datasets/wiki/links.tsv")
)
df_link.show(5)

+--------------------+--------------+
|              source|   destination|
+--------------------+--------------+
|%C3%81ed%C3%A1n_m...|          Bede|
|%C3%81ed%C3%A1n_m...|       Columba|
|%C3%81ed%C3%A1n_m...|D%C3%A1l_Riata|
|%C3%81ed%C3%A1n_m...| Great_Britain|
|%C3%81ed%C3%A1n_m...|       Ireland|
+--------------------+--------------+
only showing top 5 rows



In [7]:
from urllib.parse import unquote
# URL-decode both columns, producing (source, destination) string tuples.
rdd_link = df_link.rdd.map(
    lambda row: (unquote(row.source), unquote(row.destination))
)
# NOTE: collect() returns every element of the RDD to the driver for display.
rdd_link.collect()

                                                                                

[('Áedán_mac_Gabráin', 'Bede'),
 ('Áedán_mac_Gabráin', 'Columba'),
 ('Áedán_mac_Gabráin', 'Dál_Riata'),
 ('Áedán_mac_Gabráin', 'Great_Britain'),
 ('Áedán_mac_Gabráin', 'Ireland'),
 ('Áedán_mac_Gabráin', 'Isle_of_Man'),
 ('Áedán_mac_Gabráin', 'Monarchy'),
 ('Áedán_mac_Gabráin', 'Orkney'),
 ('Áedán_mac_Gabráin', 'Picts'),
 ('Áedán_mac_Gabráin', 'Scotland'),
 ('Áedán_mac_Gabráin', 'Wales'),
 ('Åland', '20th_century'),
 ('Åland', 'Baltic_Sea'),
 ('Åland', 'Crimean_War'),
 ('Åland', 'Currency'),
 ('Åland', 'Euro'),
 ('Åland', 'European_Union'),
 ('Åland', 'Finland'),
 ('Åland', 'League_of_Nations'),
 ('Åland', 'List_of_countries_by_system_of_government'),
 ('Åland', 'Nationality'),
 ('Åland', 'Parliamentary_system'),
 ('Åland', 'Police'),
 ('Åland', 'Russia'),
 ('Åland', 'Stockholm'),
 ('Åland', 'Sweden'),
 ('Åland', 'Time_zone'),
 ('Åland', 'Tourism'),
 ('Åland', 'United_Kingdom'),
 ('Åland', 'World_War_II'),
 ('Édouard_Manet', 'Absinthe'),
 ('Édouard_Manet', 'Beer'),
 ('Édouard_Manet', 'C

In [8]:
#output_link = rdd_link.map(lambda x: (x[0],x[1])) \
#            .reduceByKey(lambda a, b: a+";"+b)
#output_link.collect()

In [17]:
# Load the finished-paths table: a tab-separated file with a header row.
# mode=DROPMALFORMED asks the CSV reader to drop rows it cannot parse.
df_path = (
    spark.read.format("csv")
    .options(header="true", delimiter="\t", mode="DROPMALFORMED")
    .load("hdfs://localhost:9000/user/bigdata2022/datasets/wiki/paths_finished.tsv")
)
df_path.show(5)

+----------------+----------+-------------+--------------------+------+
| hashedIpAddress| timestamp|durationInSec|                path|rating|
+----------------+----------+-------------+--------------------+------+
|6a3701d319fc3754|1297740409|          166|14th_century;15th...|  NULL|
|3824310e536af032|1344753412|           88|14th_century;Euro...|     3|
|415612e93584d30e|1349298640|          138|14th_century;Nige...|  NULL|
|64dd5cd342e3780c|1265613925|           37|14th_century;Rena...|  NULL|
|015245d773376aab|1366730828|          175|14th_century;Ital...|     3|
+----------------+----------+-------------+--------------------+------+
only showing top 5 rows



In [28]:
def formatRow(x):
    """Summarise one finished-path row as a flat tuple.

    Parameters
    ----------
    x : row-like object exposing ``path``, ``durationInSec`` and ``rating``
        ``path`` is a ';'-separated list of URL-encoded article names,
        ``durationInSec`` is an integer in string form, and ``rating`` is
        either an integer in string form, the literal string "NULL", or None.

    Returns
    -------
    tuple
        ``("<first article>::<last article>", duration, rating, ratingNum)``
        where ``rating`` is 0 and ``ratingNum`` is 0 for unrated rows, and
        ``ratingNum`` is 1 for rated rows (so both can be summed per key).

    Raises
    ------
    ValueError
        If ``durationInSec`` or a non-null ``rating`` is not a valid integer.
    """
    # Split on the raw ';' separator *before* URL-decoding, so an encoded
    # ';' (%3B) inside an article name cannot be mistaken for a separator.
    hops = x.path.split(";")
    start = unquote(hops[0])
    end = unquote(hops[-1])

    # Unrated rows carry the literal string "NULL"; also accept a real null
    # (None) so a differently-parsed file does not crash in int().
    if x.rating is None or x.rating == "NULL":
        rating, ratingNum = 0, 0
    else:
        rating, ratingNum = int(x.rating), 1

    return (start + "::" + end, int(x.durationInSec), rating, ratingNum)

# Convert every finished-path row into the flat tuple built by formatRow.
rdd_path = df_path.rdd.map(formatRow)
# NOTE: collect() returns every element of the RDD to the driver for display.
rdd_path.collect()

                                                                                

[('14th_century::African_slave_trade', 166, 0, 0),
 ('14th_century::African_slave_trade', 88, 3, 1),
 ('14th_century::African_slave_trade', 138, 0, 0),
 ('14th_century::Greece', 37, 0, 0),
 ('14th_century::John_F._Kennedy', 175, 3, 1),
 ('14th_century::John_F._Kennedy', 110, 0, 0),
 ('14th_century::Fire', 112, 2, 1),
 ('14th_century::Rainbow', 139, 1, 1),
 ('14th_century::Rainbow', 74, 3, 1),
 ('14th_century::Rainbow', 167, 0, 0),
 ('14th_century::Rainbow', 253, 3, 1),
 ('14th_century::Rainbow', 218, 3, 1),
 ('14th_century::Rainbow', 66, 0, 0),
 ('14th_century::Rainbow', 391, 5, 1),
 ('14th_century::Rainbow', 432, 0, 0),
 ('14th_century::Rainbow', 43, 0, 0),
 ('14th_century::Rainbow', 387, 2, 1),
 ('14th_century::Rainbow', 179, 3, 1),
 ('14th_century::Rainbow', 169, 0, 0),
 ('14th_century::Rainbow', 246, 0, 0),
 ('14th_century::Rainbow', 345, 3, 1),
 ('14th_century::Rainbow', 265, 4, 1),
 ('14th_century::Rainbow', 75, 1, 1),
 ('14th_century::Rainbow', 25, 0, 0),
 ('14th_century::Rainbo

In [37]:
# Per start::end key, add up the rating field (index 2) of every path tuple.
rdd_rating = (
    rdd_path
    .map(lambda rec: (rec[0], rec[2]))
    .reduceByKey(lambda total, value: total + value)
)
rdd_rating.collect()

                                                                                

[('14th_century::African_slave_trade', 3),
 ('14th_century::Rainbow', 34),
 ('14th_century::Sodium', 3),
 ('14th_century::Henry_David_Thoreau', 5),
 ('2004_Atlantic_hurricane_season::Ice_age', 3),
 ('2005_Atlantic_hurricane_season::Spain', 1),
 ('2005_Atlantic_hurricane_season::Jazz', 13),
 ('2005_Atlantic_hurricane_season::Liechtenstein', 7),
 ('2005_Atlantic_hurricane_season::Sweden', 0),
 ('2005_Atlantic_hurricane_season::Burundi', 2),
 ('2005_Atlantic_hurricane_season::Hydrogen_peroxide', 0),
 ('2005_Atlantic_hurricane_season::International_Court_of_Justice', 0),
 ('2005_Atlantic_hurricane_season::William_McKinley', 2),
 ('Aberdeen::Vampire_bat', 8),
 ('Aberdeen::Bread', 0),
 ('Aberdeen::Sparrowhawk', 0),
 ('Acceleration::Dresden', 2),
 ('Acceleration::Elephant', 1),
 ('Acceleration::Trout', 15),
 ('Acceleration::Potassium_iodide', 7),
 ('Achilles::Gold', 3),
 ('Achilles::Great_white_shark', 3),
 ('Achilles::Bongo_(antelope)', 2),
 ('Achilles_tendon::Great_white_shark', 2),
 ('Afri

In [38]:
# Per start::end key, add up the rated-flag field (index 3) of every path
# tuple, i.e. count how many of the key's paths carry a rating.
rdd_ratingsNum = (
    rdd_path
    .map(lambda rec: (rec[0], rec[3]))
    .reduceByKey(lambda total, value: total + value)
)
rdd_ratingsNum.collect()

                                                                                

[('14th_century::African_slave_trade', 1),
 ('14th_century::Rainbow', 13),
 ('14th_century::Sodium', 1),
 ('14th_century::Henry_David_Thoreau', 1),
 ('2004_Atlantic_hurricane_season::Ice_age', 2),
 ('2005_Atlantic_hurricane_season::Spain', 1),
 ('2005_Atlantic_hurricane_season::Jazz', 4),
 ('2005_Atlantic_hurricane_season::Liechtenstein', 2),
 ('2005_Atlantic_hurricane_season::Sweden', 0),
 ('2005_Atlantic_hurricane_season::Burundi', 1),
 ('2005_Atlantic_hurricane_season::Hydrogen_peroxide', 0),
 ('2005_Atlantic_hurricane_season::International_Court_of_Justice', 0),
 ('2005_Atlantic_hurricane_season::William_McKinley', 1),
 ('Aberdeen::Vampire_bat', 2),
 ('Aberdeen::Bread', 0),
 ('Aberdeen::Sparrowhawk', 0),
 ('Acceleration::Dresden', 1),
 ('Acceleration::Elephant', 1),
 ('Acceleration::Trout', 4),
 ('Acceleration::Potassium_iodide', 2),
 ('Achilles::Gold', 1),
 ('Achilles::Great_white_shark', 1),
 ('Achilles::Bongo_(antelope)', 1),
 ('Achilles_tendon::Great_white_shark', 1),
 ('Africa

In [42]:
def divide(x, y):
    """Return ``x / y`` rounded to an integer, or 0 when ``y`` is falsy.

    Rounding is done by the built-in ``round``, which sends exact halves to
    the nearest even integer (e.g. 2.5 -> 2, 3.5 -> 4).
    """
    if not y:
        # Guard against division by zero (no ratings for this key).
        return 0
    return round(x / y)
    
# Average rating per start::end key = (sum of ratings) / (number of ratings).
# join() pairs the two values for each key as an ordered (sum, count) tuple.
# The previous union + reduceByKey passed division as the reduce function,
# but reduceByKey expects a commutative and associative function and does not
# guarantee operand order, so it could compute count / sum instead.
rdd_rating2 = (
    rdd_rating
    .join(rdd_ratingsNum)
    .mapValues(lambda sum_and_count: divide(sum_and_count[0], sum_and_count[1]))
)
rdd_rating2.collect()

                                                                                

[('14th_century::African_slave_trade', 3),
 ('14th_century::Rainbow', 3),
 ('14th_century::Sodium', 3),
 ('14th_century::Henry_David_Thoreau', 5),
 ('2004_Atlantic_hurricane_season::Ice_age', 2),
 ('2005_Atlantic_hurricane_season::Spain', 1),
 ('2005_Atlantic_hurricane_season::Jazz', 3),
 ('2005_Atlantic_hurricane_season::Liechtenstein', 4),
 ('2005_Atlantic_hurricane_season::Sweden', 0),
 ('2005_Atlantic_hurricane_season::Burundi', 2),
 ('2005_Atlantic_hurricane_season::Hydrogen_peroxide', 0),
 ('2005_Atlantic_hurricane_season::International_Court_of_Justice', 0),
 ('2005_Atlantic_hurricane_season::William_McKinley', 2),
 ('Aberdeen::Vampire_bat', 4),
 ('Aberdeen::Bread', 0),
 ('Aberdeen::Sparrowhawk', 0),
 ('Acceleration::Dresden', 2),
 ('Acceleration::Elephant', 1),
 ('Acceleration::Trout', 4),
 ('Acceleration::Potassium_iodide', 4),
 ('Achilles::Gold', 3),
 ('Achilles::Great_white_shark', 3),
 ('Achilles::Bongo_(antelope)', 2),
 ('Achilles_tendon::Great_white_shark', 2),
 ('Africa:

In [10]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    """Convert one line of space-separated numbers into a LabeledPoint.

    The first number becomes the label; all remaining numbers become the
    feature list. Every field is parsed with ``float``.
    """
    label, *features = map(float, line.split(' '))
    return LabeledPoint(label, features)

# Read the sample file and parse each text line into a LabeledPoint.
data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Train the SVM model (SGD, 100 iterations).
model = SVMWithSGD.train(parsedData, iterations=100)

# Training error: share of training points whose prediction differs from
# their label (measured on the training data itself).
labelsAndPreds = parsedData.map(
    lambda point: (point.label, model.predict(point.features))
)
trainErr = (
    labelsAndPreds.filter(lambda pair: pair[0] != pair[1]).count()
    / float(parsedData.count())
)
print("Training Error = " + str(trainErr))

# Save the model, then load it back from the same location.
model.save(sc, "target/tmp/pythonSVMWithSGDModel")
sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Stop context
#spark.stop()