In [105]:
from pyspark.sql import SparkSession

from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import MinMaxScaler
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
import numpy as np
import random
from pyspark.sql.functions import udf
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import pyspark
print(pyspark.__version__)
import re

2.4.4


In [5]:
# Build (or reuse) the local Spark session for this notebook.
# The memory settings go through the builder because they are only read when
# the JVM / SparkContext is launched; calling spark.conf.set(...) on an
# already-running session cannot resize it.
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns it and these memory values still have no effect — restart the
# kernel to apply them.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Assign1")
    .config("spark.executor.memory", "30g")
    .config("spark.driver.memory", "30g")
    .getOrCreate()
)

### Part 1

In [10]:
# Load both product catalogues with identical CSV options (header row,
# comma-separated, inferred column types) and keep only the first 100 rows
# of each.
AmazonDF, GoogleDF = (
    spark.read.csv(csv_path, header=True, sep=",", inferSchema=True).limit(100)
    for csv_path in ("Amazon.csv", "Google.csv")
)

In [12]:
# Preview the first two Amazon rows. The previous `sparkDF` name is not
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
AmazonDF.take(2)

[Row(id='b000jz4hqo', title='clickart 950 000 - premier image pack (dvd-rom)', description=None, manufacturer='broderbund', price=0.0),
 Row(id='b0006zf55o', title='ca international - arcserve lap/desktop oem 30pk', description='oem arcserve backup v11.1 win 30u for laptops and desktops', manufacturer='computer associates', price=0.0)]

### Part 2

#### (a) Implement a function that takes a string and returns non-empty tokens by splitting using regular expressions.


In [79]:
def tokenize(string, stopWordsList=()):
    """Split text into non-empty tokens and drop stop words.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore), so whitespace and punctuation both act
    as separators. Matching against the stop words is case-sensitive and the
    tokens are returned in their original order.

    Args:
        string: Text to tokenise. ``None`` or an empty string yields ``[]``.
        stopWordsList: Iterable of words to discard. A plain string is
            treated as whitespace-separated words; ``None`` or an empty
            value disables stop-word filtering.

    Returns:
        list of str: The remaining tokens.
    """
    if not string:
        return []

    if stopWordsList is None:
        stopWordsList = ()
    elif isinstance(stopWordsList, str):
        # With a bare string, `word in stopWordsList` would be a substring
        # test ('he' in 'the' is True); split it into whole words instead.
        stopWordsList = stopWordsList.split()

    # Use a set so each membership test is O(1) instead of a list scan.
    if isinstance(stopWordsList, (set, frozenset)):
        stop_words = stopWordsList
    else:
        stop_words = set(stopWordsList)

    # r'\W+' already covers whitespace and '.', so the extra alternatives of
    # the old pattern were unreachable; the raw string avoids invalid-escape
    # warnings.
    return [
        word
        for word in re.split(r'\W+', string)
        if word and word not in stop_words
    ]

# Quick sanity check: with no stop words every token is kept.
tokenize('he hello know', stopWordsList='')

['he', 'hello', 'know']

In [80]:
# Read the stop-word file as plain text, one word per line.
# The CSV reader with header=True consumed the first line ('!!') as a column
# name — dropping it from the list and forcing the code to index rows by that
# data value — and CSV quoting / type inference could alter the words.
# NOTE(review): this assumes the first line is a stop word, not a header — confirm.
stopWordsList = [
    row.value.strip()
    for row in spark.read.text("stopwords.txt").collect()
    if row.value.strip()  # skip blank lines
]

In [101]:
# Tokenise the Google product descriptions.
# Rows without a description are dropped first because tokenize() needs text.
GoogleDF = GoogleDF.filter(GoogleDF.description.isNotNull())
googleTokens = GoogleDF.select('id', 'description').rdd.map(
    lambda row: (row['id'], tokenize(row['description'], stopWordsList))
)
# Give the schema explicitly instead of letting Spark infer it from the data:
# inference cannot type an empty token list and costs an extra pass over the RDD.
googleTokensDf = spark.createDataFrame(
    googleTokens, "id string, tokenised_description array<string>"
)
googleTokensDf.show()

+--------------------+---------------------+
|                  id|tokenised_description|
+--------------------+---------------------+
|http://www.google...| [learning, quickb...|
|http://www.google...| [fun, reading, wr...|
|http://www.google...| [qb, pos, 6, 0, b...|
|http://www.google...| [save, spectacle,...|
|http://www.google...| [adobe, cs3, prod...|
|http://www.google...| [corel, video, st...|
|http://www.google...| [whether, working...|
|http://www.google...| [qb, pos, 6, 0, p...|
|http://www.google...| [quickbooks, cred...|
|http://www.google...| [sony, media, sof...|
|http://www.google...| [qb, pos, 6, 0, p...|
|http://www.google...| [decide, fate, ga...|
|http://www.google...| [based, tween, li...|
|http://www.google...| [cisco, systems, ...|
|http://www.google...| [wasp, bar, code,...|
|http://www.google...| [axis, communicat...|
|http://www.google...| [hp, eu063av, aba...|
|http://www.google...| [ibm, bb0gyna, us...|
|http://www.google...| [equisys, eqzfn07...|
|http://ww

In [103]:
# Tokenise the Amazon product descriptions.
# Rows without a description are dropped first because tokenize() needs text.
AmazonDF = AmazonDF.filter(AmazonDF.description.isNotNull())
amazonTokens = AmazonDF.select('id', 'description').rdd.map(
    lambda row: (row['id'], tokenize(row['description'], stopWordsList))
)
# Give the schema explicitly instead of letting Spark infer it from the data:
# inference cannot type an empty token list and costs an extra pass over the RDD.
amazonTokensDf = spark.createDataFrame(
    amazonTokens, "id string, tokenised_description array<string>"
)
amazonTokensDf.show()

+----------+---------------------+
|        id|tokenised_description|
+----------+---------------------+
|b0006zf55o| [oem, arcserve, b...|
|b000g80lqo| [peachtree, premi...|
|b0006se5bq| [singing, coach, ...|
|b000ehpzv8| [emc, retrospect,...|
|b00021xhzw| [upgrade, install...|
|b000gzwjgc| [marketing, infor...|
|b0000dbykm| [mia, s, math, ad...|
|b00029bqa2| [disney, s, 1st, ...|
|b0007prnjo| [many, times, hea...|
|b000aazr5i| [marketing, infor...|
|b000bhl1r8| [sql, server, com...|
|b00006hmwc| [reference, domin...|
|b00006hvvo| [today, enterpris...|
|b0000ycfcw| [topics, presents...|
|b00002sac9| [now, featuring, ...|
|b000bcz8ng| [world, book, enc...|
|b000fm18vi| [chord, display, ...|
|b00009apna| [complete, easy, ...|
|b0009rgzgm| [use, computer, r...|
|b000o24l3q| [note, upgrade, v...|
+----------+---------------------+
only showing top 20 rows



In [107]:
# Hash each token list into a sparse term-frequency vector.
hashingTF = HashingTF(inputCol="tokenised_description", outputCol="rawFeatures")
featurizedData = hashingTF.transform(amazonTokensDf)
# Preview a couple of rows only; collect() pulled every row (each with a
# 262144-dimensional vector repr) to the driver and flooded the cell output.
featurizedData.take(2)

[Row(id='b0006zf55o', tokenised_description=['oem', 'arcserve', 'backup', 'v11', '1', 'win', '30u', 'laptops', 'desktops'], rawFeatures=SparseVector(262144, {39881: 1.0, 55505: 1.0, 100687: 1.0, 114301: 1.0, 120497: 1.0, 208651: 1.0, 228780: 1.0, 231924: 1.0, 236232: 1.0})),
 Row(id='b000g80lqo', tokenised_description=['peachtree', 'premium', 'accounting', 'nonprofits', '2007', 'affordable', 'easy', 'use', 'accounting', 'solution', 'provides', 'donor', 'grantor', 'management', 're', 'like', 'nonprofit', 'organizations', 're', 'constantly', 'striving', 'maximize', 'every', 'dollar', 'annual', 'operating', 'budget', 'financial', 'reporting', 'programs', 'funds', 'advanced', 'operational', 'reporting', 'rock', 'solid', 'core', 'accounting', 'features', 'made', 'peachtree', 'choice', 'hundreds', 'thousands', 'small', 'businesses', 'result', 'accounting', 'solution', 'tailor', 'made', 'challenges', 'operating', 'nonprofit', 'organization', 'keep', 'audit', 'trail', 'record', 'report', 'chan