In [44]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import regexp_replace, trim, col, lower, concat
import re
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
# Local, single-process Spark configuration for this notebook.
conf = pyspark.SparkConf().setAppName('SparkApp1').setMaster('local')

# getOrCreate() reuses a session that is already running in this kernel, so
# re-executing this cell does not fail with "Cannot run multiple SparkContexts
# at once" the way a second direct SparkContext(conf=conf) call does.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

I initialize a PySpark session, then read the JSON data and select only two columns: `category` and `reviewText`.

In [50]:
# Read the reviews and keep only the two columns used below.
# multiLine is a boolean option; the literal False is passed instead of the
# (truthy, and therefore misleading) string "False".
data = spark.read.json("reviews_devset.json", multiLine=False).select("category", "reviewText")

## Data Preprocessing

Here I select reviewText and split it with the regex pattern. Then, for each review, I pair every resulting word with the category of that review.

In [51]:
# Characters that separate tokens: whitespace, brackets, punctuation, symbols and digits.
delimiters = [' ', '(', ')', '[', ']', '{', '}', '.', '!', '?', ',', ';',':','+','=','-','_','"','`',"'",'~','#','@','&','*','%','€','$','§','/','0','1','2','3','4','5','6','7','8','9']

# One alternation of the escaped delimiters, used as the split pattern.
regexPattern = '|'.join(map(re.escape, delimiters))

# Replace the review text with the array of tokens produced by the split.
df = (
    data.select("category", "reviewText")
        .withColumn("reviewText", split(col("reviewText"), regexPattern))
)

# Emit one (category, token) pair per token. split() yields null for a null
# reviewText, so fall back to an empty list instead of iterating over None.
# The result is collected because the next cell passes it to sc.parallelize.
SplitData = df.rdd.flatMap(
    lambda row: [(row['category'], token) for token in (row['reviewText'] or [])]
).collect()


Each (category, word) pair is mapped to ((category, word), 1). English stop words and empty tokens are removed, the occurrences of each pair are summed, and categories and words are converted to lowercase.

In [52]:
# Stop words, one per line, loaded into a set for fast membership tests.
stopwords = set(sc.textFile("stopwords.txt").collect())

new_df = sc.parallelize(SplitData)

# Lowercase category and word BEFORE counting, so that e.g. "Good" and "good"
# end up under the same key; lowercasing only after reduceByKey would leave
# several rows for the same (category, word) pair, each with a partial count.
rdd_new = (
    new_df
    .map(lambda pair: (pair[0].lower(), pair[1].lower()))
    .filter(lambda key: len(key[1]) > 0 and key[1] not in stopwords)
    .map(lambda key: (key, 1))
    .reduceByKey(lambda x, y: x + y)
    # Same record shape as before: ("", [(category, word, count)])
    .map(lambda kv: ("", [(kv[0][0], kv[0][1], kv[1])]))
)


prevData stores the collected result of the previous step. We create the list aList and two dictionaries, wordCounts and CatCounts.

In the for loop, the category, the word and the count are saved in aList. **ct** gets the name of the category from this list and **wrd** gets the word. Both are used as dictionary keys: **wordCounts[wrd]** stores the number of appearances of the given word across all categories, and **CatCounts[ct]** sums the counts of all words in the category. **totalWords** counts the total number of words. **count** is the number of records in aList.

In [53]:
# Bring the reduced ("", [(category, word, count)]) records to the driver.
prevData = rdd_new.collect()

wrd = ""
aList = []        # flat list of (category, word, count) tuples
wordCounts = {}   # word -> occurrences summed over every category
CatCounts = {}    # category -> occurrences summed over every word
totalWords = 0    # grand total of counted occurrences

for _, records in prevData:
    record = records[0]
    aList.append(record)

    ct, wrd, occurrences = record
    wordCounts[wrd] = wordCounts.get(wrd, 0) + occurrences
    CatCounts[ct] = CatCounts.get(ct, 0) + occurrences
    totalWords += occurrences

count = len(aList)

In this for loop the category and the word are read again. "val" is the expected count of the current word in the current category, $E = \frac{\text{wordCounts}[w] \cdot \text{CatCounts}[c]}{\text{totalWords}}$. "values" holds the chi^2 value: the expected value is subtracted from the observed count, the difference is squared, and the result is divided by the expected value, $\chi^2 = \frac{(O - E)^2}{E}$. Each chi^2 value is appended to the list "variables", which is then combined element by element with the list of (category, word, count) records. Finally the combined list is sorted alphabetically by category and, within each category, by chi^2 value in descending order.

In [54]:
# chi^2 contribution of every (category, word, count) record.
variables = []
for record in aList:
    ct, wrd, observed = record[0], record[1], record[2]
    # Expected count of the word in this category.
    val = wordCounts[wrd] * CatCounts[ct] / totalWords
    values = pow(observed - val, 2) / val
    variables.append(values)

# Append each chi^2 value to its record -> (category, word, count, chi2).
# Dedicated loop names are used so that aList and variables are not
# overwritten, which keeps this cell safe to re-run.
ListAll = [record + (chi,) for record, chi in zip(aList, variables)]

# Category ascending, chi^2 descending within a category.
ListAll.sort(key=lambda row: (row[0], -row[3]))

In the next loop I store, for each category, the first 75 chi^2 values in descending order. The data is accumulated in the string variable "s".
If the first check is true (the category has changed), the name of the previous category and the "s" variable, which holds its words with their chi^2 values, are appended to ListPrintAllVals. The second check is used to accumulate in "s" only the top 75 terms with their values, as required by the task. In addition, the list "ListWords" is filled with all the words found among these top 75 terms, without duplicates.
When the loop finishes, ListWords is sorted alphabetically. The unique words, separated by " ", are appended to ListPrintAllVals as a final line. All the records from ListPrintAllVals are saved in output_rdd.txt.

In [55]:
# Number of highest-chi^2 terms reported per category.
TOP_TERMS = 75

ListPrintAllVals = []   # output lines: one per category, then the merged word list
ListWords = []          # unique words that made it into any category's top terms
seen_words = set()      # constant-time duplicate check for ListWords

oldct = ListAll[0][0] if ListAll else ""
cntr = 0                # terms already emitted for the current category
s = ""
ct = ""
for row in ListAll:
    ct, wrd, chi = row[0], row[1], row[3]
    if oldct != ct:
        # Category changed: flush the finished line and start a new one.
        ListPrintAllVals.append("<" + oldct + "> " + s)
        s = ""
        cntr = 0
        oldct = ct
    # Count only terms that are actually emitted, so every category
    # (not just the first one) is limited to exactly TOP_TERMS entries.
    if cntr < TOP_TERMS:
        s = s + wrd + ":" + str(chi) + " "
        cntr += 1
        if wrd not in seen_words:
            seen_words.add(wrd)
            ListWords.append(wrd)

# Flush the last category.
if s != '':
    ListPrintAllVals.append("<" + oldct + "> " + s)
s = ""

ListWords.sort()
separator = " "

ListPrintAllVals.append(separator.join(map(str, ListWords)))

# Explicit UTF-8 so non-ASCII tokens do not depend on the platform default encoding.
with open('output_rdd.txt', 'w', encoding='utf-8') as filehandle:
    for item in ListPrintAllVals:
        filehandle.write('%s\n' % item)
        
