In [1]:
from __future__ import print_function
import re
import sys
import numpy as np
import math
from pyspark import SparkContext

In [None]:
sc = SparkContext()

In [5]:
def buildArray(listOfIndices):
    returnVal = np.zeros(20000)
    for index in listOfIndices:
        returnVal[index] = returnVal[index] + 1
    mysum = np.sum(returnVal)
    returnVal = np.divide(returnVal, mysum)
    return returnVal


##for the training data set
d_corpus =  sc.textFile("SmallTrainingData.txt")
d_keyAndText = d_corpus.map(lambda x: (x[x.index('id="') + 4: x.index('" url=')], x[x.index('">') + 2:][:-6]))
regex = re.compile('[^a-zA-Z]')

d_keyAndListOfWords = d_keyAndText.map(lambda x: (str(x[0]), regex.sub(' ', x[1]).lower().split()))

allWords = d_keyAndListOfWords.map(lambda x: x[1]).flatMap(lambda x: x).map(lambda x: (x, 1))
allCounts = allWords.reduceByKey(lambda x, y: x + y)
topWords = allCounts.top(20000, lambda x: x[1])

print("Top Words in Corpus:", allCounts.top(10, key=lambda x: x[1]))

topWordsK = sc.parallelize(range(20000))

#creating dictionary for the corpus
dictionary = topWordsK.map(lambda x: (topWords[x][0], x))

print("Word Positions in our Feature Matrix. Last 20 words in 20k positions: ", dictionary.top(20, lambda x: x[1]))


Top Words in Corpus: [('the', 447995), ('of', 205873), ('and', 177099), ('in', 158991), ('to', 144197), ('a', 125478), ('was', 66687), ('for', 52673), ('on', 50806), ('s', 50308)]
Word Positions in our Feature Matrix. Last 20 words in 20k positions:  [('morelos', 19999), ('odisha', 19998), ('chittagong', 19997), ('sirte', 19996), ('fiqh', 19995), ('sajjada', 19994), ('cra', 19993), ('restarted', 19992), ('severus', 19991), ('ngan', 19990), ('magnolia', 19989), ('hornets', 19988), ('symmetric', 19987), ('abolitionist', 19986), ('payoff', 19985), ('lehmann', 19984), ('qasim', 19983), ('dreyer', 19982), ('harkleroad', 19981), ('cain', 19980)]


In [7]:
#creating term frequency doc
allWordsWithDocID = d_keyAndListOfWords.flatMap(lambda x: ((j, x[0]) for j in x[1]))
allDictionaryWords = dictionary.join(allWordsWithDocID)
allDictionaryWords.take(4)

[('of', (1, 'AU35')),
 ('of', (1, 'AU35')),
 ('of', (1, 'AU35')),
 ('of', (1, 'AU35'))]

In [8]:
justDocAndPos = allDictionaryWords.map(lambda x: x[1]).map(lambda x: (x[1], x[0]))
allDictionaryWordsInEachDoc = justDocAndPos.groupByKey()
allDocsAsNumpyArrays = allDictionaryWordsInEachDoc.map(lambda x: (x[0], buildArray(x[1])))

allDocsAsNumpyArrays.take(3)

[('AU35',
  array([0.11425061, 0.06511057, 0.02272727, ..., 0.        , 0.        ,
         0.        ])),
 ('AU85',
  array([0.11214496, 0.0553379 , 0.03770813, ..., 0.        , 0.        ,
         0.        ])),
 ('AU124',
  array([0.07149577, 0.03857008, 0.01034807, ..., 0.        , 0.        ,
         0.        ]))]

In [9]:
#classifying 1 for AU docs and 0 for wiki pages
myRDD = allDocsAsNumpyArrays.map(lambda x: (x[1], 1 if x[0].startswith('AU') else 0)).map(lambda x: (x[1], x[0]))

#getting size
size = myRDD.count()

myRDD.cache()


PythonRDD[42] at RDD at PythonRDD.scala:53

In [12]:
print('Logistic Regression with Regularization')

#initializing parameters
num_iteration = 200
beta = np.zeros(20000)
learningRate = 0.1
precision = 0.01
cost_all = []
l_val = .00001
cost = 0.0


Logistic Regression with Regularization


In [13]:
for i in range(num_iteration):

    #implementation of the logistic regression
    gradientCost = myRDD.map(lambda x: (x[0], x[1], np.dot(x[1], beta))) \
        .map(lambda x: (-x[1] * x[0] + x[1] * (math.exp(x[2]) / (1 + math.exp(x[2]))), \
                        -x[0] * x[2] + math.log(1 + math.exp(x[2])))) \
        .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))

    #cost
    cost = gradientCost[1]

    #gradient with l2 regularization
    gradient = gradientCost[0] + (2 * l_val * beta)

    print(i, "Beta: ", beta, " Cost: ", cost)
    beta = beta - learningRate * gradient

    cost_all.append(cost)

    if i != 0:

        cost_current = cost_all[i]
        cost_previous = cost_all[i - 1]

        if np.abs(cost_current - cost_previous) < precision:
            break



0 Beta:  [0. 0. 0. ... 0. 0. 0.]  Cost:  2385.8125954873367
1 Beta:  [-1.28972234e+01 -6.02118495e+00 -5.46464762e+00 ... -1.17624357e-03
 -9.63554092e-04 -4.85139152e-04]  Cost:  636.7181733349906
2 Beta:  [-1.54583449e+01 -7.22892814e+00 -6.74973701e+00 ... -1.53390900e-03
 -1.20275519e-03 -5.70514536e-04]  Cost:  524.7569967634022
3 Beta:  [-1.70306412e+01 -7.97334326e+00 -7.61449909e+00 ... -1.79661409e-03
 -1.37328425e-03 -6.25093251e-04]  Cost:  477.2530369482495
4 Beta:  [-1.81325964e+01 -8.49686255e+00 -8.27621797e+00 ... -2.01154223e-03
 -1.51063912e-03 -6.66063524e-04]  Cost:  451.6337847384746
5 Beta:  [-1.89558236e+01 -8.88926073e+00 -8.81635417e+00 ... -2.19712011e-03
 -1.62807359e-03 -6.99362931e-04]  Cost:  435.9812808189194
6 Beta:  [-1.95941510e+01 -9.19456373e+00 -9.27519446e+00 ... -2.36271288e-03
 -1.73214623e-03 -7.27761903e-04]  Cost:  425.6203490327032
7 Beta:  [-2.01009500e+01 -9.43784413e+00 -9.67582835e+00 ... -2.51379735e-03
 -1.82662321e-03 -7.52778118e-04] 

60 Beta:  [-2.00159941e+01 -9.54529631e+00 -1.84959598e+01 ... -7.27163628e-03
 -4.75745992e-03 -1.46006532e-03]  Cost:  359.02663135131945
61 Beta:  [-1.99470140e+01 -9.51437996e+00 -1.86244791e+01 ... -7.35008228e-03
 -4.80613638e-03 -1.47239602e-03]  Cost:  358.298862745553
62 Beta:  [-1.98781162e+01 -9.48346566e+00 -1.87527514e+01 ... -7.42840277e-03
 -4.85475947e-03 -1.48474629e-03]  Cost:  357.57283464182245
63 Beta:  [-1.98093147e+01 -9.45256029e+00 -1.88807834e+01 ... -7.50659907e-03
 -4.90333019e-03 -1.49711630e-03]  Cost:  356.8485414094935
64 Beta:  [-1.97406223e+01 -9.42167006e+00 -1.90085810e+01 ... -7.58467238e-03
 -4.95184946e-03 -1.50950619e-03]  Cost:  356.1259777304253
65 Beta:  [-1.96720506e+01 -9.39080064e+00 -1.91361495e+01 ... -7.66262371e-03
 -5.00031811e-03 -1.52191607e-03]  Cost:  355.40513854700237
66 Beta:  [-1.96036097e+01 -9.35995713e+00 -1.92634941e+01 ... -7.74045398e-03
 -5.04873686e-03 -1.53434602e-03]  Cost:  354.68601901954264
67 Beta:  [-1.95353093e+

119 Beta:  [-1.62561650e+01 -7.81153119e+00 -2.57423132e+01 ... -1.16971133e-02
 -7.54968055e-03 -2.21978492e-03]  Cost:  318.93398900939945
120 Beta:  [-1.61986561e+01 -7.78421769e+00 -2.58597903e+01 ... -1.17685541e-02
 -7.59565220e-03 -2.23314779e-03]  Cost:  318.3023467171487
121 Beta:  [-1.61413518e+01 -7.75697596e+00 -2.59770951e+01 ... -1.18398733e-02
 -7.64157804e-03 -2.24652343e-03]  Cost:  317.6722404363246
122 Beta:  [-1.60842516e+01 -7.72980592e+00 -2.60942278e+01 ... -1.19110709e-02
 -7.68745800e-03 -2.25991168e-03]  Cost:  317.0436675320904
123 Beta:  [-1.60273550e+01 -7.70270749e+00 -2.62111884e+01 ... -1.19821467e-02
 -7.73329203e-03 -2.27331236e-03]  Cost:  316.4166253876826
124 Beta:  [-1.59706613e+01 -7.67568061e+00 -2.63279769e+01 ... -1.20531007e-02
 -7.77908006e-03 -2.28672528e-03]  Cost:  315.791111404123
125 Beta:  [-1.59141701e+01 -7.64872517e+00 -2.64445936e+01 ... -1.21239326e-02
 -7.82482202e-03 -2.30015027e-03]  Cost:  315.16712299993145
126 Beta:  [-1.5857

178 Beta:  [-1.31954795e+01 -6.31994149e+00 -3.23809922e+01 ... -1.57014457e-02
 -1.01815795e-02 -3.02437559e-03]  Cost:  284.2163517265168
179 Beta:  [-1.31491130e+01 -6.29670583e+00 -3.24884198e+01 ... -1.57655947e-02
 -1.02247419e-02 -3.03819597e-03]  Cost:  283.67128431995275
180 Beta:  [-1.31029196e+01 -6.27353629e+00 -3.25956788e+01 ... -1.58296195e-02
 -1.02678550e-02 -3.05201913e-03]  Cost:  283.1276192489856
181 Beta:  [-1.30568986e+01 -6.25043276e+00 -3.27027693e+01 ... -1.58935202e-02
 -1.03109186e-02 -3.06584490e-03]  Cost:  282.5853544722292
182 Beta:  [-1.30110495e+01 -6.22739513e+00 -3.28096912e+01 ... -1.59572967e-02
 -1.03539327e-02 -3.07967312e-03]  Cost:  282.04448795077775
183 Beta:  [-1.29653720e+01 -6.20442331e+00 -3.29164448e+01 ... -1.60209491e-02
 -1.03968973e-02 -3.09350365e-03]  Cost:  281.5050176479883
184 Beta:  [-1.29198653e+01 -6.18151717e+00 -3.30230300e+01 ... -1.60844775e-02
 -1.04398123e-02 -3.10733632e-03]  Cost:  280.966941529265
185 Beta:  [-1.2874

In [29]:
 #getting index, and printing words with highest regression coefficients
index = np.argpartition(beta, -5)[-5:]
print('Index', index)
print('Values', beta[index])
print('Words with highest regression coefficients', dictionary.filter(lambda x: x[1] in index).collect())

#for testing data set creating tf (similar to training)
d_corpus_test = sc.textFile('SmallTrainingData.txt')
d_keyAndText_test = d_corpus_test.map(
    lambda x: (x[x.index('id="') + 4: x.index('" url=')], x[x.index('">') + 2:][:-6]))
regex = re.compile('[^a-zA-Z]')


d_keyAndText_test.take(2)

Index [485 149  12 346  28]
Values [ 6.29173678  6.67581636 16.05155191  7.3915079   7.45304298]
Words with highest regression coefficients [('that', 12), ('not', 28), ('court', 149), ('applicant', 346), ('tribunal', 485)]


[('AU35',
  "consideration of an application for a stay pending an appeal.native titleI have before me an application by notice of motion filed on 30 June 2006 by which the applicant seeks an order that two orders of his Honour Justice Dowsett made on 6 October 2005 and 19 June 2006 be stayed pending the determination of an appeal to the Full Court of this Court.On 6 October 2005, his Honour made an order that 'the applicant file and serve an amended application on or before 14 October 2005, in default thereof the application stands dismissed' .His Honour also ordered that the Native Title Registrar contact the applicant to coordinate continued negotiations between the applicant group and other claimant groups in relation to overlapping claims concerning claims made by members of the Wiri People in connection with 'Wiri country' defined by reference to particular claim boundaries.His Honour also ordered that the matter otherwise be adjourned to 31 March 2006 at 10.15am.On 19 June 2006,

In [30]:
d_keyAndListOfWords_test = d_keyAndText_test.map(lambda x: (str(x[0]), regex.sub(' ', x[1]).lower().split()))
allWordsWithDocID_test = d_keyAndListOfWords_test.flatMap(lambda x: ((j, x[0]) for j in x[1]))

In [31]:
allDictionaryWords_test = dictionary.join(allWordsWithDocID_test)

In [32]:
justDocAndPos_test = allDictionaryWords_test.map(lambda x: x[1]).map(lambda x: (x[1], x[0]))
justDocAndPos_test.take(2)

[('AU35', 1), ('AU35', 1)]

In [33]:
allDictionaryWordsInEachDoc_test = justDocAndPos_test.groupByKey()
allDocsAsNumpyArrays_test = allDictionaryWordsInEachDoc_test.map(lambda x: (x[0], buildArray(x[1])))

#creating relevant labels for test data set as well
myRDD_test = allDocsAsNumpyArrays_test.map(lambda x: (x[1], 1 if x[0].startswith('AU') else 0, x[0])).map(
    lambda x: (x[1], x[0], x[2]))

In [34]:
myRDD_test.take(2)

[(1,
  array([0.11425061, 0.06511057, 0.02272727, ..., 0.        , 0.        ,
         0.        ]),
  'AU35'),
 (1,
  array([0.11214496, 0.0553379 , 0.03770813, ..., 0.        , 0.        ,
         0.        ]),
  'AU85')]

In [35]:
#using maximum value using y = 0 and y = 1
lr = myRDD_test.map(lambda x: (x[0], np.dot(x[1], beta), x[2])) \
    .map(lambda x: (x[0], x[1], x[1] - (math.log(1 + math.exp(x[1]))), -(math.log(1 + math.exp(x[1]))), x[2]))

#predicting the test data
test_pred = lr.map(lambda x: (x[0], 1 if x[2] >= x[3] else 0, x[4]))

#false positive
fp = test_pred.filter(lambda x: ((x[0] == 0) & (x[1] == 1)))

# #three false positive documents IDs
# list_fp = fp.map(lambda x: x[2]).take(3)
# print('Three False Positive Doc IDs: ', list_fp)

#printing the documents which were classified as false positive
# fp_docs = d_keyAndListOfWords_test.filter(lambda x: x[0] in list_fp).take(3)




In [36]:
test_pred.take(2)

[(1, 0, 'AU35'), (1, 0, 'AU85')]

In [38]:
#calculating the confusion matrix
tp = test_pred.filter(lambda x: ((x[0] == 1) & (x[1] == 1))).count()
fp = test_pred.filter(lambda x: ((x[0] == 0) & (x[1] == 1))).count()
fn = test_pred.filter(lambda x: ((x[0] == 1) & (x[1] == 0))).count()
tn = test_pred.filter(lambda x: ((x[0] == 0) & (x[1] == 0))).count()
print('TP: ', tp)
print('FP: ', fp)
print('FN: ', fn)
print('TN: ', tn)

#calculating precision, recall and fscore
fscore = 0
recall = 0
precision = 0
if ((tp + fp) != 0) & ((tp + fn) != 0):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = (2 * precision * recall) / (precision + recall)
print('Precision: ', precision)
print('Recall: ', recall)
print('F-Score: ', fscore)

sc.stop()

TP:  0
FP:  0
FN:  74
TN:  3368
Precision:  0
Recall:  0
F-Score:  0


The training is done on a small data set, and the learning is incomplete. This is a underfitting problem which leads to low training and testing accuracies. The word file includes the results on performing on the Google Cloud (GCP). 