# Collect and processing (Batch)  on the popularity of the candidates
Occurrence of a candidate in a pdf file. Use of the PyPdf2 library to read the file and extract the occurrences of the candidates' names.

#### Importation of the necessary libraries

In [1]:
import os
from datetime import datetime
from urllib.request import urlopen, urlretrieve
import PyPDF2 as p
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

# File path
file = 'file_occurence.pdf'

## Collect
#### Download and storage a pdf file (collecte and storage)
With the beautifulSoup library of the bs4 library, we scrape the html page to get the download link; and then we download the file

In [2]:
url = 'https://search.usa.gov/search?utf8=%E2%9C%93&affiliate=usagov&query=elections+pr%C3%A9sidentielles'
page = urlopen(url)
soup = BeautifulSoup(page, "html.parser")
h4 = [elt for elt in soup.findAll("h4") if 'ÉLECTIONS' in elt.getText()][0]
a = [elt for elt in h4.findAll('a') if 'ÉLECTIONS' in elt.getText()][0]
link = a['href']
urlretrieve(link, file)

('file_occurence.pdf', <http.client.HTTPMessage at 0x194c433cd88>)

#### Display the informations of the pdf's download

In [3]:
info = os.stat(file)
print(info)
print("size (Mo): ", round(info.st_size/1000000, 3))
print("last_update: ", datetime.fromtimestamp(info.st_ctime))

os.stat_result(st_mode=33206, st_ino=8162774325066304, st_dev=2721925331, st_nlink=1, st_uid=0, st_gid=0, st_size=1661914, st_atime=1619612625, st_mtime=1619612625, st_ctime=1619612625)
size (Mo):  1.662
last_update:  2021-04-28 14:23:45.481010


#### Read and explore the pdf's file

In [4]:
a = p.PdfFileReader(file)
texte = " "
for i in range(0, a.getNumPages()):
    texte = texte + a.getPage(i).extractText()
print(a.getPage(0).extractText())

USA
ÉLECTIONS
en bref



## Batch processing
#### Creation of the spark session

In [5]:
spark = SparkSession.builder.appName("Candidate's occurences").config("spark.driver.bindAddress",
                                                                      "127.0.0.1").getOrCreate()

#### Initialization of the context

In [6]:
sc = spark.sparkContext

#### Parallelization of processing on the file

In [7]:
file_pdf = sc.parallelize(texte)

#### Evaluation function that returns 0 if the name of a candidate appears in the file and 1 otherwise

In [8]:
ListCandidats = ["Trump", "Biden"]

def mapEvaluation(word):
    return (word, int(not word in ListCandidats))

#### Batch processing with spark

In [9]:
C_wordcount = file_pdf.flatMap(lambda line: line.split(' ')).map(mapEvaluation).reduceByKey(
    lambda count1, count2: count1 + count2).collect()
for (word, count) in C_wordcount:
     print(word, count)

 21970
L 195
s 5129
l 3205
c 2116
r 3686
i 4055
b 451
p 1725
g 626
™ 571
y 130
N 91
O 53
J 14
4 37
; 12
É 32
« 19
K 6
[ 1
ï 1
e 7942
a 3971
m 1249
x 274
˜ 117
Q 7
è 211
U 91
7 33
â 13
T 85
B 39
) 51
3 54
M 51
A 149
ù 12
F 27
H 19
ô 19
6 45
% 7
z 14
î 9
w 20
À 4
˛ 2
d 2702

 1494
à 311
h 261
. 431
1 143
û 7
C 181
8 56
R 70
0 150
S 132
» 19
j 77
9 53
Ô 1
? 2
W 18
œ 3
© 4
é 1647
t 4559
u 2991
n 4508
q 467
o 3125
v 650
f 410
D 97
ê 80
E 291
, 534
ç 13
: 21
˚ 17
- 253
Œ 37
I 123
P 76
k 26
( 51
2 109
5 44
X 20
G 32
V 20
Y 7
È 4
/ 5
] 1
