# Setup (Run this block first)

In [1]:
DATA_PATH = 'datascience.stackexchange.com'
import preprocessing
from pyspark import SparkContext
import os
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home' 
users = preprocessing.user_xml(f'{DATA_PATH}/Users.xml')
posts = preprocessing.post_xml(f'{DATA_PATH}/PostHistory.xml')

sc: SparkContext = SparkContext.getOrCreate()
user_rdd = sc.parallelize(users)
posts_rdd = sc.parallelize(posts)

# Question 1
__From the Users.xml file, find all users which are from Georgia and output to screen their DisplayName only.__

In [2]:
ga_users = user_rdd.filter(lambda s: (" GA" in s.Location or "Atlanta, Georgia" in s.Location)) 
for user in ga_users.collect():
    print(user.DisplayName)


Tony Boyles
pkerl
Nick Larsen
gfritz
Aleksandr Blekh
Michael
Ayush
azoorob
ontek
Aravind R. Yarram
ilya
Daisuke Aramaki
tempusfugit
Henry Crutcher
Goddard
Matt Simpson
Peter Woolfitt
matt biskup
Jason W
Peter Mourfield
Magsol
Bob Baxley
badjr
mplunney
YC Hu
ryan
Patrick Gerbes
Ilya Lapitan
Dan Anton
pradyumnad
Psidom
Teresa Madsen
Brandon
jpm
Mr. Rooter of Savannah
Mr. Rooter of Southeast GA
Khiem Ha
Jenna Kwon
Ahmet Cecen
Guy Gordon
C3Theo
niru dyogi
Vinitha Palani
Mac18
Andrew
Aditya Gogoi
turtlemonvh
Lewis Rodgers
Tarun Luthra
Devendra Lattu
cosmosa
Todd Dawson
Mboolean
Jimd
David F
PSInf
Chirag
Sandeep Gunda
hellofanengineer
Will Gao
Oriol Mirosa
Andrew King
David
rajb245
Sealander
afshin
Ashish Powani
Boris N.
Atul Kaushik
Harnoor Singh
Vincent
Tiago Cogumbreiro
cbarrick
Len Greski
red_eight
PEBKAC
Christoph
Bryce
David Hofmann
nburn42
donlan
Nick M
Kiran
BarclayK
Zer0k
dportman
Shishir Suman
Scott
JessicaRabi
Baxter
Rama Ananda
Dr. Strange
Shahan M
wgreenihrcorp
zongyan
Alexandre

# Question 2
__Using the Users.xml file, provide the count for all users which joined (CreationDate) in 2017. (30 points). Output this to the screen.__


In [3]:
users_2017 = user_rdd.filter(lambda s: "2017" in s.CreationDate)
print(f'{users_2017.count()} accounts created in 2017.')


14239 accounts created in 2017.


# Question 3
__Using the PostHistory file, count the number of Posts that feature the words “Spark” and “Scala”. Output this to the screen.__

In [4]:
filtered_posts = posts_rdd.filter(lambda s: "scala" in s.Text.lower() and "spark" in s.Text.lower())
print(f'{filtered_posts.count()} posts')


211 posts


# Question 4
__Using the PostHistory file, provide a total count of the words used by each distinct user. In other words, count all words in all posts for each user and display this to screen. You can only identify users by the UserID (30 points). You get 15 bonus points if you get the actual DisplayName of the user.__

In [5]:
from operator import add
display_names = user_rdd.map(lambda s: (s.Id, s.DisplayName))
post_counts = posts_rdd.map(lambda s: (s.UserId, len(s.Text.split())))
grouped_post_counts = post_counts.reduceByKey(add)
joined_rdd = grouped_post_counts.join(display_names).sortBy(lambda s: s[1][0], False)
print(f'{"UserId":<10}  {"DisplayName":<35}  WordCount')
for x in joined_rdd.collect():
    print(f'{x[0]:<10}  {x[1][1]:<35}  {x[1][0]}')

UserId      DisplayName                          WordCount
836         Neil Slater                          352120
67328       Esmailian                            200052
29575       Stephen Rauch                        196239
45264       n1k31t4                              145635
-1          Community                            125614
28175       Vaalizaadeh                          110133
43077       Kari                                 107538
29587       JahKnows                             83099
29169       Ethan                                67695
11097       Dawny33                              60274
8820        Martin Thoma                         57425
40853       Toros91                              49790
8878        Kasra Manshaei                       42751
381         Emre                                 42696
924         Anony-Mousse                         37619
71219       aranglol                             37187
4683        David Marx                           34245

16338       Breeze                               1212
21567       dirkster                             1212
77745       yaojp                                1210
12909       Jérémie Clos                         1209
525         Jake C.                              1206
50820       AndrewBharadwajKalahasti             1206
69689       Sympa                                1205
33934       kenny                                1205
61601       M Patel                              1204
63527       Slowpoke                             1204
52049       eddybear                             1201
61843       moz_szt                              1201
75549       Capeboom                             1199
19147       nir                                  1198
68075       Eduardo Martinez                     1198
73832       junmouse                             1198
37626       RAVI TEJA M                          1196
50449       uharsha33                            1196
51141       Tim             

26629       aranelladen                          485
9577        Srinivas K                           484
5377        thie1e                               484
27356       timekeeper                           484
26961       Dan Hicks                            484
70321       ItK                                  484
66736       Antoine Savine                       484
33072       Jesse Maher                          484
44605       aduguid                              484
22080       Merlin1896                           484
30381       Mary                                 484
41897       gammapoint                           484
59173       J. Domanski                          484
71172       JackWills                            484
73228       SorenA                               483
37278       question.time                        483
51719       Denis Candido                        483
49928       sisdog                               483
57328       Lila                              

19046       lucky6qi                             304
56906       Luxspes                              304
28562       Mr. Phil                             304
54722       Kieran Lavelle                       304
20513       Coeus2016                            303
10940       Kyle                                 303
21281       Juha                                 303
71425       Bissan                               303
76433       Novice Python charmer                303
77094       Abdul                                303
18713       OAK                                  303
70520       Dravidian                            303
10972       Michael                              303
76359       Jennifer Darrouzet                   303
45962       nick88                               303
10751       oopcode                              303
27189       Sean Moriarty                        303
51308       soeci92                              303
12497       Justin                            

50338       Thiago                               235
12626       Tristan Reid                         234
60756       Vinay Varma                          234
71974       Siddharth Singh                      234
74797       Esoemah                              234
273         Aldy syahdeini                       234
30538       dwolffram                            234
74718       Peterukk                             234
3506        rbk                                  234
69777       Andrew                               234
79947       MachuPichu                           234
14562       darksurfer                           234
67655       darksinge                            234
73164       Daniel Rivas                         234
18661       FreedomToWin                         234
58282       Pierre Pasquet                       234
21262       Constantin Weisser                   234
34037       user7677413                          234
51082       UHU                               

29656       Ruppesh Nalwaya                      177
30686       Ashish Sahu                          177
38672       Xyand                                177
46754       Sai Charan Adurthi                   177
58528       Carlo Pazolini                       177
39380       Ayush Chaurasia                      177
77891       Tetro                                177
4956        Mark                                 177
14752       craighagerman                        177
44004       Xraycat922                           177
45570       Maz                                  177
60080       user60080                            177
67532       maro                                 177
78047       DS_GB                                177
79777       Gaurav Koradiya                      177
80032       Ali Kılınç                           177
57164       Salvador Gutierrez                   177
62400       Annie Shtok                          177
67583       Tushar Mehta                      

14972       David Rose                           150
16743       Alex                                 150
25297       rameshoswal                          150
58551       JPCT                                 150
52601       Moreno                               150
79704       Naseef Ur Rahman                     150
45679       thisisbhavin                         150
56177       Super_John                           150
58398       Pablo Gonzalez                       150
63735       user10411263                         150
64720       Arpit Kathuria                       150
18593       Anthony Gatlin                       150
14578       David Dao                            149
38446       Yuri                                 149
13671       Chris B-C                            149
56463       I_am_rahul                           149
67164       Martin Ferianc                       149
77101       uğur yıldırım                        149
10365       Harshvardhan Solanki              

49697       Ermene                               118
57953       Ilia Kandrashou                      118
62915       Nikolaos Paschos                     118
79333       Shlee                                118
68786       thegravity                           118
76333       MSS                                  118
76934       Jo Makintash                         118
78821       A_the_kunal                          118
8626        edgaralienfoe                        118
8839        Rejeena                              118
14391       Vinay                                118
38019       JimReno                              118
44560       Tina J                               118
62625       Cemre                                118
65267       Yannik Suhre                         118
69577       Enissay                              118
57936       user3280146                          118
53215       Mohammed Obeidat                     118
59791       lfelipesv                         

73062       Niklas Raab                          75
37340       Waterbyte                            75
47802       Richard                              75
46452       frank                                75
10322       user10322                            75
49923       Ethereal                             75
49758       Jay                                  75
57290       bison72                              75
57833       Nirmal Roy                           75
61220       Alex                                 75
70845       mister lee                           75
73191       user13432                            75
41033       Alon Gelber                          75
53734       sebjwallace                          75
65454       Umesh                                75
32024       OcK                                  75
76244       sdaylor                              75
1333        nograpes                             75
9234        economy                              75
15373       

35654       S_Ymln                               49
54555       김동규                                  49
76242       Gavin Fitzpatrick                    49
78406       Fakrudeen                            49
49762       yu.sun                               49
29068       ntzortzis                            49
66472       Ali Mirzaei                          49
67864       Rahul                                49
59384       Geek Girl                            49
17780       mkelley82                            49
29749       Abhis                                49
40760       Rafael Posada                        49
42825       Boris                                49
57321       Fahim                                49
69621       Arshad_221b                          49
70153       Karsten                              49
71240       Darpan Dahal                         49
79636       Ben                                  49
19258       Dariush                              48
29422       

46478       Valeriy K.                           10
20981       nachti                               10
729         user40465                            10
65327       Aron Grzywaczewski                   9
41571       niths4u                              9
62211       Rossi                                9
34265       Ameet Deshpande                      9
68714       csaladenes                           9
41389       Arthur Fortes                        9
32749       magui                                9
45729       Abhishek Singh                       9
79864       Abhimanyu                            9
58560       duckmayr                             9
49470       user1302884                          8
10305       vkp                                  8
13315       Maikel Leyva                         8
54075       DataBender                           8
3586        Mieszko                              8
168         Mr Lister                            7
21663       abhicantdraw    

# Question 5
__Using the users.xml, comments.xml and PostHistory.xml files, produce a single file that includes the following information: DisplayName, Number of Comments, total Score and Number of posts. This file should have the users (DisplayName) sorted by score, descending from higher to lower.__

In [7]:
from operator import add
comments = preprocessing.comments_xml(f'{DATA_PATH}/Comments.xml')
comments_rdd = sc.parallelize(comments)
mapped_rdd = comments_rdd.map(lambda s: (s.UserId, int(s.Score)))

comment_counts = mapped_rdd.countByKey()
cc_rdd = sc.parallelize([(k, v) for k, v in comment_counts.items()])
score_rdd = mapped_rdd.foldByKey(0, add)

post_counts = posts_rdd.map(lambda s: (s.UserId, s.Id)).countByKey()
pc_rdd = sc.parallelize([(k, v) for k, v in post_counts.items()])

final_rdd = pc_rdd.join(cc_rdd).join(score_rdd).join(display_names)

final_rdd = final_rdd.mapValues(lambda v: (v[0][0][0], v[0][0][1], v[0][1], v[1])).sortBy(lambda s: s[1][2], False)
with open('question5.csv', 'w') as fp:
    fp.write('UserId, DisplayName, PostCount, CommentCount, TotalScore\n ')
    for row in final_rdd.collect():
        fp.write(f'{row[0]}, {row[1][3]}, {row[1][0]}, {row[1][1]}, {row[1][2]}\n')
    
