

## Podstawy Spark

In [2]:
from pyspark.ml.classification import NaiveBayes #lepsze niż mllib bo można na df zamiast rdd
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import bs4#beautiful soup
import pyspark
# Reuse the already-running SparkContext when this cell is executed again;
# calling the SparkContext constructor a second time in the same kernel fails.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("DBstackex"))

## Text processing

Będziemy analizować ["The Tragedy of Titus Andronicus" by William Shakespeare](http://www.gutenberg.org/cache/epub/1106/pg1106.txt) z Project Gutenberg.

Zbiór nie jest Big Data, ale ilustruje koncepcję przetwarzania w Sparku.

In [3]:
# NOTE(review): the markdown above announces "The Tragedy of Titus Andronicus"
# and the word outputs further down come from that text, but this cell loads
# data/Posts.xml -- confirm which dataset is intended before re-running.
lines = sc.textFile("data/Posts.xml")

In [5]:
lines.count()

127215

In [6]:
lines.take(5)

['<?xml version="1.0" encoding="utf-8"?>',
 '<posts>',
 '  <row Id="1" PostTypeId="1" CreationDate="2011-01-03T20:46:02.927" Score="134" ViewCount="122688" Body="&lt;p&gt;What are the main differences between InnoDB and MyISAM?&lt;/p&gt;&#xA;" OwnerUserId="8" LastActivityDate="2017-03-09T13:33:47.627" Title="What are the main differences between InnoDB and MyISAM?" Tags="&lt;mysql&gt;&lt;innodb&gt;&lt;myisam&gt;" AnswerCount="10" CommentCount="2" FavoriteCount="90" />',
 '  <row Id="2" PostTypeId="1" AcceptedAnswerId="4" CreationDate="2011-01-03T20:46:32.393" Score="61" ViewCount="7436" Body="&lt;p&gt;What version control methodologies help teams of people track database schema changes?&lt;/p&gt;&#xA;" OwnerUserId="7" LastEditorUserId="97" LastEditDate="2011-01-06T11:25:12.520" LastActivityDate="2013-09-23T07:14:01.600" Title="How can a group track database schema changes?" Tags="&lt;mysql&gt;&lt;version-control&gt;&lt;schema&gt;" AnswerCount="5" CommentCount="11" FavoriteCount="33" />

In [49]:
# Split every line on whitespace and flatten the pieces into one RDD of words.
# (Using map instead of flatMap would yield one list of words per line.)
words = lines.flatMap(lambda line: line.split())

In [50]:
words.take(10)

['This',
 'Etext',
 'file',
 'is',
 'presented',
 'by',
 'Project',
 'Gutenberg,',
 'in',
 'cooperation']

In [51]:
# number of words
words.count()

23531

In [8]:
# SQL entry point bound to the SparkContext `sc` created earlier.
sqlContext = pyspark.sql.SQLContext(sc)

In [19]:
def parse_tags(line):
    """Parse the attributes of one ``<row ... />`` XML line into a dict.

    Every ``name="value"`` pair on the line becomes one dictionary entry.
    Values keep their surrounding double quotes (e.g. ``'"mysql"'``).

    Attributes are matched as whole ``name="value"`` pairs rather than by
    splitting the line on whitespace and then on ``=``, so values that
    contain spaces or ``=`` characters do not raise ``ValueError``.

    Args:
        line: A single text line holding a ``<row`` element.

    Returns:
        dict mapping each attribute name to its still-quoted value; empty
        when the line carries no attributes.
    """
    import re  # local import keeps the function self-contained

    # name: a run without whitespace or '='; value: a double-quoted string.
    return dict(re.findall(r'([^\s=]+)=("[^"]*")', line))

In [None]:
def parse_tags_gen(line):
    """Turn the attributes of one ``<row ... />`` XML line into a dict.

    Each ``name="value"`` pair found on the line yields one entry whose
    value still includes the surrounding double quotes.

    Whole ``name="value"`` pairs are matched with a regular expression, so
    attribute values containing spaces or ``=`` are handled instead of
    raising ``ValueError`` during unpacking.

    Args:
        line: A single text line holding a ``<row`` element.

    Returns:
        dict of attribute name -> quoted attribute value (empty if the line
        has no attributes).
    """
    import re  # local import keeps the function self-contained

    attr_pattern = r'([^\s=]+)=("[^"]*")'
    return {m.group(1): m.group(2) for m in re.finditer(attr_pattern, line)}

In [20]:
# Read the tags file, keep only the <row ...> lines and parse each into a dict.
tags_rdd = (
    sc.textFile('data/Tags.xml')
    .filter(lambda row_line: row_line.strip().startswith("<row"))
    .map(parse_tags)
)

In [22]:
tags_rdd.take(5)

[{'Count': '"13032"',
  'ExcerptPostId': '"2667"',
  'Id': '"1"',
  'TagName': '"mysql"',
  'WikiPostId': '"2666"'},
 {'Count': '"1492"',
  'ExcerptPostId': '"3131"',
  'Id': '"2"',
  'TagName': '"innodb"',
  'WikiPostId': '"3130"'},
 {'Count': '"374"',
  'ExcerptPostId': '"3144"',
  'Id': '"3"',
  'TagName': '"myisam"',
  'WikiPostId': '"3143"'},
 {'Count': '"619"',
  'ExcerptPostId': '"4367"',
  'Id': '"4"',
  'TagName': '"schema"',
  'WikiPostId': '"4366"'},
 {'Count': '"242"',
  'ExcerptPostId': '"13428"',
  'Id': '"6"',
  'TagName': '"nosql"',
  'WikiPostId': '"13427"'}]

In [18]:
# On Python 3 dict.keys() returns a view object that cannot be pickled, and
# Spark pickles the mapped records; materialise the keys as a plain list.
tags_rdd.map(lambda d: list(d.keys())).take(10)

NameError: name 'tags_rdd' is not defined

In [None]:
# Gather the distinct attribute names seen across all tag rows (at most 100).
tags_cols = (
    tags_rdd
    .flatMap(lambda row_dict: row_dict.keys())
    .distinct()
    .take(100)
)

In [None]:
def tags_row(d):
    """Build a Row with one field per name in ``tags_cols`` for a parsed tag.

    Fields that are absent from ``d`` are filled with ``None``.

    Args:
        d: dict of attribute name -> value for one tag.

    Returns:
        pyspark.Row containing every column listed in ``tags_cols``.
    """
    return pyspark.Row(**{col: d.get(col) for col in tags_cols})

In [None]:
# Convert every parsed dict into a Row, build a DataFrame and preview it.
tags = (
    tags_rdd
    .map(tags_row)
    .toDF()
)
tags.show()

In [52]:
# Keep only the words whose first character is an upper-case letter.
# (The dangling line-continuation backslash after the last call was removed.)
capitalized = words.filter(lambda word: word[0].isupper())

In [53]:
capitalized.take(5)

['This', 'Etext', 'Project', 'Gutenberg,', 'World']

In [54]:
capitalized.distinct().take(10)

['Unto',
 'Andronicus:',
 'Andronicus.',
 'Lavina',
 'Public',
 'LIBRARY,',
 'Get',
 'King,',
 'News,',
 'Complots']

In [55]:
# Count the occurrences of each capitalized word and show the 10 most frequent.
# `add` is never imported in this notebook, so the counts are summed with an
# explicit lambda instead.
(
    capitalized
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .top(10, key=lambda pair: pair[1])  # rank the (word, count) pairs by count
)

[('I', 390),
 ('And', 289),
 ('TITUS.', 118),
 ('The', 90),
 ('To', 85),
 ('That', 84),
 ('MARCUS.', 65),
 ('But', 61),
 ('AARON.', 58),
 ('For', 55)]

### Zadania

* Wypisz 5 linii zaczynających się od "Titus" lub "Marcus" (usuwając spacje jeżeli potrzeba).
* Wypisz 20 najpopularniejszych słów z samymi WIELKIMI LITERAMI.
* ★ Jaka jest częstotliwość wyrazów w dziele?

### (Python) hints

In [6]:
"  some string with whitespaces \t  ".strip()

'some string with whitespaces'

In [7]:
"Jake likes his dog.".startswith("Anne")

False

In [8]:
"Jake likes his dog.".startswith("Jake")

True

In [7]:
"Anne" or "Jake"  # Nie rób: string.startswith(a or b)

'Anne'

In [10]:
"abc,-".replace(",", "")

'abc-'

In [11]:
"abc,-".replace(",", "").replace("-", "")

'abc'

In [12]:
# Regular expressions in Python
import re
# Raw string: "\w" inside a plain string literal is an invalid escape sequence
# that newer Python versions warn about.
re.findall(r"[\w]+", "Titus Andronicus Roman-legion")

['Titus', 'Andronicus', 'Roman', 'legion']