# Group 9 - Refugees/Foreigners in the Media

Alexander Schneider, Simon Schauß, Lukas Härtel

## 1 Initialization

### 1.1 Initialize spark session

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Register the same 4g limit for the driver memory, the driver result size and the
# executor memory before the session is requested.
for spark_property in ('spark.driver.memory', 'spark.driver.maxResultSize', 'spark.executor.memory'):
    SparkContext.setSystemProperty(spark_property, '4g')

# Fraction of the articles that is processed (1 = all articles); it also scales the
# number of partitions used when the articles are read.
sample_fraction = 1

# Local session that uses all available cores.
spark = (
    SparkSession.builder
    .appName('reffor')
    .master('local[*]')
    .getOrCreate()
)

### 1.2 Initialize tokenizer

In [2]:
from nltk.tokenize.punkt import PunktSentenceTokenizer
from os.path import join
from pickle import load


# Load the pickled Punkt parameters for German and build the sentence tokenizer from them.
# The file is opened in a with-block so the handle is closed again after loading.
# NOTE: unpickling can execute arbitrary code - only load a model file from a trusted source.
with open(join('german', 'punkt-de'), 'rb') as punkt_file:
    german_tokenizer = PunktSentenceTokenizer(load(punkt_file))


### 1.3 Load sentiment files

In [3]:
from sentiment_loader import load_sentiment_file


# Load the positive and the negative SentiWS word list; both are later queried with
# (lower-cased word, tag) keys.
pos_sentiment, neg_sentiment = (
    load_sentiment_file(join('german', sentiws_file))
    for sentiws_file in ('SentiWS_v1.8c_Positive.txt', 'SentiWS_v1.8c_Negative.txt')
)


## 2 Analysis

### 2.1 Tag words in articles with sentiment polarity

The polarity ranges from -1 (negative sentiment) to +1 (positive sentiment)

In [4]:
from common import flatten
from dateutil.parser import parse as parse_date
from pyspark.sql import Row
from pattern.text.de import parse as tag_polarity
from math import ceil
from pyspark.storagelevel import StorageLevel


def tag_sentence(sentence):
    """
    Tag the words of a sentence with their STTS part-of-speech tag.

    :param sentence: the sentence to tag, handed unchanged to the pattern.de parser
    :return: list of (word, tag) tuples, one per token returned by the parser
    """
    parsed = tag_polarity(sentence, tagset='STTS', chunks=False, split=True)
    tagged_words = []
    for token in flatten(parsed):
        # only the first two entries of a token (word and tag) are kept
        tagged_words.append((token[0], token[1]))
    return tagged_words


def evaluate_polarity(word, tag):
    """
    Look up the polarity of a word with the given STTS tag.

    The lookup uses the lower-cased word. An entry in the negative word list takes
    precedence over an entry in the positive word list; a word found in neither list
    gets the polarity 0.0.

    :param word: the word to evaluate
    :param tag: the STTS tag for the word
    :return: tuple of the (unchanged) word and its polarity
    """
    lookup_key = (word.lower(), tag)
    fallback_polarity = pos_sentiment.get(lookup_key, 0.0)
    polarity = neg_sentiment.get(lookup_key, fallback_polarity)
    return word, polarity


def evaluate_sentiment(article):
    """
    Determine the polarity of every word in an article.

    The article is split into sentences, every sentence is tagged and each resulting
    word/tag pair is looked up in the sentiment word lists.

    :param article: the article text to evaluate
    :return: list of (word, polarity) tuples
    """
    sentences = german_tokenizer.tokenize(article)
    word_tag_pairs = flatten([tag_sentence(sentence) for sentence in sentences])

    polarities = []
    for word, tag in word_tag_pairs:
        polarities.append(evaluate_polarity(word, tag))
    return polarities


# Read the articles from disk, order them by date and evaluate the sentiment polarity for
# every word in each article.
#
# The rows are sorted *before* the sentiment evaluation: PySpark's sortBy counts and samples
# its input to determine the range partition boundaries, so sorting after the expensive
# tagging step would execute that step several times. The map following the sort keeps the
# row order.
#
# The partition count is clamped to at least 1 so a very small sample_fraction stays valid.
# The result is persisted with MEMORY_AND_DISK so partitions that do not fit into memory are
# spilled to disk instead of being tagged again.
tagged_articles_cache = spark \
    .read \
    .csv('spiegel-articles-csv', header=True) \
    .rdd \
    .sample(withReplacement=False, fraction=sample_fraction) \
    .repartition(max(1, ceil(512 * sample_fraction))) \
    .map(lambda row: (parse_date(row.date), row.article)) \
    .sortBy(lambda dated_article: dated_article[0]) \
    .map(lambda dated_article: Row(date=dated_article[0], article=dated_article[1],
                                   sentiments=evaluate_sentiment(dated_article[1]))) \
    .persist(StorageLevel.MEMORY_AND_DISK)


### 2.2 Calculate mean sentiment polarity per article

In [5]:
def mean_polarity(sentiments):
    """
    Calculate the average polarity of the words of an article.

    :param sentiments: the article as array of word/polarity tuples
    :return: the mean polarity, or 0.0 (neutral) if the article contains no words
    """
    if not sentiments:
        # an article without any tokens would otherwise raise a ZeroDivisionError
        return 0.0
    return sum(polarity for _, polarity in sentiments) / len(sentiments)


# map the array of sentiment values for the words in an article to the average polarity of
# these words and collect the result on the driver as pandas DataFrame
tagged_articles = tagged_articles_cache \
    .map(lambda row: Row(date=row.date, article=row.article, sentiment_mean=mean_polarity(row.sentiments))) \
    .toDF() \
    .toPandas()


### 2.3 Plot moving average of sentiment over time

In [13]:
from pandas.tseries.converter import DatetimeConverter
from matplotlib.dates import DateFormatter
import matplotlib.pyplot as plt
import ipywidgets as widgets


@widgets.interact(search_term="")
def render_fig(search_term):
    """
    Plot the moving average of the article sentiment over time.

    Only articles whose text contains the search term (compared case-insensitively) are
    used; an empty search term selects every article. The rolling mean is drawn as a line
    and the area between the 25% and the 75% quantile of the same window is shaded.

    :param search_term: text that has to occur in an article for it to be included
    """
    window_size = '180d'
    min_periods = 30
    xlabel = 'date'
    ylabel = 'sentiment'
    title = 'Moving Average (window size {}, min. periods {}) Sentiment for Spiegel' \
        .format(window_size, min_periods)

    # compare by value; identity checks against literals ("is not ''") are unreliable
    if search_term:
        title += ' Online articles covering {}'.format(search_term)

    # vectorised, case-insensitive substring match (both sides are lower-cased);
    # articles without text never match
    matches = tagged_articles['article'] \
        .str.lower() \
        .str.contains(search_term.lower(), regex=False, na=False)
    filtered_articles = tagged_articles[matches]

    if filtered_articles.empty:
        print('No articles found for search term {!r}'.format(search_term))
        return

    rolled_tagged_articles = filtered_articles.rolling(
        window_size, min_periods=min_periods, closed='right', on='date')['sentiment_mean']
    mean_sentiments = rolled_tagged_articles.mean().values
    bottom_quantile_sentiments = rolled_tagged_articles.quantile(.25).values
    top_quantile_sentiments = rolled_tagged_articles.quantile(.75).values

    # NOTE(review): DatetimeConverter comes from the legacy pandas.tseries.converter module -
    # confirm it is available in the installed pandas version
    dates = DatetimeConverter.convert(filtered_articles['date'].values, None, None)

    # set the font size before any text is created so the first render already uses it
    plt.rcParams.update({'font.size': 18})

    fig, ax = plt.subplots(figsize=(25, 10))

    ax.xaxis.set_major_formatter(DateFormatter('%Y'))

    ax.plot(dates, mean_sentiments)
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)
    ax.grid()
    ax.fill_between(dates, bottom_quantile_sentiments, top_quantile_sentiments, facecolor='green', alpha=0.2)

    plt.show()

interactive(children=(Text(value='', description='search_term'), Output()), _dom_classes=('widget-interact',))

### 2.4 List of the most common words with positive polarity

In [9]:
# Count how often each word with a positive polarity occurs and show the ten most frequent
# ones. The polarity filter runs inside flatMap and the result is limited to ten rows in
# Spark, so only those ten rows are transferred to the driver.
tagged_articles_cache \
    .flatMap(lambda row: [Row(word=word) for word, polarity in row.sentiments if polarity > 0]) \
    .toDF() \
    .groupBy('word') \
    .count() \
    .orderBy('count', ascending=False) \
    .limit(10) \
    .toPandas()


Unnamed: 0,word,count
0,neue,10580
1,neuen,8721
2,gut,8553
3,große,5442
4,großen,5199
5,Hilfe,4171
6,klar,3972
7,schnell,3717
8,möglich,3461
9,Sicherheit,3270


### 2.5 List of the most common words with negative polarity

In [10]:
# Count how often each word with a negative polarity occurs and show the ten most frequent
# ones. The polarity filter runs inside flatMap and the result is limited to ten rows in
# Spark, so only those ten rows are transferred to the driver.
tagged_articles_cache \
    .flatMap(lambda row: [Row(word=word) for word, polarity in row.sentiments if polarity < 0]) \
    .toDF() \
    .groupBy('word') \
    .count() \
    .orderBy('count', ascending=False) \
    .limit(10) \
    .toPandas()


Unnamed: 0,word,count
0,Flüchtlinge,28339
1,Ende,10635
2,Flüchtlingen,7912
3,Krieg,4952
4,Gewalt,4668
5,Kritik,4203
6,Angst,4146
7,Problem,4107
8,Flucht,3797
9,Kampf,3617
