In [None]:
import ast
import time
from importlib import reload

import pandas as pd

import country_extractor as ce
import entity_subject_extractor as ese
import sentiment_analyser as sa
import text_preprocessing as wpp
import topic_categorizer as tc

Reading raw data

In [None]:
# Load the raw data (tab-separated, xz-compressed) into a DataFrame.
data = pd.read_csv(
    '../data/new_data_vdss.tsv.xz',
    sep='\t',
    compression='xz',
)

Extract the publication date and delete redundant columns

In [None]:
# Pull the YYYY-MM-DD part out of 'pubtime' and store it as a datetime column.
data['date'] = pd.to_datetime(
    data['pubtime'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
)

# Remove the columns that are not used further on, one at a time and in place.
for redundant_column in (
    'pubtime',
    'medium_code',
    'regional',
    'doctype',
    'language',
    'char_count',
    'dateline',
    'subhead',
    'content_id',
    'id',
    'rubric',
    'doctype_description',
):
    del data[redundant_column]

Text preprocessing for article content

In [None]:
preprocessor = wpp.TextPreprocessing()


def _preprocess_or_empty(raw_text):
    """Return the preprocessed form of ``raw_text``; non-string cells yield an empty list."""
    if isinstance(raw_text, str):
        return preprocessor.preprocess_text(raw_text)
    return []


# Time the preprocessing pass over the whole 'content' column.
start_time = time.time()
data['content_processed'] = data['content'].apply(_preprocess_or_empty)
end_time = time.time()
elapsed_time = end_time - start_time

# The raw text column is removed once its processed counterpart exists.
del data['content']

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

Country extraction with runtime estimation

In [None]:
extractor = ce.CountryExtractor()
%timeit extractor.get_country(data['content'][0])

68.1 µs ± 9.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

In [None]:
extractor = ce.CountryExtractor()


def _countries_or_empty(tokens):
    """Return the countries found in ``tokens``, or an empty list when there is nothing to scan.

    The preprocessing step stores an empty list for non-string article content.
    Indexing ``tokens[0]`` on such an entry raised IndexError, so the length is
    checked before the first element is inspected.
    """
    if len(tokens) > 0 and isinstance(tokens[0], str):
        return extractor.get_country(tokens)
    return []


data['countries'] = data['content_processed'].apply(_countries_or_empty)

Sentiment analysis with runtime estimation

In [None]:
# Reload the sentiment module before creating a fresh analyser instance.
reload(sa)
sentiment_analyser = sa.SentimentAnalyser()
# Benchmark the polarity call on the first preprocessed article only.
%timeit sentiment_analyser.get_topic_sentiments_polarity(data['content_processed'][0])

12.8 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
sentiment_analyser = sa.SentimentAnalyser()

# Wall-clock timing around the polarity pass over every preprocessed article.
start_time = time.time()
data['sentiment'] = data['content_processed'].apply(
    sentiment_analyser.get_topic_sentiments_polarity
)
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the frame, now including the 'sentiment' column, to a compressed TSV.
data.to_csv(
    '../data/processed_data.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 7932.10 seconds

Subjectivity analysis with runtime estimation

In [None]:
# Reload the sentiment module before creating a fresh analyser instance.
reload(sa)
sentiment_analyser = sa.SentimentAnalyser()
# Benchmark the subjectivity call on the first preprocessed article only.
%timeit sentiment_analyser.get_topic_subjectivity(data['content_processed'][0])

16.4 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
sentiment_analyser = sa.SentimentAnalyser()

# Wall-clock timing around the subjectivity pass over every preprocessed article.
start_time = time.time()
data['subjectivity'] = data['content_processed'].apply(
    sentiment_analyser.get_topic_subjectivity
)
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the frame, now including the 'subjectivity' column, to a compressed TSV.
data.to_csv(
    '../data/processed_data2.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 9175.09 seconds

Categorize content with runtime estimation

In [None]:
# Reload the topic-categorizer module before creating a fresh categorizer instance.
reload(tc)
categorizer = tc.TopicCategorizer()
# Benchmark the categorisation on a one-row slice, including the Series.apply overhead.
%timeit data['content_processed'][:1].apply(lambda x: categorizer.categorize(x))

14 ms ± 899 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
categorizer = tc.TopicCategorizer()

# Wall-clock timing around the categorisation pass over every preprocessed article.
start_time = time.time()
data['article_category'] = data['content_processed'].apply(categorizer.categorize)
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the frame, now including the 'article_category' column, to a compressed TSV.
data.to_csv(
    '../data/processed_data5.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 6759.97 seconds

Entity extraction with runtime estimation

In [None]:
reload(ese)
entity_extractor = ese.EntityAndSubjectExtractor()
%timeit processed_data['head'][:1].apply(lambda x: entity_extractor.extract_entities(x))

4.85 ms ± 291 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
entity_extractor = ese.EntityAndSubjectExtractor()

# Wall-clock timing around entity extraction over every headline.
start_time = time.time()
data['entities_header'] = data['head'].apply(entity_extractor.extract_entities)
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the frame, now including the 'entities_header' column, to a compressed TSV.
data.to_csv(
    '../data/processed_data_final.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 830.65 seconds

In [None]:
# Run every extracted country list through CountryExtractor.country_translation.
# NOTE(review): presumably this yields English names, judging by the `_en` suffix - confirm.
translate_countries = ce.CountryExtractor()
data['countries_en'] = data['countries'].apply(translate_countries.country_translation)

In [None]:
# Re-write the final file so that it also contains the 'countries_en' column.
data.to_csv(
    '../data/processed_data_final.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Import the processed data and parse the stringified content_processed column back into Python lists (countries is parsed further below, after the reduced file has been reloaded)

In [None]:
processed_data = pd.read_csv('../data/processed_data_final.tsv.xz', sep='\t', compression='xz')
# List-valued cells are stored in the TSV as their string representation.
# ast.literal_eval parses them back and accepts Python literals only, so text read
# from the file can never be executed as code (plain eval would run any expression).
processed_data['content_processed'] = processed_data['content_processed'].apply(ast.literal_eval)
# 'countries' stays as text here; it is parsed after the reduced file is reloaded.

In [None]:
# Prepare the list of public figures ("Persönlichkeiten")

import re

names = []
with open("../data/persoenlichkeiten_raw.txt", 'r', encoding='UTF-8') as file:
    for line in file:
        # Strip a leading enumeration such as "12. " from the line.
        names.append(re.sub(r'^\d+\.\s', '', line.strip()))

# Remove duplicates; blank lines in the raw file would otherwise add an empty name.
names = set(names)
names.discard('')

# A set has no stable iteration order, so sort before writing to keep the CSV
# identical from run to run.
names_df = pd.DataFrame(sorted(names))
names_df.to_csv("../data/persoenlichkeiten.csv")

In [None]:
# Reload the entity-extraction module before creating a fresh extractor instance.
reload(ese)
people_extractor = ese.EntityAndSubjectExtractor()

# Wall-clock timing around the people extraction over every preprocessed article.
start_time = time.time()
processed_data['people'] = processed_data['content_processed'].apply(
    people_extractor.extract_people
)
end_time = time.time()
elapsed_time = end_time - start_time
print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the frame, now including the 'people' column, to a compressed TSV.
processed_data.to_csv(
    '../data/processed_data_final_add_people.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

In [None]:
# Remove the processed text column in place, then store the remaining columns.
processed_data.drop(columns=['content_processed'], inplace=True)
processed_data.to_csv(
    '../data/without_content.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

In [32]:
df = pd.read_csv('../data/without_content.tsv.xz', sep='\t', compression='xz')
# The country lists are stored as text. ast.literal_eval parses them back and accepts
# Python literals only, unlike eval, which would execute whatever the file contains.
df['countries'] = df['countries'].apply(ast.literal_eval)

In [None]:
# Parse the stringified headline entities back into Python objects.
# ast.literal_eval accepts literals only, so file contents cannot be executed as code.
df['entities_header'] = df['entities_header'].apply(ast.literal_eval)

In [107]:
import itertools
import pandas as pd
import holoviews as hv
from holoviews import opts, dim, Dataset

class ChordCharts:
    """Build chord diagrams from records of names that occur together.

    Every element of ``data`` is one collection of names (for example the countries
    found in one article). Each 2-combination inside a collection becomes one edge.
    """

    def __init__(self, data, specific_countries=None):
        """Store the records and the optional country filter.

        Args:
            data: Object offering ``tolist()`` (such as a pandas Series), or any other
                iterable, whose elements are collections of names.
            specific_countries: Optional filter. ``None`` or the sentinel string
                ``'Alle'`` keeps every pair. A single name given as a string is
                treated as a one-element filter. Any other value is used as the
                collection of names, of which at least one must be part of a pair.
        """
        # Accept plain iterables as well as objects that provide tolist().
        self.data = data.tolist() if hasattr(data, 'tolist') else list(data)
        self.edges_list = []
        if isinstance(specific_countries, str):
            # A bare string would otherwise be iterated character by character when
            # the filter is applied, so it is wrapped (or cleared for the sentinel).
            specific_countries = None if specific_countries == 'Alle' else [specific_countries]
        self.specific_countries = specific_countries

    def create_edges(self):
        """Compute the edge list and return it as a HoloViews ``Dataset``.

        The list is rebuilt from scratch on every call and stored in
        ``self.edges_list``. Repeated calls (for instance a manual call followed by
        ``country_chord_chart``) therefore no longer accumulate duplicate edges.

        Returns:
            ``Dataset`` over a DataFrame with the columns ``source`` and ``target``.
        """
        wanted = self.specific_countries
        edges = []
        for connection in self.data:
            for pair in itertools.combinations(connection, 2):
                # Without a filter every pair is kept; otherwise the pair must
                # contain at least one of the requested names.
                if not wanted or any(country in pair for country in wanted):
                    edges.append(pair)
        self.edges_list = edges
        edges_df = pd.DataFrame(edges, columns=['source', 'target'])
        edges_ds = Dataset(edges_df, ['source', 'target'])
        return edges_ds

    def country_chord_chart(self):
        """Return a styled ``hv.Chord`` built from the current edges (bokeh backend)."""
        hv.extension('bokeh')
        # NOTE(review): the edge Dataset declares no 'value' dimension; confirm that
        # this selection has the intended effect.
        chord = hv.Chord(self.create_edges()).select(value=(1, None))
        return chord.opts(
            opts.Chord(
                cmap='Category20',
                edge_cmap='Category20',
                edge_color=dim('source').str(),
                labels='index',
                node_color=dim('index').str(),
                width=650,
                height=650,
                tools=['tap']
            )
        )


In [122]:
def contains_country(country_list, countries):
    """Return True when at least one entry of ``countries`` occurs in ``country_list``."""
    return any(candidate in country_list for candidate in countries)

focus_countries = ['Ukraine', 'Russland']

# Boolean mask: rows whose country list mentions at least one focus country.
mentions_focus = df['countries'].apply(
    lambda x: contains_country(country_list=x, countries=focus_countries)
)

# Only the first 50 matching rows are used for the edge list.
test = ChordCharts(df[mentions_focus]['countries'][:50], specific_countries=focus_countries)
test.create_edges()

test.edges_list
#test.country_chord_chart()

[('Ukraine', 'Slowenien'),
 ('Ukraine', 'Polen'),
 ('Ukraine', 'Schweiz'),
 ('Ukraine', 'Russland'),
 ('Slowenien', 'Russland'),
 ('Polen', 'Russland'),
 ('Schweiz', 'Russland'),
 ('Moldawien', 'Ukraine'),
 ('Moldawien', 'Russland'),
 ('USA', 'Ukraine'),
 ('USA', 'Russland'),
 ('Ukraine', 'Ungarn'),
 ('Ukraine', 'Rumänien'),
 ('Ukraine', 'Polen'),
 ('Ukraine', 'Schweiz'),
 ('Ukraine', 'Frankreich'),
 ('Ukraine', 'Russland'),
 ('Ungarn', 'Russland'),
 ('Rumänien', 'Russland'),
 ('Polen', 'Russland'),
 ('Schweiz', 'Russland'),
 ('Frankreich', 'Russland'),
 ('Russland', 'USA'),
 ('Russland', 'Schweiz'),
 ('Russland', 'Ukraine'),
 ('USA', 'Ukraine'),
 ('Schweiz', 'Ukraine'),
 ('Malta', 'Ukraine'),
 ('Malta', 'Russland'),
 ('Ukraine', 'USA'),
 ('Ukraine', 'Niederlande'),
 ('Ukraine', 'Polen'),
 ('Ukraine', 'Deutschland'),
 ('Ukraine', 'Schweiz'),
 ('Ukraine', 'Russland'),
 ('USA', 'Russland'),
 ('Niederlande', 'Russland'),
 ('Polen', 'Russland'),
 ('Deutschland', 'Russland'),
 ('Schweiz', '