In [None]:
import ast
import re
import time
from importlib import reload

import pandas as pd

import country_extractor as ce
import entity_subject_extractor as ese
import sentiment_analyser as sa
import text_preprocessing as wpp
import topic_categorizer as tc

Reading raw data

In [None]:
# Load the raw article dump (tab-separated, xz-compressed) into a DataFrame.
data = pd.read_csv(
    '../data/new_data_vdss.tsv.xz',
    compression='xz',
    sep='\t',
)

Extract the publication date and delete redundant columns

In [None]:
# Keep only the calendar date (YYYY-MM-DD) found in the publication timestamp.
# expand=False makes str.extract return a Series for the single capture group
# instead of a one-column DataFrame.
data['date'] = pd.to_datetime(
    data['pubtime'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
)

# Columns that none of the later steps shown in this notebook read.
redundant_columns = [
    'pubtime',
    'medium_code',
    'regional',
    'doctype',
    'language',
    'char_count',
    'dateline',
    'subhead',
    'content_id',
    'id',
    'rubric',
    'doctype_description',
]
# One drop call instead of a `del` statement per column; like `del`, it raises
# KeyError if any of the listed columns is missing.
data = data.drop(columns=redundant_columns)

Text preprocessing for article content

In [None]:
# Run the project's text preprocessor over every article body and time the run.
# Entries that are not strings are mapped to an empty list instead of being
# passed to the preprocessor.
preprocessor = wpp.TextPreprocessing()

start_time = time.time()
data['content_processed'] = data['content'].apply(
    lambda text: [] if not isinstance(text, str) else preprocessor.preprocess_text(text)
)
end_time = time.time()
elapsed_time = end_time - start_time

# Drop the raw text column now that the processed version is stored.
del data['content']

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

Country extraction with runtime estimation

In [None]:
extractor = ce.CountryExtractor()
%timeit extractor.get_country(data['content'][0])

68.1 µs ± 9.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

In [None]:
# Extract the countries mentioned in every preprocessed article.
# The preprocessing step stores an empty list for non-string content, so the
# length is checked before looking at the first element; indexing an empty
# list would raise IndexError. Such rows get an empty country list.
extractor = ce.CountryExtractor()
data['countries'] = data['content_processed'].apply(
    lambda tokens: extractor.get_country(tokens)
    if len(tokens) > 0 and isinstance(tokens[0], str)
    else []
)

Sentiment analysis with runtime estimation

In [None]:
# Reload the sentiment_analyser module so a fresh SentimentAnalyser is built
# from the current version of its source without restarting the kernel.
reload(sa)
sentiment_analyser = sa.SentimentAnalyser()
# Time one polarity call on the entry labelled 0 of 'content_processed' to
# estimate the per-article cost before launching the full run.
%timeit sentiment_analyser.get_topic_sentiments_polarity(data['content_processed'][0])

12.8 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
sentiment_analyser = sa.SentimentAnalyser()

# Compute the polarity for every preprocessed article and measure the
# wall-clock duration of the whole pass.
start_time = time.time()
data['sentiment'] = data['content_processed'].apply(
    sentiment_analyser.get_topic_sentiments_polarity
)
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Checkpoint the frame so this long-running step does not have to be repeated.
data.to_csv(
    '../data/processed_data.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 7932.10 seconds

Subjectivity analysis with runtime estimation

In [None]:
# Reload the sentiment_analyser module so a fresh SentimentAnalyser is built
# from the current version of its source without restarting the kernel.
reload(sa)
sentiment_analyser = sa.SentimentAnalyser()
# Time one subjectivity call on the entry labelled 0 of 'content_processed' to
# estimate the per-article cost before launching the full run.
%timeit sentiment_analyser.get_topic_subjectivity(data['content_processed'][0])

16.4 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
sentiment_analyser = sa.SentimentAnalyser()

# Compute the subjectivity for every preprocessed article and measure the
# wall-clock duration of the whole pass.
start_time = time.time()
data['subjectivity'] = data['content_processed'].apply(
    sentiment_analyser.get_topic_subjectivity
)
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Checkpoint the frame so this long-running step does not have to be repeated.
data.to_csv(
    '../data/processed_data2.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 9175.09 seconds

Categorize content with runtime estimation

In [None]:
# Reload the topic_categorizer module so a fresh TopicCategorizer is built
# from the current version of its source without restarting the kernel.
reload(tc)
categorizer = tc.TopicCategorizer()
# Time the categorizer on a one-row slice ([:1]) of 'content_processed' to
# estimate the per-article cost before launching the full run.
%timeit data['content_processed'][:1].apply(lambda x: categorizer.categorize(x))

14 ms ± 899 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
categorizer = tc.TopicCategorizer()

# Assign a category to every preprocessed article and measure the wall-clock
# duration of the whole pass.
start_time = time.time()
data['article_category'] = data['content_processed'].apply(categorizer.categorize)
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Checkpoint the frame so this long-running step does not have to be repeated.
data.to_csv(
    '../data/processed_data5.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 6759.97 seconds

Entity extraction with runtime estimation

In [None]:
reload(ese)
entity_extractor = ese.EntityAndSubjectExtractor()
%timeit processed_data['head'][:1].apply(lambda x: entity_extractor.extract_entities(x))

4.85 ms ± 291 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
entity_extractor = ese.EntityAndSubjectExtractor()

# Extract the entities from every headline and measure the wall-clock duration
# of the whole pass.
start_time = time.time()
data['entities_header'] = data['head'].apply(entity_extractor.extract_entities)
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Save the frame with all columns computed so far.
data.to_csv(
    '../data/processed_data_final.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 830.65 seconds

In [None]:
# Derive the 'countries_en' column by passing each article's country list
# through CountryExtractor.country_translation.
translate_countries = ce.CountryExtractor()
data['countries_en'] = data['countries'].apply(translate_countries.country_translation)

In [None]:
# Write the frame to the final checkpoint file (tab-separated, xz-compressed,
# without the index column).
data.to_csv(
    '../data/processed_data_final.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Import the processed data and parse the serialized `content_processed` column back into Python lists (the `countries` column is not converted in this step)

In [None]:
# Reload the final checkpoint. 'content_processed' comes back from the TSV as
# text, so each cell is parsed back into a Python object.
# ast.literal_eval only accepts Python literals, whereas eval would execute any
# expression found in the file.
processed_data = pd.read_csv(
    '../data/processed_data_final.tsv.xz',
    sep='\t',
    compression='xz',
)
processed_data['content_processed'] = processed_data['content_processed'].apply(ast.literal_eval)
# 'countries' stays in its serialized string form in this frame.

In [None]:
# Preparation of the list of personalities.
# A leading "<digits>. " enumeration prefix is stripped from every line of the
# raw file; lines that are empty after stripping are skipped so that no empty
# name ends up in the list.
names = set()
with open("../data/persoenlichkeiten_raw.txt", 'r', encoding='UTF-8') as file:
    for line in file:
        name = re.sub(r'^\d+\.\s', '', line.strip())
        if name:
            names.add(name)

# Sort the de-duplicated names so the CSV has the same row order on every run;
# the iteration order of a set of strings is not stable across interpreter runs.
names = sorted(names)
names_df = pd.DataFrame(names)
names_df.to_csv("../data/persoenlichkeiten.csv")

In [None]:
# Reload the entity_subject_extractor module so a fresh extractor is built
# from the current version of its source without restarting the kernel.
reload(ese)
people_extractor = ese.EntityAndSubjectExtractor()

# Run extract_people on every preprocessed article and time the whole pass.
start_time = time.time()
processed_data['people'] = processed_data['content_processed'].apply(
    lambda tokens: people_extractor.extract_people(tokens)
)
end_time = time.time()

elapsed_time = end_time - start_time
print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Save the frame including the new 'people' column.
processed_data.to_csv(
    '../data/processed_data_final_add_people.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

In [None]:
# Remove the processed text column from the frame in place, then save the
# remaining columns to a separate file.
processed_data.drop(columns=['content_processed'], inplace=True)
processed_data.to_csv(
    '../data/without_content.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

In [32]:
# Load the frame that no longer carries the processed text. 'countries' comes
# back from the TSV as text, so each cell is parsed back into a Python object.
# ast.literal_eval only accepts Python literals, whereas eval would execute any
# expression found in the file.
df = pd.read_csv('../data/without_content.tsv.xz', sep='\t', compression='xz')
df['countries'] = df['countries'].apply(ast.literal_eval)

In [None]:
# Parse the serialized 'entities_header' cells back into Python objects.
# ast.literal_eval only accepts Python literals, whereas eval would execute any
# expression found in the file.
df['entities_header'] = df['entities_header'].apply(ast.literal_eval)

In [45]:
def contains_country(country_list, country='Deutschland'):
    """Return True if `country` is an element of `country_list`, else False.

    Parameters:
        country_list: container that is searched with the `in` operator.
        country: value to look for; defaults to 'Deutschland'.

    Returns:
        bool: result of the membership test.
    """
    # The membership test already yields a bool, so no if/else is needed.
    return country in country_list

# Show the rows whose country list contains the default country of
# contains_country ('Deutschland').
df.loc[df['countries'].apply(contains_country)]

Unnamed: 0,medium_name,head,date,countries,sentiment,subjectivity,entities_header,article_category,countries_en,people
8,blick.ch,EU beschliesst Gaspreisdeckel: Einigung auf Ma...,2022-12-19,[Deutschland],-0.850000,0.000000,"['Gaspreisdeckel', 'Einigung', 'Markteingriff']",Politik,['Germany'],[]
13,blick.ch,Conference League: Adam Szalai ist in Ungarn K...,2022-10-06,"[Ungarn, Italien, Deutschland]",0.528571,0.142857,"['Adam', 'Szalai', 'Bettler']",Sport,"['Hungary', 'Italy', 'Germany']",[]
14,srf.ch,US-Geheimdienste rechnen nicht mit schnellem K...,2022-05-10,"[Malta, Ukraine, USA, Niederlande, Polen, Deut...",-0.133824,0.202941,"['US-Geheimdienste', 'Kriegsend']",Politik,"['Malta', 'Ukraine', 'USA', 'Netherlands', 'Po...","['Emmanuel Macron', 'Andrij Jermak', 'Olaf Sch..."
26,blick.ch,EU-Aussenbeauftragter ist optimistisch,2022-09-01,"[USA, China, Deutschland, Frankreich, Russland...",0.540000,0.000000,['EU-Aussenbeauftragter'],Politik,"['USA', 'China', 'Germany', 'France', 'Russia'...","['Joe Biden', 'Donald Trump']"
28,Tages-Anzeiger,«Gut gegen Böse. So einfach ist das»,2022-04-30,"[Georgien, Russland, Deutschland, Ukraine]",0.332143,0.096429,['böse'],Politik,"['Georgia', 'Russia', 'Germany', 'Ukraine']",[]
...,...,...,...,...,...,...,...,...,...,...
153121,blick.ch,Bachelor 2022: Alle Informationen zu Kandidati...,2022-09-13,[Deutschland],0.323077,0.076923,"['Bachelor', 'Information', 'Kandidatin']",Sport,['Germany'],[]
153137,srf.ch,Behörden sichern Basler Friedhof wegen Gedenkt...,2022-05-09,"[Ukraine, Russland, Deutschland, Schweiz]",0.525000,0.250000,"['Behörde', 'Friedhof', 'Gedenktag']",Regional,"['Ukraine', 'Russia', 'Germany', 'Switzerland']",[]
153153,srf.ch,Deutschland hadert und setzt alles auf Trumpf ...,2022-07-21,"[Deutschland, USA]",0.083333,0.083333,['Mihambo'],Sport,"['Germany', 'USA']",[]
153170,srf.ch,Offenbar rund 200 Leichen in Keller in Mariupo...,2022-05-25,"[Ukraine, Ungarn, Polen, Deutschland, Schweiz,...",0.086364,0.100000,['Leiche'],Politik,"['Ukraine', 'Hungary', 'Poland', 'Germany', 'S...","['Ignazio Cassis', 'Nikolai Patruschew', 'Anto..."
