In [61]:
import ast
import time
from importlib import reload

import pandas as pd

import CountryExtractor as ce
import EntityAndSubjectExtractor as ese
import SentimentAnalyser as sa
import TextPreprocessing as wpp
import TopicCategorizer as tc

Reading raw data

In [19]:
# Load the raw dataset from the xz-compressed, tab-separated file.
data = pd.read_csv(
    '../data/new_data_vdss.tsv.xz',
    sep='\t',
    compression='xz',
)

In [20]:
# Clean up the date and delete redundant columns.

# Keep only the first YYYY-MM-DD match from 'pubtime' and parse it into a datetime column.
date_strings = data['pubtime'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
data['datum'] = pd.to_datetime(date_strings)

# 'pubtime' has been replaced by 'datum'; the remaining columns are dropped as redundant.
for redundant_column in (
    'pubtime',
    'medium_code',
    'regional',
    'doctype',
    'language',
    'char_count',
    'dateline',
    'subhead',
    'content_id',
):
    del data[redundant_column]

Text preprocessing for content

In [None]:
# Preprocess the 'content' column into 'content_processed' and report the running time.
preprocessor = wpp.headlinePreprocessing()

# perf_counter() is a monotonic clock intended for measuring intervals; time.time()
# follows the system clock, which can be adjusted while a long cell is running.
start_time = time.perf_counter()
# Non-string entries are not preprocessed and yield an empty list instead.
data['content_processed'] = data['content'].apply(
    lambda x: preprocessor.preprocess_text(x) if isinstance(x, str) else []
)
end_time = time.perf_counter()

elapsed_time = end_time - start_time

# Drop the raw text column now that the processed version is stored.
del data['content']

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

Country analysis with running time estimation

In [23]:
extractor = ce.CountryExtractor()
%timeit extractor.get_country(data['content'][0])

68.1 µs ± 9.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


68.1 µs ± 9.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

In [None]:
# Extraction of countries from every preprocessed row.
extractor = ce.CountryExtractor()
# Preprocessing stores [] for non-string content, so the list is checked for emptiness
# before its first element is inspected; x[0] on [] would raise IndexError.
data['countries'] = data['content_processed'].apply(
    lambda x: extractor.get_country(x) if len(x) > 0 and isinstance(x[0], str) else []
)

16.2 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

Sentiment analysis with running time estimation

In [31]:
# Re-import the SentimentAnalyser module, build a fresh analyser and time the
# polarity computation on the first row of 'content_processed'.
reload(sa)
sentiment_analyser = sa.sentimentAnalyser()
%timeit sentiment_analyser.get_topic_sentiments_polarity(data['content_processed'][0])

65.9 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


12.8 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [33]:
# Sentiment analysis: compute the polarity for every row and store it in 'Sentiment'.

sentiment_analyser = sa.sentimentAnalyser()

# perf_counter() is monotonic, so the reported duration is not affected by system
# clock adjustments during this long-running pass (time.time() would be).
start_time = time.perf_counter()
data['Sentiment'] = data['content_processed'].apply(
    lambda x: sentiment_analyser.get_topic_sentiments_polarity(x)
)
end_time = time.perf_counter()

elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the intermediate result to disk.
data.to_csv('../data/processed_data.tsv.xz', sep='\t', index=False, compression='xz')

Elapsed Time: 7932.10 seconds


Elapsed Time: 7932.10 seconds

Subjectivity analysis with running time estimation

In [73]:
reload(sa)
sentiment_analyser = sa.sentimentAnalyser()
%timeit sentiment_analyser.get_topic_subjectivity(processed_data['content_processed'][0])

16.4 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


16.4 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [75]:
# Subjectivity analysis for every row.
sentiment_analyser = sa.sentimentAnalyser()

# perf_counter() is monotonic, so the reported duration is not affected by system
# clock adjustments during this long-running pass (time.time() would be).
start_time = time.perf_counter()

# NOTE: the column is called 'objectivity' but holds the output of
# get_topic_subjectivity(); it is renamed to 'subjectivity' near the end of the notebook.
data['objectivity'] = data['content_processed'].apply(
    lambda x: sentiment_analyser.get_topic_subjectivity(x)
)

end_time = time.perf_counter()

elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the intermediate result to disk.
data.to_csv('../data/processed_data2.tsv.xz', sep='\t', index=False, compression='xz')

Elapsed Time: 9175.09 seconds


Elapsed Time: 9175.09 seconds

Text preprocessing of the 'rubric' column with running time estimation

In [144]:
# Build the preprocessor used for the 'rubric' column and time it on the first row only.
# NOTE(review): x.lower() assumes the first 'rubric' value is a string; the full run in
# the following cell guards non-strings with isinstance() — confirm row 0 is never missing.
text_processing = wpp.TextPreprocessing()
%timeit data['rubric'][:1].apply(lambda x: text_processing.preprocess_text(x.lower()))

330 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


330 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In [150]:
# Lower-case and preprocess every 'rubric' string; non-string entries become [].
data['topic'] = data['rubric'].apply(
    lambda rubric: [] if not isinstance(rubric, str) else text_processing.preprocess_text(rubric.lower())
)

Import of processed data

In [114]:
# Re-import the TopicCategorizer module and time the categorisation of the first row.
# NOTE(review): `processed_data` is only assigned by the read_csv cell further down, and
# its 'content_processed' strings are converted to lists in the cell after that; on a
# fresh kernel both of those cells must be run before this one.
reload(tc)
categorizer = tc.TopicCategorizer()
%timeit processed_data['content_processed'][:1].apply(lambda x: categorizer.categorize(x))

14.4 ms ± 1.39 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


14 ms ± 899 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [38]:
# Reload a previously saved intermediate result (xz-compressed, tab-separated).
processed_data = pd.read_csv(
    '../data/processed_data3.tsv.xz',
    sep='\t',
    compression='xz',
)

In [40]:
# The lists in 'content_processed' come back from the TSV file as their string
# representation; parse them back into Python lists.
# ast.literal_eval replaces the previous eval(): it only accepts Python literals, so
# text read from the file can never be executed as arbitrary code.
processed_data['content_processed'] = processed_data['content_processed'].apply(ast.literal_eval)

In [108]:
# Assign a category to every row based on its preprocessed content.
categorizer = tc.TopicCategorizer()

# perf_counter() is monotonic, so the reported duration is not affected by system
# clock adjustments during this long-running pass (time.time() would be).
start_time = time.perf_counter()
processed_data['category'] = processed_data['content_processed'].apply(
    lambda x: categorizer.categorize(x)
)
end_time = time.perf_counter()

elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the intermediate result to disk.
processed_data.to_csv('../data/processed_data5.tsv.xz', sep='\t', index=False, compression='xz')

Elapsed Time: 6759.97 seconds


Elapsed Time: 6759.97 seconds

In [113]:
# Re-import the EntityAndSubjectExtractor module and time the entity extraction
# on the first value of the 'head' column.
reload(ese)
entity_extractor = ese.EntityAndSubjectExtractor()
%timeit processed_data['head'][:1].apply(lambda x: entity_extractor.extract_entities(x))

4.85 ms ± 291 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [110]:
# Extract entities from every value of the 'head' column.
entity_extractor = ese.EntityAndSubjectExtractor()

# perf_counter() is monotonic, so the reported duration is not affected by system
# clock adjustments during this long-running pass (time.time() would be).
start_time = time.perf_counter()
processed_data['entities'] = processed_data['head'].apply(
    lambda x: entity_extractor.extract_entities(x)
)
end_time = time.perf_counter()

elapsed_time = end_time - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Write the intermediate result to disk.
processed_data.to_csv('../data/processed_data6.tsv.xz', sep='\t', index=False, compression='xz')

Elapsed Time: 830.65 seconds


In [93]:
# Median sentiment per 'datum' value, computed once and reused below.
daily_median_sentiment = processed_data['Sentiment'].groupby(processed_data['datum']).median()

# Dedicated names are used instead of `max` / `min` so the Python builtins are not
# overwritten for the rest of the kernel session.
sentiment_max = daily_median_sentiment.max()
sentiment_min = daily_median_sentiment.min()

# Min-max rescale the daily medians to the range [-1, 1] and plot them.
(2 * (daily_median_sentiment - sentiment_min) / (sentiment_max - sentiment_min) - 1).plot()

0.075

In [119]:
# Delete the columns that are not kept in the final frame.
# Note that lower-case 'sentiment' is a different column from 'Sentiment'.
for obsolete_column in ('id', 'rubric', 'sentiment', 'kategorie', 'doctype_description'):
    del processed_data[obsolete_column]

Unnamed: 0,medium_name,head,datum,content_processed,countries,Sentiment,objectivity,entities,category
0,blick.ch,Schweden beschliesst starke Leitzinserhöhung,2022-09-20,"[Die, schwedische, Notenbank, stemmt, kräftige...",['Schweden'],0.700000,0.000000,[Leitzinserhöhung],Wirtschaft
1,srf.ch,Handball-Nati: Schweizerinnen unterliegen Pole...,2022-04-24,"[Die, Schweizer, Handball, Nati, Frauen, verli...","['Ukraine', 'Polen', 'Russland', 'Slowenien', ...",-0.140000,0.000000,"[Schweizerinn, Quali-Ende]",Sport
2,swissinfo.ch,"""Wir befürchten, dass die Tabakinitiative die ...",2022-01-11,"[Wirtschaftskreise, wehren, starke, Einschränk...","['Frankreich', 'Schweiz', 'Italien']",0.150000,0.075000,"[Tabakinitiative, Tür, Werbeverbot]",Politik
3,NZZ am Sonntag,News,2022-08-28,"[Medizincannabis, Nikotin, gehen, Hand, Hand, ...",[],0.000000,0.000000,[News],Wissenschaft & Technik
4,srf.ch,Wenn Wassersportler fliegen lernen,2022-08-15,"[Man, sieht, bald, Schweizer, See, Wingfoiler,...","['Schweiz', 'Australien', 'USA']",0.194118,0.088235,[Wassersportler],Sport
...,...,...,...,...,...,...,...,...,...
153195,Berner Zeitung,Warum Opfer von Online-Hetze oft machtlos sind,2022-11-24,"[Am, Internet, Pranger, Ein, linksradikales, P...","['Schweiz', 'Irland', 'USA']",-0.432558,0.134884,"[Opfer, Online-Hetze]",Politik
153196,20 minuten,Bürocontainer in Vollbrand,2022-02-07,"[ARBON, Aus, unbekannten, Gründen, brannte, Na...",[],-0.850000,0.000000,[Bürocontainer],Regional
153197,blick.ch,Bundespräsident Cassis zu Besuch in Wien,2022-01-13,"[Die, nächsten, Schritte, Schweiz, Europa, Pol...","['Ukraine', 'Österreich', 'USA', 'Russland', '...",0.350000,0.000000,"[Bundespräsident, Besuch]",Politik
153198,NZZ am Sonntag,Classe politique,2022-02-13,"[Ignazio, Cassis, Globetrotter, twitterte, Bil...",['Niger'],0.000000,0.000000,[],Politik


In [122]:
# Give the remaining columns their final names, then display the frame.
processed_data = processed_data.rename(
    columns={
        'Sentiment': 'sentiment',
        'datum': 'date',
        'entities': 'entities_header',
        'objectivity': 'subjectivity',
        'category': 'article_category',
    }
)
processed_data

Unnamed: 0,medium_name,head,date,content_processed,countries,sentiment,subjectivity,entities_header,article_category
0,blick.ch,Schweden beschliesst starke Leitzinserhöhung,2022-09-20,"[Die, schwedische, Notenbank, stemmt, kräftige...",['Schweden'],0.700000,0.000000,[Leitzinserhöhung],Wirtschaft
1,srf.ch,Handball-Nati: Schweizerinnen unterliegen Pole...,2022-04-24,"[Die, Schweizer, Handball, Nati, Frauen, verli...","['Ukraine', 'Polen', 'Russland', 'Slowenien', ...",-0.140000,0.000000,"[Schweizerinn, Quali-Ende]",Sport
2,swissinfo.ch,"""Wir befürchten, dass die Tabakinitiative die ...",2022-01-11,"[Wirtschaftskreise, wehren, starke, Einschränk...","['Frankreich', 'Schweiz', 'Italien']",0.150000,0.075000,"[Tabakinitiative, Tür, Werbeverbot]",Politik
3,NZZ am Sonntag,News,2022-08-28,"[Medizincannabis, Nikotin, gehen, Hand, Hand, ...",[],0.000000,0.000000,[News],Wissenschaft & Technik
4,srf.ch,Wenn Wassersportler fliegen lernen,2022-08-15,"[Man, sieht, bald, Schweizer, See, Wingfoiler,...","['Schweiz', 'Australien', 'USA']",0.194118,0.088235,[Wassersportler],Sport
...,...,...,...,...,...,...,...,...,...
153195,Berner Zeitung,Warum Opfer von Online-Hetze oft machtlos sind,2022-11-24,"[Am, Internet, Pranger, Ein, linksradikales, P...","['Schweiz', 'Irland', 'USA']",-0.432558,0.134884,"[Opfer, Online-Hetze]",Politik
153196,20 minuten,Bürocontainer in Vollbrand,2022-02-07,"[ARBON, Aus, unbekannten, Gründen, brannte, Na...",[],-0.850000,0.000000,[Bürocontainer],Regional
153197,blick.ch,Bundespräsident Cassis zu Besuch in Wien,2022-01-13,"[Die, nächsten, Schritte, Schweiz, Europa, Pol...","['Ukraine', 'Österreich', 'USA', 'Russland', '...",0.350000,0.000000,"[Bundespräsident, Besuch]",Politik
153198,NZZ am Sonntag,Classe politique,2022-02-13,"[Ignazio, Cassis, Globetrotter, twitterte, Bil...",['Niger'],0.000000,0.000000,[],Politik


In [None]:
# Write the final frame to disk as an xz-compressed TSV without the index column.
processed_data.to_csv(
    '../data/processed_data_final.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)