In [1]:
import ast
import time
from importlib import reload

import pandas as pd

import CountryExtractor as ce
import SentimentAnalyser as sa
import TextPreprocessing as wpp
import TopicCategorizer as tc

Reading raw data

In [19]:
# Load the raw article export: an xz-compressed, tab-separated file.
data = pd.read_csv(
    '../data/new_data_vdss.tsv.xz',
    sep='\t',
    compression='xz',
)

In [20]:
# Clean up the date and drop redundant columns.

# Keep only the YYYY-MM-DD part of 'pubtime' and parse it into a datetime column.
data['datum'] = pd.to_datetime(
    data['pubtime'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
)

# Remove the source timestamp and the columns treated as redundant here.
data = data.drop(columns=[
    'pubtime',
    'medium_code',
    'regional',
    'doctype',
    'language',
    'char_count',
    'dateline',
    'subhead',
    'content_id',
])

Text preprocessing for content

In [21]:
# Run the text preprocessor over every article body ('content') and report how
# long the full pass takes.

preprocessor = wpp.headlinePreprocessing()


def _preprocess_or_empty(text):
    """Return the preprocessed text; cells that are not strings yield an empty list."""
    return preprocessor.preprocess_text(text) if isinstance(text, str) else []


start_time = time.time()
data['content_processed'] = data['content'].map(_preprocess_or_empty)
elapsed_time = time.time() - start_time

# Drop the raw text now that the processed version is stored.
del data['content']

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

KeyboardInterrupt: 

In [22]:
# Normalise blick.ch rubrics: keep only the part before the first ' - ' separator.
is_blick = data['medium_name'] == 'blick.ch'
blick_rubrics = data.loc[is_blick, 'rubric']
data.loc[is_blick, 'rubric'] = blick_rubrics.str.split(' - ').str[0]
filtered_data = data[is_blick]['rubric']
data

Unnamed: 0,id,medium_name,rubric,doctype_description,head,content,datum
0,47612274,blick.ch,Wirtschaft,Online medium,Schweden beschliesst starke Leitzinserhöhung,<tx><ld><p>Die schwedische Notenbank stemmt si...,2022-09-20
1,46153316,srf.ch,Sport - Mehr Sport,Online medium,Handball-Nati: Schweizerinnen unterliegen Pole...,<tx><ld><p>Die Schweizer Handball-Nati der Fra...,2022-04-24
2,45099545,swissinfo.ch,Gesundheit,Online medium,"""Wir befürchten, dass die Tabakinitiative die ...",<tx><ld><p>Wirtschaftskreise wehren sich gegen...,2022-01-11
3,47381556,NZZ am Sonntag,Mensch und Medizin,Regional weekly newspaper,News,<tx><zt>Medizincannabis und Nikotin gehen Hand...,2022-08-28
4,47256809,srf.ch,Radio SRF 1,Online medium,Wenn Wassersportler fliegen lernen,<tx><ld><p>Man sieht sie bald auf jedem Schwei...,2022-08-15
...,...,...,...,...,...,...,...
153195,48273157,Berner Zeitung,Region,Regional daily newspaper,Warum Opfer von Online-Hetze oft machtlos sind,<tx><ld><p>Am Internet-Pranger Ein linksradika...,2022-11-24
153196,45361730,20 minuten,St. Gallen/Region,Regional daily newspaper,Bürocontainer in Vollbrand,<tx><p>ARBON. Aus noch unbekannten Gründen bra...,2022-02-07
153197,45122731,blick.ch,Politik,Online medium,Bundespräsident Cassis zu Besuch in Wien,<tx><ld><p>Die nächsten Schritte der Schweiz i...,2022-01-13
153198,45429217,NZZ am Sonntag,Schweiz,Regional weekly newspaper,Classe politique,"<tx><p>Ignazio Cassis, Globetrotter, twitterte...",2022-02-13


Country analysis with running time estimation

In [23]:
extractor = ce.CountryExtractor()
%timeit extractor.get_country(data['content'][0])

68.1 µs ± 9.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [25]:
# Extract the countries mentioned in each preprocessed article.
extractor = ce.CountryExtractor()
# Rows whose content was not a string were preprocessed to an empty list; the
# truthiness check keeps x[0] from raising IndexError on those rows, which
# simply get an empty country list.
data['countries'] = data['content_processed'].apply(
    lambda x: extractor.get_country(x) if x and isinstance(x[0], str) else []
)
data

Unnamed: 0,id,medium_name,rubric,doctype_description,head,datum,content_processed,countries
0,47612274,blick.ch,Wirtschaft,Online medium,Schweden beschliesst starke Leitzinserhöhung,2022-09-20,"[Die, schwedische, Notenbank, stemmt, kräftige...",[Schweden]
1,46153316,srf.ch,Sport - Mehr Sport,Online medium,Handball-Nati: Schweizerinnen unterliegen Pole...,2022-04-24,"[Die, Schweizer, Handball, Nati, Frauen, verli...","[Ukraine, Polen, Russland, Slowenien, Schweiz]"
2,45099545,swissinfo.ch,Gesundheit,Online medium,"""Wir befürchten, dass die Tabakinitiative die ...",2022-01-11,"[Wirtschaftskreise, wehren, starke, Einschränk...","[Frankreich, Schweiz, Italien]"
3,47381556,NZZ am Sonntag,Mensch und Medizin,Regional weekly newspaper,News,2022-08-28,"[Medizincannabis, Nikotin, gehen, Hand, Hand, ...",[]
4,47256809,srf.ch,Radio SRF 1,Online medium,Wenn Wassersportler fliegen lernen,2022-08-15,"[Man, sieht, bald, Schweizer, See, Wingfoiler,...","[Schweiz, Australien, USA]"
...,...,...,...,...,...,...,...,...
153195,48273157,Berner Zeitung,Region,Regional daily newspaper,Warum Opfer von Online-Hetze oft machtlos sind,2022-11-24,"[Am, Internet, Pranger, Ein, linksradikales, P...","[Schweiz, Irland, USA]"
153196,45361730,20 minuten,St. Gallen/Region,Regional daily newspaper,Bürocontainer in Vollbrand,2022-02-07,"[ARBON, Aus, unbekannten, Gründen, brannte, Na...",[]
153197,45122731,blick.ch,Politik,Online medium,Bundespräsident Cassis zu Besuch in Wien,2022-01-13,"[Die, nächsten, Schritte, Schweiz, Europa, Pol...","[Ukraine, Österreich, USA, Russland, Schweiz]"
153198,45429217,NZZ am Sonntag,Schweiz,Regional weekly newspaper,Classe politique,2022-02-13,"[Ignazio, Cassis, Globetrotter, twitterte, Bil...",[Niger]


16.2 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

Sentiment analysis with running time estimation

In [31]:
# Re-import the SentimentAnalyser module so edits made since the first import take effect.
reload(sa)
sentiment_analyser = sa.sentimentAnalyser()
# Time the polarity computation on the preprocessed article at index label 0.
%timeit sentiment_analyser.get_topic_sentiments_polarity(data['content_processed'][0])

65.9 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


12.8 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [33]:
# Compute the polarity for every preprocessed article, time the run, and write
# the frame (including the new 'Sentiment' column) to disk.

sentiment_analyser = sa.sentimentAnalyser()

start_time = time.time()
data['Sentiment'] = data['content_processed'].map(
    sentiment_analyser.get_topic_sentiments_polarity
)
elapsed_time = time.time() - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

data.to_csv(
    '../data/processed_data.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 7932.10 seconds


Elapsed Time: 7932.10 seconds

Subjectivity analysis with running time estimation

In [73]:
reload(sa)
sentiment_analyser = sa.sentimentAnalyser()
%timeit sentiment_analyser.get_topic_subjectivity(processed_data['content_processed'][0])

16.4 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [75]:
# Compute the subjectivity score for every preprocessed article, time the run,
# and write the frame to disk. Note that the result of get_topic_subjectivity is
# stored in a column named 'objectivity'.
sentiment_analyser = sa.sentimentAnalyser()

start_time = time.time()
data['objectivity'] = data['content_processed'].map(
    sentiment_analyser.get_topic_subjectivity
)
elapsed_time = time.time() - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

data.to_csv(
    '../data/processed_data2.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 9175.09 seconds


Elapsed Time: 9175.09 seconds

Text preprocessing of 'rubric' with running time estimation

In [144]:
# Preprocessor instance for the 'rubric' column; the cell that builds 'topic' reuses it.
text_processing = wpp.TextPreprocessing()
# Time lower-casing + preprocessing on a one-row slice ([:1]) of the rubric column.
%timeit data['rubric'][:1].apply(lambda x: text_processing.preprocess_text(x.lower()))

330 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


330 µs ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In [150]:
# Derive 'topic' from the lower-cased rubric; rubrics that are not strings become [].
data['topic'] = data['rubric'].map(
    lambda rubric: text_processing.preprocess_text(rubric.lower())
    if isinstance(rubric, str)
    else []
)

Import of processed data

In [38]:
# Reload the checkpoint written after the subjectivity step.
processed_data = pd.read_csv(
    '../data/processed_data2.tsv.xz',
    sep='\t',
    compression='xz',
)

In [40]:
# The token lists come back from the TSV as strings; parse them into Python
# lists again. ast.literal_eval only accepts literals, so unlike eval it cannot
# execute code that might be embedded in the file.
processed_data['content_processed'] = processed_data['content_processed'].apply(ast.literal_eval)

In [46]:
import TopicCategorizer as tc
reload(tc)
%timeit processed_data['content_processed'][:1].apply(lambda x: categorizer.categorize(x))

14 ms ± 899 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Assign a category to every article from its preprocessed tokens, time the run,
# and write the frame (including the new 'kategorie' column) to disk.
categorizer = tc.TopicCategorizer()

start_time = time.time()
processed_data['kategorie'] = processed_data['content_processed'].map(
    categorizer.categorize
)
elapsed_time = time.time() - start_time

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

processed_data.to_csv(
    '../data/processed_data3.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

In [45]:
# Display the processed frame to inspect the derived columns.
processed_data

Unnamed: 0,id,medium_name,rubric,doctype_description,head,datum,content_processed,countries,Sentiment,objectivity,kategorie
0,47612274,blick.ch,Wirtschaft,Online medium,Schweden beschliesst starke Leitzinserhöhung,2022-09-20,"[Die, schwedische, Notenbank, stemmt, kräftige...",['Schweden'],0.700000,0.000000,Wirtschaft
1,46153316,srf.ch,Sport - Mehr Sport,Online medium,Handball-Nati: Schweizerinnen unterliegen Pole...,2022-04-24,"[Die, Schweizer, Handball, Nati, Frauen, verli...","['Ukraine', 'Polen', 'Russland', 'Slowenien', ...",-0.140000,0.000000,Sport
2,45099545,swissinfo.ch,Gesundheit,Online medium,"""Wir befürchten, dass die Tabakinitiative die ...",2022-01-11,"[Wirtschaftskreise, wehren, starke, Einschränk...","['Frankreich', 'Schweiz', 'Italien']",0.150000,0.075000,Politik
3,47381556,NZZ am Sonntag,Mensch und Medizin,Regional weekly newspaper,News,2022-08-28,"[Medizincannabis, Nikotin, gehen, Hand, Hand, ...",[],0.000000,0.000000,Wissenschaft & Technik
4,47256809,srf.ch,Radio SRF 1,Online medium,Wenn Wassersportler fliegen lernen,2022-08-15,"[Man, sieht, bald, Schweizer, See, Wingfoiler,...","['Schweiz', 'Australien', 'USA']",0.194118,0.088235,Sport
...,...,...,...,...,...,...,...,...,...,...,...
153195,48273157,Berner Zeitung,Region,Regional daily newspaper,Warum Opfer von Online-Hetze oft machtlos sind,2022-11-24,"[Am, Internet, Pranger, Ein, linksradikales, P...","['Schweiz', 'Irland', 'USA']",-0.432558,0.134884,
153196,45361730,20 minuten,St. Gallen/Region,Regional daily newspaper,Bürocontainer in Vollbrand,2022-02-07,"[ARBON, Aus, unbekannten, Gründen, brannte, Na...",[],-0.850000,0.000000,
153197,45122731,blick.ch,Politik,Online medium,Bundespräsident Cassis zu Besuch in Wien,2022-01-13,"[Die, nächsten, Schritte, Schweiz, Europa, Pol...","['Ukraine', 'Österreich', 'USA', 'Russland', '...",0.350000,0.000000,
153198,45429217,NZZ am Sonntag,Schweiz,Regional weekly newspaper,Classe politique,2022-02-13,"[Ignazio, Cassis, Globetrotter, twitterte, Bil...",['Niger'],0.000000,0.000000,
