In [61]:
import ast
import time
from importlib import reload

import pandas as pd

import CountryExtractor as ce
import EntityAndSubjectExtractor as ese
import SentimentAnalyser as sa
import TextPreprocessing as wpp
import TopicCategorizer as tc

Reading raw data

In [19]:
# Load the raw, xz-compressed, tab-separated dataset into a DataFrame.
data = pd.read_csv(
    '../data/new_data_vdss.tsv.xz',
    sep='\t',
    compression='xz',
)

In [20]:
# Clean up the date & delete redundant columns

# Keep only the YYYY-MM-DD part of the publication timestamp and parse it.
# expand=False makes str.extract return a Series for the single capture group.
data['date'] = pd.to_datetime(
    data['pubtime'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
)

# Columns considered redundant for the rest of the notebook.
redundant_columns = [
    'pubtime',
    'medium_code',
    'regional',
    'doctype',
    'language',
    'char_count',
    'dateline',
    'subhead',
    'content_id',
    'id',
    'rubric',
    'doctype_description',
]
data = data.drop(columns=redundant_columns)

Text preprocessing for content

In [None]:
preprocessor = wpp.headlinePreprocessing()


def _preprocess_or_empty(text):
    """Preprocess `text` when it is a string; otherwise return an empty list."""
    return preprocessor.preprocess_text(text) if isinstance(text, str) else []


# Run the preprocessing over every article body and measure the wall-clock time.
started_at = time.time()
data['content_processed'] = data['content'].apply(_preprocess_or_empty)
elapsed_time = time.time() - started_at

# Only the preprocessed form is kept; the raw text column is removed here.
del data['content']

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

Country analysis with running time estimation

In [None]:
extractor = ce.CountryExtractor()
%timeit extractor.get_country(data['content'][0])

68.1 µs ± 9.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

In [None]:
# Extraction of countries from every preprocessed article.
# Non-string content is preprocessed to an empty list, so the length check
# must come before x[0]; indexing an empty list raises IndexError.
extractor = ce.CountryExtractor()
data['countries'] = data['content_processed'].apply(
    lambda x: extractor.get_country(x) if len(x) > 0 and isinstance(x[0], str) else []
)

16.2 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

Sentiment analysis with running time estimation

In [None]:
reload(sa)
sentiment_analyser = sa.sentimentAnalyser()
%timeit sentiment_analyser.get_topic_sentiments_polarity(data['content_processed'][0])

12.8 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
# Sentiment analysis

sentiment_analyser = sa.sentimentAnalyser()

# Compute the polarity for every preprocessed article and time the full pass.
started_at = time.time()
data['sentiment'] = data['content_processed'].apply(
    sentiment_analyser.get_topic_sentiments_polarity
)
elapsed_time = time.time() - started_at

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Persist the frame, including the new 'sentiment' column, to disk.
data.to_csv(
    '../data/processed_data.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 7932.10 seconds

Subjectivity analysis with running time estimation

In [None]:
reload(sa)
sentiment_analyser = sa.sentimentAnalyser()
%timeit sentiment_analyser.get_topic_subjectivity(data['content_processed'][0])

16.4 ms ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
sentiment_analyser = sa.sentimentAnalyser()

# Compute the subjectivity for every preprocessed article and time the full pass.
started_at = time.time()
data['subjectivity'] = data['content_processed'].apply(
    sentiment_analyser.get_topic_subjectivity
)
elapsed_time = time.time() - started_at

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Persist the frame, including the new 'subjectivity' column, to disk.
data.to_csv(
    '../data/processed_data2.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 9175.09 seconds

Running time estimation for categorizing content

In [None]:
reload(tc)
categorizer = tc.TopicCategorizer()
%timeit data['content_processed'][:1].apply(lambda x: categorizer.categorize(x))

14 ms ± 899 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
categorizer = tc.TopicCategorizer()

# Assign a category to every preprocessed article and time the full pass.
started_at = time.time()
data['article_category'] = data['content_processed'].apply(categorizer.categorize)
elapsed_time = time.time() - started_at

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Persist the frame, including the new 'article_category' column, to disk.
data.to_csv(
    '../data/processed_data5.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 6759.97 seconds

Running time estimation for extracting entities

In [None]:
reload(ese)
entity_extractor = ese.EntityAndSubjectExtractor()
%timeit processed_data['head'][:1].apply(lambda x: entity_extractor.extract_entities(x))

4.85 ms ± 291 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [None]:
entity_extractor = ese.EntityAndSubjectExtractor()

# Extract the entities from every headline and time the full pass.
started_at = time.time()
data['entities_header'] = data['head'].apply(entity_extractor.extract_entities)
elapsed_time = time.time() - started_at

print("Elapsed Time: {:.2f} seconds".format(elapsed_time))

# Persist the final frame, including the new 'entities_header' column, to disk.
data.to_csv(
    '../data/processed_data_final.tsv.xz',
    sep='\t',
    index=False,
    compression='xz',
)

Elapsed Time: 830.65 seconds

Import processed data, evaluating the stringified content_processed and countries columns to convert them back into lists

In [126]:
# Reload the processed dataset from disk.
proc_data = pd.read_csv('../data/processed_data_final.tsv.xz', sep='\t', compression='xz')

# The list-valued columns are stored in the TSV as the string form of a Python
# list. ast.literal_eval parses such literals back into lists but, unlike the
# built-in eval, refuses anything that is not a plain literal, so content of
# the file can never be executed as code.
for list_column in ('content_processed', 'countries'):
    proc_data[list_column] = proc_data[list_column].apply(ast.literal_eval)

In [128]:
# Preview the first five rows of the reloaded frame.
proc_data.head(n=5)

Unnamed: 0,medium_name,head,date,content_processed,countries,sentiment,subjectivity,entities_header,article_category
0,blick.ch,Schweden beschliesst starke Leitzinserhöhung,2022-09-20,"[Die, schwedische, Notenbank, stemmt, kräftige...",[Schweden],0.7,0.0,['Leitzinserhöhung'],Wirtschaft
1,srf.ch,Handball-Nati: Schweizerinnen unterliegen Pole...,2022-04-24,"[Die, Schweizer, Handball, Nati, Frauen, verli...","[Ukraine, Polen, Russland, Slowenien, Schweiz]",-0.14,0.0,"['Schweizerinn', 'Quali-Ende']",Sport
2,swissinfo.ch,"""Wir befürchten, dass die Tabakinitiative die ...",2022-01-11,"[Wirtschaftskreise, wehren, starke, Einschränk...","[Frankreich, Schweiz, Italien]",0.15,0.075,"['Tabakinitiative', 'Tür', 'Werbeverbot']",Politik
3,NZZ am Sonntag,News,2022-08-28,"[Medizincannabis, Nikotin, gehen, Hand, Hand, ...",[],0.0,0.0,['News'],Wissenschaft & Technik
4,srf.ch,Wenn Wassersportler fliegen lernen,2022-08-15,"[Man, sieht, bald, Schweizer, See, Wingfoiler,...","[Schweiz, Australien, USA]",0.194118,0.088235,['Wassersportler'],Sport
