### Import libraries

In [1]:
import pandas as pd
import numpy as np
import os

### Import dataset

In [2]:
file_path = os.path.join("../dataset", "twitter_URLs.bin")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,expanded_url,domain,tag
0,https://twitter.com/kjh_mov/status/12996294579...,twitter.com,
1,https://twitter.com/kjh_mov/status/12996294579...,twitter.com,
2,https://twitter.com/kjh_mov/status/12996294579...,twitter.com,
3,http://ntv.de,ntv.de,
4,https://twitter.com/kjh_mov/status/12996294506...,twitter.com,


# 1. Data Understanding

In [3]:
df.describe()

Unnamed: 0,expanded_url,domain,tag
count,401411,401411,27904
unique,286502,9948,9
top,https://support.twitter.com/articles/20169199,twitter.com,Youtube
freq,623,288363,25311


In [4]:
df.dtypes

expanded_url    object
domain          object
tag             object
dtype: object

In [5]:
df["domain"].value_counts()

domain
twitter.com              288363
youtu.be                  14964
youtube.com               10339
welt.de                    1836
tagesspiegel.de            1804
                          ...  
comicsands.com                1
morawa.at                     1
abbott.mediaroom.com          1
stadtkindfrankfurt.de         1
islam.de                      1
Name: count, Length: 9948, dtype: int64

In [6]:
#df.profile_report()

# 2. Data Processing

### 2.1 Extract news links

In [7]:
df = df.loc[(df.domain == "welt.de") | (df.domain == "tagesspiegel.de") | (df.domain == "tagesschau.de") | (df.domain == "spiegel.de") | (df.domain == "zeit.de")]
df.describe()

Unnamed: 0,expanded_url,domain,tag
count,8101,8101,0.0
unique,3804,5,0.0
top,https://www.tagesspiegel.de/themen/reportage/q...,welt.de,
freq,131,1836,


### 2.2 drop duplicate rows and reset index

In [8]:
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,expanded_url,domain,tag
count,3804,3804,0.0
unique,3804,5,0.0
top,https://www.welt.de/politik/deutschland/articl...,spiegel.de,
freq,1,848,


### 2.3 extract title, text, summary and keywords

In [9]:
# !pip3 install newspaper3k

In [10]:
from newspaper import Article

#create new columns for article title, text, summary and keywords
df["title"] = np.nan
df["text"] = np.nan
df["summary"] = np.nan
df["keywords"] = np.nan

#extract title, text, summary and keywords for each article and copy to corresponding row-column
url_list = df.expanded_url
for i in range(0,len(url_list)):
    print("Article number: "+str(i))
    try:
        article = Article(url_list[i], language="de")
        article.download()
        article.parse()
        article.nlp()
    
        df["title"][i] = article.title
        df["text"][i] = article.text
        df["summary"][i] = article.summary
        df["keywords"][i] = article.keywords
    except Exception as e:
        print(e)

Article number: 0


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["title"][i] = article.title
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"][i] = article.title
 

Article number: 1
Article number: 2
Article number: 3
Article number: 4
Article number: 5
Article number: 6
Article number: 7
Article number: 8
Article number: 9
Article number: 10
Article number: 11
Article number: 12
Article number: 13
Article number: 14
Article number: 15
Article number: 16
Article number: 17
Article number: 18
Article number: 19
Article number: 20
Article number: 21
Article number: 22
Article number: 23
Article number: 24
Article number: 25
Article `download()` failed with 404 Client Error: Not Found for url: https://www.tagesschau.de/ausland/corona--infektionen-frankreich-101.html on URL http://www.tagesschau.de/ausland/corona--infektionen-frankreich-101.html
Article number: 26
Article number: 27
Article number: 28
Article number: 29
Article number: 30
Article number: 31
Article number: 32
Article number: 33
Article number: 34
Article number: 35
Article number: 36
Article number: 37
Article number: 38
Article number: 39
Article number: 40
Article number: 41
Articl

Article number: 2181
Article number: 2182
Article number: 2183
Article number: 2184
Article number: 2185
Article number: 2186
Article number: 2187
Article number: 2188
Article number: 2189
Article number: 2190
Article number: 2191
Article number: 2192
Article number: 2193
Article number: 2194
Article number: 2195
Article number: 2196
Article number: 2197
Article number: 2198
Article number: 2199
Article number: 2200
Article number: 2201
Article number: 2202
Article `download()` failed with 404 Client Error: Not Found for url: https://www.tagesspiegel.de/meinung/verunglueckte-netz-kampagne-alles-dicht-machen-ist-so-schaebig-dass-es-weh-tut/27123990.html on URL https://www.tagesspiegel.de/meinung/verunglueckte-netz-kampagne-alles-dicht-machen-ist-so-schaebig-dass-es-weh-tut/27123990.html
Article number: 2203
Article `download()` failed with 404 Client Error: Not Found for url: https://m.tagesspiegel.de/meinung/verunglueckte-netz-kampagne-alles-dicht-machen-ist-so-schaebig-dass-es-weh-tut

Article number: 2491
Article number: 2492
Article number: 2493
Article number: 2494
Article number: 2495
Article number: 2496
Article number: 2497
Article number: 2498
Article number: 2499
Article number: 2500
Article number: 2501
Article number: 2502
Article number: 2503
Article number: 2504
Article number: 2505
Article number: 2506
Article number: 2507
Article number: 2508
Article number: 2509
Article number: 2510
Article number: 2511
Article number: 2512
Article number: 2513
Article number: 2514
Article number: 2515
Article number: 2516
Article number: 2517
Article number: 2518
Article number: 2519
Article number: 2520
Article number: 2521
Article number: 2522
Article number: 2523
Article number: 2524
Article number: 2525
Article number: 2526
Article `download()` failed with 404 Client Error: Not Found for url: https://www.tagesschau.de/sport/sportschau/sportschau-story-39467.html on URL https://www.tagesschau.de/sport/sportschau/sportschau-story-39467.html
Article number: 2527
Arti

Article number: 2860
Article number: 2861
Article number: 2862
Article number: 2863
Article number: 2864
Article number: 2865
Article number: 2866
Article number: 2867
Article number: 2868
Article number: 2869
Article number: 2870
Article number: 2871
Article number: 2872
Article number: 2873
Article number: 2874
Article number: 2875
Article number: 2876
Article number: 2877
Article number: 2878
Article number: 2879
Article number: 2880
Article number: 2881
Article number: 2882
Article number: 2883
Article number: 2884
Article number: 2885
Article number: 2886
Article number: 2887
Article number: 2888
Article number: 2889
Article number: 2890
Article number: 2891
Article number: 2892
Article number: 2893
Article number: 2894
Article number: 2895
Article number: 2896
Article number: 2897
Article number: 2898
Article number: 2899
Article number: 2900
Article number: 2901
Article number: 2902
Article number: 2903
Article number: 2904
Article number: 2905
Article number: 2906
Article numbe

Article number: 3195
Article number: 3196
Article number: 3197
Article number: 3198
Article number: 3199
Article number: 3200
Article number: 3201
Article number: 3202
Article number: 3203
Article number: 3204
Article number: 3205
Article number: 3206
Article number: 3207
Article number: 3208
Article number: 3209
Article number: 3210
Article number: 3211
Article number: 3212
Article number: 3213
Article number: 3214
Article number: 3215
Article number: 3216
Article number: 3217
Article number: 3218
Article number: 3219
Article number: 3220
Article number: 3221
Article number: 3222
Article number: 3223
Article number: 3224
Article number: 3225
Article number: 3226
Article number: 3227
Article number: 3228
Article number: 3229
Article number: 3230
Article number: 3231
Article number: 3232
Article number: 3233
Article number: 3234
Article number: 3235
Article number: 3236
Article number: 3237
Article number: 3238
Article number: 3239
Article number: 3240
Article number: 3241
Article numbe

Article number: 3556
Article number: 3557
Article number: 3558
Article number: 3559
Article number: 3560
Article number: 3561
Article number: 3562
Article number: 3563
Article number: 3564
Article number: 3565
Article number: 3566
Article number: 3567
Article number: 3568
Article number: 3569
Article number: 3570
Article number: 3571
Article number: 3572
Article number: 3573
Article number: 3574
Article number: 3575
Article number: 3576
Article number: 3577
Article number: 3578
Article number: 3579
Article number: 3580
Article number: 3581
Article number: 3582
Article number: 3583
Article number: 3584
Article number: 3585
Article number: 3586
Article number: 3587
Article number: 3588
Article number: 3589
Article number: 3590
Article number: 3591
Article number: 3592
Article number: 3593
Article number: 3594
Article number: 3595
Article number: 3596
Article number: 3597
Article number: 3598
Article number: 3599
Article number: 3600
Article number: 3601
Article number: 3602
Article numbe

In [11]:
df_bkp = df.copy()
df_bkp.tail()

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords
3799,https://www.welt.de/regionales/hamburg/article...,welt.de,,Polizei startet sechste G20-Öffentlichkeitsfah...,Hamburg (dpa/lno) - Zwei Jahre nach den Aussch...,Hamburg (dpa/lno) - Zwei Jahre nach den Aussch...,"[polizei, jahre, öffentlichkeitsfahndung, g20ö..."
3800,https://www.zeit.de/gesellschaft/zeitgeschehen...,zeit.de,,Versammlungsfreiheit : Im Zweifel für die Frei...,"Es sind wuchtige, fast pathetische Sätze, die ...","""Das Recht, sich ungehindert [...] zu versamme...","[zeiten, corona, virus, verwaltungsgericht, br..."
3801,https://www.welt.de/politik/deutschland/articl...,welt.de,,Demonstrationen in Berlin: Polizei will aktiv ...,Dass bei Demonstrationen gegen Trumps Jerusale...,Dass bei Demonstrationen gegen Trumps Jerusale...,"[strafbar, aktiv, demonstration, juden, verbra..."
3802,https://www.spiegel.de/panorama/justiz/us-panz...,spiegel.de,,Sachsen: Polizei stoppt Konvoi mit US-Panzern ...,Eine Streife der Verkehrspolizei hat auf der A...,Eine Streife der Verkehrspolizei hat auf der A...,"[konvoi, uspanzern, transportkonvoi, typ, baut..."
3803,https://www.tagesspiegel.de/images/fdp_keinesa...,tagesspiegel.de,,,,,


In [12]:
df.isna().sum()

expanded_url       0
domain             0
tag             3804
title            216
text             216
summary          216
keywords         216
dtype: int64

### 2.4 removing record if keywords field is null

In [13]:
#remove all rows which has no keywords and reset index
df = df[~df['keywords'].isnull()]
df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords
count,3588,3588,0.0,3588,3588,3588,3588
unique,3588,5,0.0,2375,2189,2169,2387
top,https://www.welt.de/politik/deutschland/articl...,spiegel.de,,Aktuelle Nachrichten aus Deutschland,Hauptnavigation: Nutzen Sie die Tabulatortaste...,Hauptnavigation: Nutzen Sie die Tabulatortaste...,"[koalitionspartner, nachrichten, union, stimmt..."
freq,1,838,,162,179,179,89


In [14]:
df.isna().sum()

expanded_url       0
domain             0
tag             3588
title              0
text               0
summary            0
keywords           0
dtype: int64

In [15]:
df.expanded_url.nunique()

3588

### 2.5 removing rows with .pdf/.jpg/.png links

In [16]:
#add code for removing pdf links
df_bkp_2 = df.copy()

df=df[~((df.expanded_url.str.endswith(".pdf")) | (df.expanded_url.str.endswith(".jpg")) | (df.expanded_url.str.endswith(".png")))]
#reset index
df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords
count,3578,3578,0.0,3578,3578,3578,3578
unique,3578,5,0.0,2374,2189,2169,2386
top,https://www.welt.de/politik/deutschland/articl...,spiegel.de,,Aktuelle Nachrichten aus Deutschland,Hauptnavigation: Nutzen Sie die Tabulatortaste...,Hauptnavigation: Nutzen Sie die Tabulatortaste...,"[koalitionspartner, nachrichten, union, stimmt..."
freq,1,838,,162,179,179,89


In [17]:
df.isna().sum()

expanded_url       0
domain             0
tag             3578
title              0
text               0
summary            0
keywords           0
dtype: int64

### 2.6 load library - word2vec

In [18]:
# !pip install --upgrade gensim

In [20]:
#load word2vec
from gensim.models import KeyedVectors
vecs = KeyedVectors.load_word2vec_format('../model/wiki.de.vec', binary=False)

### 2.7 calculate vector average

In [22]:
df["vec_sum_avg"] = np.array

for i in range(len(df["vec_sum_avg"])):    
    print("******* Article number: "+str(i)+" **********")
    num = 0
    vec_sum = 0
    
    for j in range(len(df["keywords"][i])):
        try:
            #vec_temp = vecs.word_vec(df["keywords"][i][j], use_norm=False)        
            vec_temp = vecs.word_vec(df["keywords"][i][j])
        except Exception as e:
            print(e)        
        else:
            num = num + 1
            vec_sum = vec_sum + vec_temp
            
    try:
        vec_avg = vec_sum/num
        df["vec_sum_avg"][i] = vec_avg
    except Exception as e:        
        print(e)    

******* Article number: 0 **********
"Key 'asylzuwanderern' not present"
******* Article number: 1 **********
******* Article number: 2 **********
"Key '40' not present"
"Key 'escapetastehauptnavigation' not present"
"Key 'zdfpolitbarometer' not present"
"Key 'leertasteprodukte' not present"
******* Article number: 3 **********
******* Article number: 4 **********
******* Article number: 5 **********
"Key 'aidspandemie' not present"
"Key 'aidskrise' not present"
"Key 'coronakrise' not present"
"Key 'covid19' not present"
******* Article number: 6 **********
******* Article number: 7 **********
"Key 'afdflügel' not present"
******* Article number: 8 **********
"Key 'coronademo' not present"
"Key 'coronazeiten' not present"
******* Article number: 9 **********
"Key '2018' not present"
******* Article number: 10 **********
"Key 'täteropfergemengelage' not present"
"Key 'stasiarchiv' not present"
"Key 'stasifunktionärs' not present"
******* Article number: 11 **********
"Key 'coronatests' 

  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the 

******* Article number: 170 **********
"Key 'überprüfungswege' not present"
"Key 'coronaskeptiker' not present"
******* Article number: 171 **********
"Key 'snewsletter' not present"
"Key 'dflkonzept' not present"
"Key 'spiegelde' not present"
******* Article number: 172 **********
"Key '60' not present"
******* Article number: 173 **********
"Key 'covid19' not present"
******* Article number: 174 **********
******* Article number: 175 **********
******* Article number: 176 **********
"Key 'reiserückkehrer' not present"
"Key 'testpflicht' not present"
"Key 'quarantäneregime' not present"
******* Article number: 177 **********
"Key '10' not present"
"Key 'krampkarrenbauer' not present"
******* Article number: 178 **********
******* Article number: 179 **********
"Key 'usatombomben' not present"
"Key 'f18' not present"
"Key 'tornadonachfolge' not present"
"Key 'werdenzudem' not present"
"Key 'tornadoflotte' not present"
"Key 'usherstellers' not present"
******* Article number: 180 ******

  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the 

"Key 'weiteralle' not present"
******* Article number: 397 **********
******* Article number: 398 **********
"Key 'occupycamp' not present"
"Key 'occupybewegung' not present"
******* Article number: 399 **********
"Key 'coronapandemie' not present"
"Key 'tegnell' not present"
"Key 'schulkurs' not present"
******* Article number: 400 **********
******* Article number: 401 **********
"Key 'leertasteprodukte' not present"
"Key 'escapetastehauptnavigation' not present"
******* Article number: 402 **********
******* Article number: 403 **********
"Key 'snewsletter' not present"
"Key 'spiegelde' not present"
******* Article number: 404 **********
"Key 'psgniederlage' not present"
"Key 'warrandalierende' not present"
******* Article number: 405 **********
"Key 'coronainfizierter' not present"
******* Article number: 406 **********
******* Article number: 407 **********
"Key '1' not present"
******* Article number: 408 **********
"Key 'sarscov2virus' not present"
"Key 'covid19' not present"
**

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never 

******* Article number: 685 **********
******* Article number: 686 **********
"Key 'coronademo' not present"
"Key 'weltplus' not present"
******* Article number: 687 **********
"Key '38000' not present"
"Key 'coronapolitik' not present"
******* Article number: 688 **********
"Key '242000' not present"
"Key 'unteilbardemo' not present"
******* Article number: 689 **********
"Key 'wirdimmer' not present"
"Key 'faktenfinder' not present"
******* Article number: 690 **********
******* Article number: 691 **********
"Key 'anticoronademonstration' not present"
******* Article number: 692 **********
"Key 'querdenkenbewegung' not present"
"Key 'compactchef' not present"
******* Article number: 693 **********
"Key 'coronademonstrationen' not present"
******* Article number: 694 **********
"Key 'westlobby' not present"
"Key 'weltplus' not present"
******* Article number: 695 **********
"Key 'querdenkerdemonstration' not present"
"Key 'coronaprotest' not present"
******* Article number: 696 *****

  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the 

******* Article number: 1064 **********
"Key 'seienfür' not present"
"Key 'teststellen' not present"
"Key 'freitesten' not present"
"Key 'coronaviruspandemie' not present"
******* Article number: 1065 **********
"Key 'escapetastehauptnavigation' not present"
"Key 'leertasteprodukte' not present"
******* Article number: 1066 **********
"Key 'maskenpflicht' not present"
******* Article number: 1067 **********
"Key 'verschwörungstheoretikerinnen' not present"
"Key 'coronakritiker' not present"
"Key 'coronademos' not present"
******* Article number: 1068 **********
"Key 'maskenpflicht' not present"
"Key 'coronademos' not present"
******* Article number: 1069 **********
"Key 'maskenpflicht' not present"
"Key 'coronaimmunitätsstudie' not present"
******* Article number: 1070 **********
******* Article number: 1071 **********
******* Article number: 1072 **********
"Key 'coronamaßnahmen' not present"
"Key 'maskenpflicht' not present"
"Key 'mundnasenschutz' not present"
******* Article number:

  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the 

******* Article number: 1459 **********
******* Article number: 1460 **********
"Key 'coronakrise' not present"
"Key '19' not present"
"Key 'coronanews' not present"
******* Article number: 1461 **********
"Key 'spaßvideos' not present"
******* Article number: 1462 **********
******* Article number: 1463 **********
"Key 'megapower' not present"
"Key 'maskenverweigerer' not present"
"Key 'risikopiloten' not present"
"Key 'coronakrise' not present"
******* Article number: 1464 **********
******* Article number: 1465 **********
"Key 'kokainspuren' not present"
******* Article number: 1466 **********
******* Article number: 1467 **********
"Key '100' not present"
"Key '1916' not present"
******* Article number: 1468 **********
"Key 'coronademo' not present"
"Key 'uspräsidenten' not present"
******* Article number: 1469 **********
******* Article number: 1470 **********
"Key 'lübcketweet' not present"
"Key 'werteunion' not present"
******* Article number: 1471 **********
"Key 'rwert' not pr

  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the 

"Key 'coronaimpfstoff' not present"
******* Article number: 1694 **********
******* Article number: 1695 **********
"Key 'afdwähler' not present"
******* Article number: 1696 **********
"Key 'escapetastehauptnavigation' not present"
"Key 'leertasteprodukte' not present"
******* Article number: 1697 **********
"Key 'zurückmeldendies' not present"
******* Article number: 1698 **********
"Key 'coronakrise' not present"
"Key 'covid19' not present"
******* Article number: 1699 **********
"Key 'ökostromforderung' not present"
******* Article number: 1700 **********
"Key 'coronainfektion' not present"
"Key 'scheibenbogen' not present"
******* Article number: 1701 **********
"Key 'werdentom' not present"
"Key 'weltinterview' not present"
"Key 'allesdichtmachen' not present"
******* Article number: 1702 **********
"Key 'coronadebatte' not present"
"Key 'allesdichtmachen' not present"
"Key '50' not present"
******* Article number: 1703 **********
"Key 'rüstungsboom' not present"
"Key 'zügein' no

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never 

"Key 'allesdichtmachen' not present"
"Key 'coronakritik' not present"
******* Article number: 2011 **********
"Key 'allesdichtmachen' not present"
******* Article number: 2012 **********
"Key 'rüstungsboom' not present"
"Key 'zügein' not present"
******* Article number: 2013 **********
"Key 'maskenpflicht' not present"
"Key 'coronaimmunitätsstudie' not present"
******* Article number: 2014 **********
"Key 'covid19' not present"
******* Article number: 2015 **********
******* Article number: 2016 **********
******* Article number: 2017 **********
"Key 'saynwittgenstein' not present"
******* Article number: 2018 **********
"Key 'kampfdelfine' not present"
"Key 'krimdelfine' not present"
******* Article number: 2019 **********
"Key 'allesdichtmachen' not present"
"Key 'coronakritik' not present"
******* Article number: 2020 **********
"Key 'anastasiabewegung' not present"
******* Article number: 2021 **********
******* Article number: 2022 **********
"Key 'allesdichtmachen' not present"
"

  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the 

******* Article number: 2168 **********
"Key 'anhusten' not present"
******* Article number: 2169 **********
******* Article number: 2170 **********
******* Article number: 2171 **********
******* Article number: 2172 **********
"Key 'stylingtipps' not present"
"Key 'dauerabpuderabo' not present"
******* Article number: 2173 **********
"Key '16' not present"
"Key 'escapetastehauptnavigation' not present"
"Key 'leertasteprodukte' not present"
******* Article number: 2174 **********
"Key 'weiterlesendieses' not present"
"Key 'nutzenauf' not present"
"Key 'spiegelinhalten' not present"
******* Article number: 2175 **********
******* Article number: 2176 **********
"Key 'waldorfschulendies' not present"
******* Article number: 2177 **********
"Key 'coronaimpfungen' not present"
******* Article number: 2178 **********
"Key 'eegumlage' not present"
"Key 'weltplus' not present"
******* Article number: 2179 **********
******* Article number: 2180 **********
"Key 'coronainzidenz' not present"
*

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




******* Article number: 2687 **********
"Key '60' not present"
"Key 'impftermin' not present"
******* Article number: 2688 **********
******* Article number: 2689 **********
"Key 'wissenschaftwie' not present"
******* Article number: 2690 **********
"Key 'coronakarten' not present"
******* Article number: 2691 **********
******* Article number: 2692 **********
"Key 'escapetastehauptnavigation' not present"
"Key 'leertasteprodukte' not present"
"Key 'excharitéchefvirologe' not present"
******* Article number: 2693 **********
******* Article number: 2694 **********
"Key 'nsuuntersuchung' not present"
******* Article number: 2695 **********
"Key 'rüstungsboom' not present"
"Key 'zügein' not present"
******* Article number: 2696 **********
"Key 'escapetastehauptnavigation' not present"
"Key 'leertasteprodukte' not present"
******* Article number: 2697 **********
"Key 'coronagetestete' not present"
******* Article number: 2698 **********
"Key 'covid19' not present"
******* Article number: 

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never 

******* Article number: 3104 **********
******* Article number: 3105 **********
"Key 'impfgipfel' not present"
"Key 'coronaleugner' not present"
******* Article number: 3106 **********
"Key 'coronapandemie' not present"
"Key 'lockdowns' not present"
"Key 'escapetastehauptnavigation' not present"
"Key 'leertasteprodukte' not present"
******* Article number: 3107 **********
"Key 'impfneid' not present"
******* Article number: 3108 **********
"Key 'walterborjans' not present"
******* Article number: 3109 **********
******* Article number: 3110 **********
"Key 'warmdie' not present"
******* Article number: 3111 **********
******* Article number: 3112 **********
******* Article number: 3113 **********
"Key 'coronamaskenpflicht' not present"
"Key 'maskenpflicht' not present"
"Key 'maskenurteils' not present"
******* Article number: 3114 **********
******* Article number: 3115 **********
"Key 'snewsletter' not present"
"Key 'spiegelde' not present"
******* Article number: 3116 **********
"Key

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["vec_sum_avg"][i] = vec_avg
  vec_temp = vecs.word_vec(df["keywords"][i][j])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never 

******* Article number: 809 **********
******* Article number: 810 **********
"word 'coronademo' not in vocabulary"
"word 'coronapolitik' not in vocabulary"
******* Article number: 811 **********
"word 'demoverbot' not in vocabulary"
******* Article number: 812 **********
"word 'greenpeaceaktivisten' not in vocabulary"
"word '2038' not in vocabulary"
******* Article number: 813 **********
"word '308' not in vocabulary"
"word 'maskenpflicht' not in vocabulary"
"word 'coronavirusnews' not in vocabulary"
******* Article number: 814 **********
"word 'reichstagstreppe' not in vocabulary"
******* Article number: 815 **********
"word 'coronademonstration' not in vocabulary"
******* Article number: 816 **********
"word 'coronademo' not in vocabulary"
"word 'demoblog' not in vocabulary"
******* Article number: 817 **********
"word '200' not in vocabulary"
"word 'überfordertpolitiker' not in vocabulary"
"word 'reichstagstreppe' not in vocabulary"
"word 'coronagroßdemo' not in vocabulary"
*******

******* Article number: 1045 **********
"word 'coronademonstration' not in vocabulary"
******* Article number: 1046 **********
"word 'coronainfektion' not in vocabulary"
"word 'robertkochinstitut' not in vocabulary"
"word 'covid19' not in vocabulary"
******* Article number: 1047 **********
"word 'coronademonstration' not in vocabulary"
******* Article number: 1048 **********
"word 'coronademo' not in vocabulary"
"word 'freiheitlichdemokratischen' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
******* Article number: 1049 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1050 **********
"word '160' not in vocabulary"
******* Article number: 1051 **********
"word 'coronamaßnahmen' not in vocabulary"
"word 'coronademonstrationen' not in vocabulary"
******* Article number: 1052 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' no

******* Article number: 1296 **********
"word '3439' not in vocabulary"
"word 'siebentageinzidenz' not in vocabulary"
"word 'todesfälleunter' not in vocabulary"
"word 'intensivbettenampel' not in vocabulary"
******* Article number: 1297 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1298 **********
"word 'politikerranking' not in vocabulary"
"word 'coronakrise' not in vocabulary"
******* Article number: 1299 **********
"word 'coronapandemie' not in vocabulary"
"word 'hochzeitenso' not in vocabulary"
******* Article number: 1300 **********
"word '3230' not in vocabulary"
******* Article number: 1301 **********
"word 'coronaviruspandemie' not in vocabulary"
******* Article number: 1302 **********
"word 'qanon' not in vocabulary"
"word 'verschwörungsgläubigen' not in vocabulary"
"word 'qanonanhänger' not in vocabulary"
******* Article number: 1303 **********
"word '2' not in vocabulary"
"wor

******* Article number: 1439 **********
******* Article number: 1440 **********
"word 'passiertim' not in vocabulary"
"word 'isrückkehrer' not in vocabulary"
******* Article number: 1441 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1442 **********
"word 'covid19toten' not in vocabulary"
"word '97' not in vocabulary"
"word 'coronavirustoten' not in vocabulary"
"word 'covid19' not in vocabulary"
******* Article number: 1443 **********
"word 'saibous' not in vocabulary"
"word 'coronademo' not in vocabulary"
******* Article number: 1444 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1445 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1446 **********
"word 'wollenvon' not in vocabulary"
"word 'sta

******* Article number: 1607 **********
"word '1918' not in vocabulary"
"word '1917' not in vocabulary"
******* Article number: 1608 **********
******* Article number: 1609 **********
"word 'covid19' not in vocabulary"
******* Article number: 1610 **********
"word '40000' not in vocabulary"
"word '10000' not in vocabulary"
******* Article number: 1611 **********
******* Article number: 1612 **********
"word 'frontkämpferarzt' not in vocabulary"
******* Article number: 1613 **********
******* Article number: 1614 **********
******* Article number: 1615 **********
"word 'coronamaßnahmen' not in vocabulary"
******* Article number: 1616 **********
******* Article number: 1617 **********
"word 'terrassenplätze' not in vocabulary"
"word 'restaurantsterben' not in vocabulary"
"word 'heizpilze' not in vocabulary"
******* Article number: 1618 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1619 ***

"word 'unternehmertumsein' not in vocabulary"
"word 'maskenproduktion' not in vocabulary"
"word 'whatsappnachrichten' not in vocabulary"
******* Article number: 1803 **********
"word '10' not in vocabulary"
******* Article number: 1804 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1805 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1806 **********
******* Article number: 1807 **********
******* Article number: 1808 **********
"word 'uswahlrecht' not in vocabulary"
******* Article number: 1809 **********
"word 'medikamentenklau' not in vocabulary"
******* Article number: 1810 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1811 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in v

******* Article number: 1947 **********
******* Article number: 1948 **********
"word '30000' not in vocabulary"
******* Article number: 1949 **********
******* Article number: 1950 **********
"word 'bertelsmannstudie' not in vocabulary"
"word 'spdexperte' not in vocabulary"
******* Article number: 1951 **********
"word 'coronabeschränkungen' not in vocabulary"
******* Article number: 1952 **********
"word 'allesdichtmachen' not in vocabulary"
"word 'youtubeum' not in vocabulary"
******* Article number: 1953 **********
"word 'weiterlesenklicken' not in vocabulary"
"word 'werdenhier' not in vocabulary"
"word 'spiegelzugang' not in vocabulary"
"word 'genutztspiegel' not in vocabulary"
******* Article number: 1954 **********
"word 'coronakritik' not in vocabulary"
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 1955 **********
"word 'mdrrauswurf' not in vocabulary"
******* Article number: 1956 **********
******* Article number: 1957 **********
"word 'zeitde' not in voc

******* Article number: 2047 **********
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 2048 **********
"word 'ardsatire' not in vocabulary"
******* Article number: 2049 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2050 **********
"word 'westenindien' not in vocabulary"
******* Article number: 2051 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2052 **********
"word 'iwfchefin' not in vocabulary"
"word '2007' not in vocabulary"
******* Article number: 2053 **********
"word 'polizeimitarbeiterin' not in vocabulary"
******* Article number: 2054 **********
"word 'coronaopfern' not in vocabulary"
******* Article number: 2055 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 2056 **********
"word 'ausbildungsgehälter' not in vocabulary"
"word 'mind

******* Article number: 2167 **********
"word 'siebentageinzidenz' not in vocabulary"
******* Article number: 2168 **********
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 2169 **********
"word 'allesschlichtmachen' not in vocabulary"
******* Article number: 2170 **********
******* Article number: 2171 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2172 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2173 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2174 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2175 **********
"word 'zahlenkosmetik' not in vocabulary"
******* Article n

******* Article number: 2365 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 2366 **********
******* Article number: 2367 **********
"word 'sterbefallzahlen' not in vocabulary"
******* Article number: 2368 **********
"word 'coronakrise' not in vocabulary"
******* Article number: 2369 **********
******* Article number: 2370 **********
"word 'doppelmutante' not in vocabulary"
******* Article number: 2371 **********
******* Article number: 2372 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2373 **********
******* Article number: 2374 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2375 **********
"word 'winterwelle' not in vocabulary"
******* Article number: 2376 **********
"word 'lockdowns' not in vocabulary"
******* Article number: 2377 **********
******* Article 

"word 'fdjvergangenheit' not in vocabulary"
******* Article number: 2523 **********
"word 'stattdie' not in vocabulary"
******* Article number: 2524 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2525 **********
"word 'lockdowns' not in vocabulary"
"word 'wissenschaftsleugnung' not in vocabulary"
******* Article number: 2526 **********
"word 'unglücklichder' not in vocabulary"
"word 'ändernerzbistum' not in vocabulary"
******* Article number: 2527 **********
"word 'querdenkendemos' not in vocabulary"
"word '21000' not in vocabulary"
******* Article number: 2528 **********
"word 'coronakrise' not in vocabulary"
******* Article number: 2529 **********
"word 'biontech' not in vocabulary"
"word 'b1351' not in vocabulary"
******* Article number: 2530 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number:

"word 'genutztspiegel' not in vocabulary"
******* Article number: 2661 **********
"word 'milliardengewinn' not in vocabulary"
******* Article number: 2662 **********
"word 'tabakverbot' not in vocabulary"
******* Article number: 2663 **********
"word 'überhauptbayern' not in vocabulary"
"word 'teilimpfpflicht' not in vocabulary"
******* Article number: 2664 **********
"word 'coronalage' not in vocabulary"
******* Article number: 2665 **********
******* Article number: 2666 **********
"word 'winterwelle' not in vocabulary"
******* Article number: 2667 **********
"word 'querdenkenbewegung' not in vocabulary"
"word 'querdenkerdemo' not in vocabulary"
******* Article number: 2668 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2669 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2670 **********
"w

******* Article number: 2817 **********
"word '32' not in vocabulary"
******* Article number: 2818 **********
"word 'coronatests' not in vocabulary"
******* Article number: 2819 **********
"word 'covid19' not in vocabulary"
******* Article number: 2820 **********
"word 'csupolitiker' not in vocabulary"
"word 'coronaschnelltest' not in vocabulary"
"word '300000' not in vocabulary"
******* Article number: 2821 **********
"word 'coronahotspot' not in vocabulary"
******* Article number: 2822 **********
******* Article number: 2823 **********
"word 'miniherbergen' not in vocabulary"
******* Article number: 2824 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2825 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2826 **********
"word 'maskenpflicht' not in vocabulary"
"word 'coronaregeln' not in voca

******* Article number: 2984 **********
"word 'vätermonate' not in vocabulary"
******* Article number: 2985 **********
******* Article number: 2986 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2987 **********
"word 'ausrüstungsmangel' not in vocabulary"
******* Article number: 2988 **********
******* Article number: 2989 **********
******* Article number: 2990 **********
"word 'maskenpflicht' not in vocabulary"
******* Article number: 2991 **********
"word 'coronamaßnahmen' not in vocabulary"
******* Article number: 2992 **********
"word 'coronainfektion' not in vocabulary"
******* Article number: 2993 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2994 **********
"word 'selbstentlastungszeuge' not in vocabulary"
"word 'wirecarduntersuchungsausschuss' not in vocabulary"
******* Article num

******* Article number: 3085 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3086 **********
"word 'querdenkerdemos' not in vocabulary"
"word 'bundesnotbremse' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
"word 'zusammengetragentagesspiegelunser' not in vocabulary"
"word '250' not in vocabulary"
******* Article number: 3087 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3088 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3089 **********
"word 'querdenkerdemos' not in vocabulary"
"word 'bundesnotbremse' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
"word 'zusammengetragentagesspiegelunser' not in vocabulary"
"word '250' not in vocabulary"
******* Article number: 309

******* Article number: 3172 **********
"word 'coronademos' not in vocabulary"
"word 'querdenkerprotest' not in vocabulary"
******* Article number: 3173 **********
"word 'coronademos' not in vocabulary"
"word 'querdenkerprotest' not in vocabulary"
******* Article number: 3174 **********
"word 'siebentageinzidenz' not in vocabulary"
"word 'rkiprognose' not in vocabulary"
******* Article number: 3175 **********
"word 'querdenkendemos' not in vocabulary"
******* Article number: 3176 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3177 **********
"word 'winterwelle' not in vocabulary"
******* Article number: 3178 **********
"word 'netzwerkertelegram' not in vocabulary"
"word 'telegramgruppen' not in vocabulary"
"word 'tonlinerecherchen' not in vocabulary"
******* Article number: 3179 **********
"word 'querdenkendemos' not in vocabulary"
******* Article number: 3180 **********
"word 'querdenken

******* Article number: 3279 **********
"word 'coronapandemie' not in vocabulary"
"word 'hochzeitenso' not in vocabulary"
******* Article number: 3280 **********
"word 'coronapandemie' not in vocabulary"
"word 'hochzeitenso' not in vocabulary"
******* Article number: 3281 **********
"word 'oscarbeitrager' not in vocabulary"
"word 'sciencefictionfilm' not in vocabulary"
"word 'oscarbewerbung' not in vocabulary"
******* Article number: 3282 **********
******* Article number: 3283 **********
"word 'impfgipfel' not in vocabulary"
******* Article number: 3284 **********
"word 'coronaschulpolitik' not in vocabulary"
******* Article number: 3285 **********
"word 'coronapolitik' not in vocabulary"
******* Article number: 3286 **********
"word '130000' not in vocabulary"
******* Article number: 3287 **********
"word 'lucaapp' not in vocabulary"
"word 'kontaktnachverfolgung' not in vocabulary"
"word 'lucasystem' not in vocabulary"
******* Article number: 3288 **********
"word 'impfgipfel' not in

******* Article number: 3405 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3406 **********
"word 'pandemiebeginn' not in vocabulary"
"word '100000' not in vocabulary"
******* Article number: 3407 **********
"word 'verfassungabtreibungen' not in vocabulary"
"word 'usstaaten' not in vocabulary"
******* Article number: 3408 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3409 **********
"word 'coronaopfer' not in vocabulary"
******* Article number: 3410 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 3411 **********
"word 'coronahochburg' not in vocabulary"
******* Article number: 3412 **********
"word 'exfdppolitiker' not in vocabulary"
******* Article number: 3413 **********
"word 'maasmitarbeiterin' not in vocabulary"
******* Article number: 3414 **********
"word

******* Article number: 3514 **********
"word '19jährigen' not in vocabulary"
"word 'qosay' not in vocabulary"
******* Article number: 3515 **********
"word 'tagesschaude' not in vocabulary"
******* Article number: 3516 **********
******* Article number: 3517 **********
"word 'kinderhass' not in vocabulary"
******* Article number: 3518 **********
******* Article number: 3519 **********
"word 'querdenkendemo' not in vocabulary"
******* Article number: 3520 **********
"word 'querdenkenbewegung' not in vocabulary"
******* Article number: 3521 **********
******* Article number: 3522 **********
"word 'coronaschwindel' not in vocabulary"
"word 'verschwörungslegenden' not in vocabulary"
******* Article number: 3523 **********
"word 'genderdoppelpunkt' not in vocabulary"
"word 'gendersternchen' not in vocabulary"
******* Article number: 3524 **********
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 3525 **********
"word 'querdenkerpartei' not in vocabulary"
"word 'allesdic

******* Article number: 3615 **********
******* Article number: 3616 **********
******* Article number: 3617 **********
"word 'coronawelle' not in vocabulary"
"word 'intensivpflegekräfte' not in vocabulary"
******* Article number: 3618 **********
"word 'maskenurteil' not in vocabulary"
"word 'maskenpflicht' not in vocabulary"
******* Article number: 3619 **********
******* Article number: 3620 **********
"word 'infektionsausbruch' not in vocabulary"
"word 'r1' not in vocabulary"
"word 'uspflegeheim' not in vocabulary"
******* Article number: 3621 **********
"word '200' not in vocabulary"
"word 'coronamaßnahmen' not in vocabulary"
"word 'schulnotbremse' not in vocabulary"
******* Article number: 3622 **********
"word '11644' not in vocabulary"
"word 'coronaneuinfektionen' not in vocabulary"
******* Article number: 3623 **********
******* Article number: 3624 **********
******* Article number: 3625 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werb

******* Article number: 3716 **********
******* Article number: 3717 **********
"word 'loveparadekatastrophe' not in vocabulary"
******* Article number: 3718 **********
"word '274441' not in vocabulary"
******* Article number: 3719 **********
"word 'coronamaßnahmen' not in vocabulary"
"word 'querfrontdemonstration' not in vocabulary"
******* Article number: 3720 **********
******* Article number: 3721 **********
"word 'coronakrise' not in vocabulary"
******* Article number: 3722 **********
"word 'afdabgeordneten' not in vocabulary"
******* Article number: 3723 **********
"word 'coronademo' not in vocabulary"
"word 'demoblog' not in vocabulary"
******* Article number: 3724 **********
"word 'klimastreik' not in vocabulary"
******* Article number: 3725 **********
"word 'coronapolitik' not in vocabulary"
******* Article number: 3726 **********
"word '2012' not in vocabulary"
******* Article number: 3727 **********
"word 'coronabeschränkungen' not in vocabulary"
******* Article number: 3728

In [23]:
df.isna().sum()

expanded_url       0
domain             0
tag             3578
title              0
text               0
summary            0
keywords           0
vec_sum_avg        0
dtype: int64

In [24]:
df.sample(5)

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords,vec_sum_avg
2465,https://www.welt.de/politik/deutschland/articl...,welt.de,,Chefvirologe Drosten: „Die PCR-Tests für Sars-...,Christian Drosten äußert sich gegenüber WELT z...,Frage Im Blick auf die PCR-Testung bin ich auf...,"[sarscov2, pcr, validiert, pcrtests, pcrtestun...","[-0.22553714, 0.45476642, -0.19519208, -0.2506..."
41,https://www.spiegel.de/wissenschaft/medizin/co...,spiegel.de,,"Corona und Schul-Öffnungen: ""Meist tragen die ...",Johannes Hübner: Da gibt es aus meiner Sicht k...,Johannes Hübner: Da gibt es aus meiner Sicht k...,"[schulöffnungen, zeit, corona, virus, kinder, ...","[-0.20359243, 0.019481739, -0.19085144, -0.249..."
349,https://www.zeit.de/2015/37/bayern-csu-markus-...,zeit.de,,"CSU: ""Markus Söder sah sich immer in Konkurren...","In den Zeitungen wird Söder als ""smarter Flach...","In den Zeitungen wird Söder als ""smarter Flach...","[seehofer, stammen, gruppe, schwächen, zerfres...","[-0.15644912, 0.27448708, -0.25889215, -0.2983..."
1010,https://www.welt.de/politik/article214305266/K...,welt.de,,Koalitionsausschuss: Einigung auf Verkleinerun...,Nach achtstündigen Verhandlungen einigen sich ...,Nach achtstündigen Verhandlungen einigen sich ...,"[bundestages, coronamaßnahmen, 2021, wahlrecht...","[-0.1166282, 0.2793043, -0.028726127, -0.56611..."
851,https://www.tagesspiegel.de/berlin/corona-demo...,tagesspiegel.de,,Demo-Blog für Berlin: Ermittlungen gegen Poliz...,Im Nachgang zu den Demonstrationen von Gegnern...,Im Nachgang zu den Demonstrationen von Gegnern...,"[ziehen, polizei, ermittlungen, coronademo, po...","[-0.25588098, 0.11098828, -0.29753146, -0.2453..."


# 3. Modeling

In [25]:
from sklearn.cluster import KMeans

X = pd.DataFrame(df['vec_sum_avg'].to_list())
clusters = KMeans(n_clusters=2, random_state=0).fit_predict(X)

In [26]:
df['cluster'] = clusters
df.sample(5)

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords,vec_sum_avg,cluster
1871,https://www.tagesspiegel.de/wissen/vom-hotspot...,tagesspiegel.de,,Vom Hotspot zum Vorbild: So gelang Portugal di...,Es war ein Bild mit Symbolcharakter: 24 Soldat...,Empfohlener redaktioneller Inhalt An dieser St...,"[portugal, coronakehrtwende, januar, gelang, l...","[-0.29549357, 0.09161783, -0.15772854, -0.2487...",0
3257,https://www.zeit.de/gesundheit/2021-04/corona-...,zeit.de,,Corona-Impfstoffe: Ist der Vektor das Problem?,Bis Ende Juni erwartet die Europäische Union 5...,Bis Ende Juni erwartet die Europäische Union 5...,"[antikörper, fälle, april, millionen, seltenen...","[-0.18002833, 0.16217835, -0.22493042, -0.1895...",0
390,https://www.tagesspiegel.de/politik/bip-sinkt-...,tagesspiegel.de,,"BIP sinkt um 32,9 Prozent: Historischer Konjun...",In den USA ist die Wirtschaftsleistung im zwei...,In den USA ist die Wirtschaftsleistung im zwei...,"[wirtschaft, jahr, pandemie, sinkt, 329, histo...","[-0.32075846, 0.168832, -0.19110593, -0.327525...",0
1229,https://www.tagesspiegel.de/berlin/coronavirus...,tagesspiegel.de,,Coronavirus in der Hauptstadt: Brandenburg ver...,Gesundheitssenatorin Gote erwartet Maskenpflic...,Gesundheitssenatorin Gote erwartet Maskenpflic...,"[lage, coronavirus, virusvariante, maskenpflic...","[-0.27166435, 0.086117305, -0.13985954, -0.307...",0
1832,https://www.spiegel.de/politik/deutschland/wir...,spiegel.de,,Corona: »Wir debattieren über Schulschließunge...,Während der Schulschließungen im Frühjahr wurd...,Während der Schulschließungen im Frühjahr wurd...,"[blase, corona, kinder, schulschließungen, bil...","[-0.12988614, 0.07425252, -0.13965866, -0.2225...",1


In [27]:
df0 = df[df.cluster==0]
df1 = df[df.cluster==1]
print("Cluster 0 size = "+str(len(df0)))
print("Cluster 1 size = "+str(len(df1)))

Cluster 0 size = 1628
Cluster 1 size = 1950
