### Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import pandas_profiling

### Import dataset

In [2]:
file_path = os.path.join("data", "twitter_URLs.bin")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,expanded_url,domain,tag
0,https://twitter.com/kjh_mov/status/12996294579...,twitter.com,
1,https://twitter.com/kjh_mov/status/12996294579...,twitter.com,
2,https://twitter.com/kjh_mov/status/12996294579...,twitter.com,
3,http://ntv.de,ntv.de,
4,https://twitter.com/kjh_mov/status/12996294506...,twitter.com,


# 1. Data Understanding

In [3]:
df.describe()

Unnamed: 0,expanded_url,domain,tag
count,401411,401411,27904
unique,286502,9948,9
top,https://support.twitter.com/articles/20169199,twitter.com,Youtube
freq,623,288363,25311


In [4]:
df.dtypes

expanded_url    object
domain          object
tag             object
dtype: object

In [5]:
df["domain"].value_counts()

twitter.com                    288363
youtu.be                        14964
youtube.com                     10339
welt.de                          1836
tagesspiegel.de                  1804
                                ...  
legis.ga.gov                        1
rentenreform-alternative.de         1
anthroweb.info                      1
cornelsen.de                        1
man.it                              1
Name: domain, Length: 9948, dtype: int64

In [6]:
df.profile_report()

Summarize dataset:   0%|          | 0/17 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# 2. Data Processing

### 2.1 Extract news links

In [7]:
df = df.loc[(df.domain == "welt.de") | (df.domain == "tagesspiegel.de") | (df.domain == "tagesschau.de") | (df.domain == "spiegel.de") | (df.domain == "zeit.de")]
df.describe()

Unnamed: 0,expanded_url,domain,tag
count,8101,8101,0.0
unique,3804,5,0.0
top,https://www.tagesspiegel.de/themen/reportage/q...,welt.de,
freq,131,1836,


### 2.2 drop duplicate rows and reset index

In [8]:
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,expanded_url,domain,tag
count,3804,3804,0.0
unique,3804,5,0.0
top,https://www.welt.de/debatte/kommentare/plus215...,spiegel.de,
freq,1,848,


### 2.3 extract title, text, summary and keywords

In [None]:
# !pip3 install newspaper3k

In [9]:
from newspaper import Article

#create new columns for article title, text, summary and keywords
df["title"] = np.nan
df["text"] = np.nan
df["summary"] = np.nan
df["keywords"] = np.nan

#extract title, text, summary and keywords for each article and copy to corresponding row-column
url_list = df.expanded_url
for i in range(0,len(url_list)):
    print("Article number: "+str(i))
    try:
        article = Article(url_list[i], language="de")
        article.download()
        article.parse()
        article.nlp()
    
        df["title"][i] = article.title
        df["text"][i] = article.text
        df["summary"][i] = article.summary
        df["keywords"][i] = article.keywords
    except Exception as e:
        print(e)

Article number: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title"][i] = article.title
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"][i] = article.text
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["summary"][i] = article.summary
A value is trying to be 

Article number: 1
Article number: 2
Article number: 3
Article number: 4
Article number: 5
Article number: 6
Article number: 7
Article number: 8
Article number: 9
Article number: 10
Article number: 11
Article number: 12
Article number: 13
Article number: 14
Article number: 15
Article number: 16
Article number: 17
Article number: 18
Article number: 19
Article number: 20
Article number: 21
Article number: 22
Article number: 23
Article number: 24
Article number: 25
Article number: 26
Article number: 27
Article number: 28
Article number: 29
Article number: 30
Article number: 31
Article number: 32
Article number: 33
Article number: 34
Article number: 35
Article number: 36
Article number: 37
Article number: 38
Article number: 39
Article number: 40
Article number: 41
Article number: 42
Article number: 43
Article number: 44
Article number: 45
Article number: 46
Article number: 47
Article number: 48
Article number: 49
Article number: 50
Article number: 51
Article number: 52
Article number: 53
Ar

Article number: 357
Article number: 358
Article number: 359
Article number: 360
Article number: 361
Article number: 362
Article number: 363
Article number: 364
Article number: 365
Article number: 366
Article number: 367
Article number: 368
Article number: 369
Article number: 370
Article number: 371
Article number: 372
Article number: 373
Article number: 374
Article number: 375
Article number: 376
Article number: 377
Article number: 378
Article number: 379
Article number: 380
Article number: 381
Article number: 382
Article number: 383
Article number: 384
Article number: 385
Article number: 386
Article number: 387
Article number: 388
Article number: 389
Article number: 390
Article number: 391
Article number: 392
Article number: 393
Article number: 394
Article number: 395
Article number: 396
Article number: 397
Article number: 398
Article number: 399
Article number: 400
Article number: 401
Article number: 402
Article number: 403
Article number: 404
Article number: 405
Article number: 406


Article number: 759
Article number: 760
Article number: 761
Article number: 762
Article number: 763
Article number: 764
Article number: 765
Article number: 766
Article number: 767
Article number: 768
Article number: 769
Article number: 770
Article number: 771
Article number: 772
Article number: 773
Article number: 774
Article number: 775
Article number: 776
Article number: 777
Article number: 778
Article number: 779
Article number: 780
Article number: 781
Article number: 782
Article number: 783
Article number: 784
Article number: 785
Article number: 786
Article number: 787
Article number: 788
Article number: 789
Article number: 790
Article number: 791
Article number: 792
Article number: 793
Article number: 794
Article number: 795
Article number: 796
Article number: 797
Article number: 798
Article number: 799
Article number: 800
Article number: 801
Article number: 802
Article number: 803
Article number: 804
Article number: 805
Article number: 806
Article number: 807
Article number: 808


Article number: 1117
Article number: 1118
Article number: 1119
Article number: 1120
Article number: 1121
Article number: 1122
Article number: 1123
Article number: 1124
Article number: 1125
Article number: 1126
Article number: 1127
Article number: 1128
Article number: 1129
Article number: 1130
Article number: 1131
Article number: 1132
Article number: 1133
Article number: 1134
Article number: 1135
Article number: 1136
Article number: 1137
Article number: 1138
Article number: 1139
Article number: 1140
Article number: 1141
Article number: 1142
Article number: 1143
Article number: 1144
Article number: 1145
Article number: 1146
Article number: 1147
Article number: 1148
Article number: 1149
Article number: 1150
Article number: 1151
Article number: 1152
Article number: 1153
Article number: 1154
Article number: 1155
Article number: 1156
Article number: 1157
Article number: 1158
Article number: 1159
Article number: 1160
Article number: 1161
Article number: 1162
Article number: 1163
Article numbe

Article number: 1485
Article number: 1486
Article number: 1487
Article number: 1488
Article number: 1489
Article number: 1490
Article number: 1491
Article number: 1492
Article number: 1493
Article number: 1494
Article number: 1495
Article number: 1496
Article number: 1497
Article number: 1498
Article number: 1499
Article number: 1500
Article number: 1501
Article number: 1502
Article number: 1503
Article number: 1504
Article number: 1505
Article number: 1506
Article number: 1507
Article number: 1508
Article number: 1509
Article number: 1510
Article number: 1511
Article number: 1512
Article number: 1513
Article number: 1514
Article number: 1515
Article number: 1516
Article number: 1517
Article number: 1518
Article number: 1519
Article number: 1520
Article number: 1521
Article number: 1522
Article number: 1523
Article number: 1524
Article number: 1525
Article number: 1526
Article number: 1527
Article number: 1528
Article number: 1529
Article number: 1530
Article number: 1531
Article `down

Article number: 1833
Article number: 1834
Article number: 1835
Article `download()` failed with 404 Client Error: Not Found for url: https://www.tagesschau.de:443/inland/privile/ on URL http://tagesschau.de/inland/privile
Article number: 1836
Article number: 1837
Article number: 1838
Article number: 1839
Article number: 1840
Article number: 1841
Article number: 1842
Article number: 1843
Article number: 1844
Article number: 1845
Article number: 1846
Article number: 1847
Article number: 1848
Article number: 1849
Article number: 1850
Article number: 1851
Article number: 1852
Article number: 1853
Article number: 1854
Article number: 1855
Article number: 1856
Article number: 1857
Article number: 1858
Article number: 1859
Article number: 1860
Article number: 1861
Article number: 1862
Article number: 1863
Article number: 1864
Article number: 1865
Article number: 1866
Article number: 1867
Article number: 1868
Article number: 1869
Article number: 1870
Article number: 1871
Article number: 1872
A

Article number: 2181
Article number: 2182
Article number: 2183
Article number: 2184
Article number: 2185
Article number: 2186
Article number: 2187
Article number: 2188
Article number: 2189
Article number: 2190
Article number: 2191
Article number: 2192
Article number: 2193
Article number: 2194
Article number: 2195
Article number: 2196
Article number: 2197
Article number: 2198
Article number: 2199
Article number: 2200
Article number: 2201
Article number: 2202
Article `download()` failed with 404 Client Error: Not Found for url: https://www.tagesspiegel.de/meinung/verunglueckte-netz-kampagne-alles-dicht-machen-ist-so-schaebig-dass-es-weh-tut/27123990.html on URL https://www.tagesspiegel.de/meinung/verunglueckte-netz-kampagne-alles-dicht-machen-ist-so-schaebig-dass-es-weh-tut/27123990.html
Article number: 2203
Article `download()` failed with 404 Client Error: Not Found for url: https://m.tagesspiegel.de/meinung/verunglueckte-netz-kampagne-alles-dicht-machen-ist-so-schaebig-dass-es-weh-tut

Article number: 2491
Article number: 2492
Article number: 2493
Article number: 2494
Article number: 2495
Article number: 2496
Article number: 2497
Article number: 2498
Article number: 2499
Article number: 2500
Article number: 2501
Article number: 2502
Article number: 2503
Article number: 2504
Article number: 2505
Article number: 2506
Article number: 2507
Article number: 2508
Article number: 2509
Article number: 2510
Article number: 2511
Article number: 2512
Article number: 2513
Article number: 2514
Article number: 2515
Article number: 2516
Article number: 2517
Article number: 2518
Article number: 2519
Article number: 2520
Article number: 2521
Article number: 2522
Article number: 2523
Article number: 2524
Article number: 2525
Article number: 2526
Article `download()` failed with 404 Client Error: Not Found for url: https://www.tagesschau.de/sport/sportschau/sportschau-story-39467.html on URL https://www.tagesschau.de/sport/sportschau/sportschau-story-39467.html
Article number: 2527
Arti

Article number: 2860
Article number: 2861
Article number: 2862
Article number: 2863
Article number: 2864
Article number: 2865
Article number: 2866
Article number: 2867
Article number: 2868
Article number: 2869
Article number: 2870
Article number: 2871
Article number: 2872
Article number: 2873
Article number: 2874
Article number: 2875
Article number: 2876
Article number: 2877
Article number: 2878
Article number: 2879
Article number: 2880
Article number: 2881
Article number: 2882
Article number: 2883
Article number: 2884
Article number: 2885
Article number: 2886
Article number: 2887
Article number: 2888
Article number: 2889
Article number: 2890
Article number: 2891
Article number: 2892
Article number: 2893
Article number: 2894
Article number: 2895
Article number: 2896
Article number: 2897
Article number: 2898
Article number: 2899
Article number: 2900
Article number: 2901
Article number: 2902
Article number: 2903
Article number: 2904
Article number: 2905
Article number: 2906
Article numbe

Article number: 3195
Article number: 3196
Article number: 3197
Article number: 3198
Article number: 3199
Article number: 3200
Article number: 3201
Article number: 3202
Article number: 3203
Article number: 3204
Article number: 3205
Article number: 3206
Article number: 3207
Article number: 3208
Article number: 3209
Article number: 3210
Article number: 3211
Article number: 3212
Article number: 3213
Article number: 3214
Article number: 3215
Article number: 3216
Article number: 3217
Article number: 3218
Article number: 3219
Article number: 3220
Article number: 3221
Article number: 3222
Article number: 3223
Article number: 3224
Article number: 3225
Article number: 3226
Article number: 3227
Article number: 3228
Article number: 3229
Article number: 3230
Article number: 3231
Article number: 3232
Article number: 3233
Article number: 3234
Article number: 3235
Article number: 3236
Article number: 3237
Article number: 3238
Article number: 3239
Article number: 3240
Article number: 3241
Article numbe

Article number: 3556
Article number: 3557
Article number: 3558
Article number: 3559
Article number: 3560
Article number: 3561
Article number: 3562
Article number: 3563
Article number: 3564
Article number: 3565
Article number: 3566
Article number: 3567
Article number: 3568
Article number: 3569
Article number: 3570
Article number: 3571
Article number: 3572
Article number: 3573
Article number: 3574
Article number: 3575
Article number: 3576
Article number: 3577
Article number: 3578
Article number: 3579
Article number: 3580
Article number: 3581
Article number: 3582
Article number: 3583
Article number: 3584
Article number: 3585
Article number: 3586
Article number: 3587
Article number: 3588
Article number: 3589
Article number: 3590
Article number: 3591
Article number: 3592
Article number: 3593
Article number: 3594
Article number: 3595
Article number: 3596
Article number: 3597
Article number: 3598
Article number: 3599
Article number: 3600
Article number: 3601
Article number: 3602
Article numbe

In [10]:
df_bkp = df.copy()
df_bkp.tail()

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords
3799,https://www.welt.de/regionales/hamburg/article...,welt.de,,Polizei startet sechste G20-Öffentlichkeitsfah...,Anzeige\n\nHamburg (dpa/lno) - Zwei Jahre nach...,AnzeigeHamburg (dpa/lno) - Zwei Jahre nach den...,"[polizei, startet, jahre, sechste, fotos, 2017..."
3800,https://www.zeit.de/gesellschaft/zeitgeschehen...,zeit.de,,Lesen Sie zeit.de mit Werbung oder im PUR-Abo....,zeit.de mit Werbung\n\nBesuchen Sie zeit.de wi...,zeit.de mit WerbungBesuchen Sie zeit.de wie ge...,"[zeitde, werbung, purabo, tracking, lesen, gew..."
3801,https://www.welt.de/politik/deutschland/articl...,welt.de,,Demonstrationen in Berlin: Polizei will aktiv ...,Dass bei Demonstrationen gegen Trumps Jerusale...,Dass bei Demonstrationen gegen Trumps Jerusale...,"[tor, israelflaggen, strafbar, verhindern, ver..."
3802,https://www.spiegel.de/panorama/justiz/us-panz...,spiegel.de,,Sachsen: Polizei stoppt Konvoi mit US-Panzern ...,Eine Streife der Verkehrspolizei hat auf der A...,Eine Streife der Verkehrspolizei hat auf der A...,"[gefehlt, stoppt, konvoi, erforderlichen, sach..."
3803,https://www.tagesspiegel.de/images/fdp_keinesa...,tagesspiegel.de,,,,,[]


In [11]:
df.isna().sum()

expanded_url       0
domain             0
tag             3804
title             28
text              28
summary           28
keywords          28
dtype: int64

### 2.4 removing record if keywords field is null

In [12]:
#remove all rows which has no keywords and reset index
df = df[~df['keywords'].isnull()]
df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords
count,3776,3776,0.0,3776,3776,3776,3776
unique,3776,5,0.0,2126,2094,2079,2192
top,https://www.welt.de/debatte/kommentare/plus215...,spiegel.de,,Lesen Sie zeit.de mit Werbung oder im PUR-Abo....,zeit.de mit Werbung\n\nBesuchen Sie zeit.de wi...,zeit.de mit WerbungBesuchen Sie zeit.de wie ge...,"[zeitde, werbung, purabo, tracking, lesen, gew..."
freq,1,838,,640,640,640,640


In [13]:
df.isna().sum()

expanded_url       0
domain             0
tag             3776
title              0
text               0
summary            0
keywords           0
dtype: int64

In [14]:
df.expanded_url.nunique()

3776

### 2.5 removing rows with .pdf/.jpg/.png links

In [15]:
#add code for removing pdf links
df_bkp_2 = df.copy()

df=df[~((df.expanded_url.str.endswith(".pdf")) | (df.expanded_url.str.endswith(".jpg")) | (df.expanded_url.str.endswith(".png")))]
#reset index
df = df.reset_index(drop=True)
df.describe()

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords
count,3760,3760,0.0,3760,3760,3760,3760
unique,3760,5,0.0,2125,2094,2079,2191
top,https://www.welt.de/debatte/kommentare/plus215...,spiegel.de,,Lesen Sie zeit.de mit Werbung oder im PUR-Abo....,zeit.de mit Werbung\n\nBesuchen Sie zeit.de wi...,zeit.de mit WerbungBesuchen Sie zeit.de wie ge...,"[zeitde, werbung, purabo, tracking, lesen, gew..."
freq,1,838,,639,639,639,639


In [16]:
df.isna().sum()

expanded_url       0
domain             0
tag             3760
title              0
text               0
summary            0
keywords           0
dtype: int64

### 2.6 load library - word2vec

In [None]:
# !pip install --upgrade gensim

In [17]:
#load word2vec
from gensim.models import KeyedVectors
vecs = KeyedVectors.load_word2vec_format('data/wiki.de.vec', binary=False)

### 2.7 calculate vector average

In [18]:
df["vec_sum_avg"] = np.array

for i in range(len(df["vec_sum_avg"])):    
    print("******* Article number: "+str(i)+" **********")
    num = 0
    vec_sum = 0
    
    for j in range(len(df["keywords"][i])):
        try:
            vec_temp = vecs.word_vec(df["keywords"][i][j], use_norm=False)        
        except Exception as e:
            print(e)        
        else:
            num = num + 1
            vec_sum = vec_sum + vec_temp
            
    try:
        vec_avg = vec_sum/num
        df["vec_sum_avg"][i] = vec_avg
    except Exception as e:        
        print(e)    

******* Article number: 0 **********
"word 'asylzuwanderern' not in vocabulary"
******* Article number: 1 **********
******* Article number: 2 **********
"word 'zdfpolitbarometer' not in vocabulary"
"word '40' not in vocabulary"
******* Article number: 3 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 4 **********
"word 'seisselberg' not in vocabulary"
******* Article number: 5 **********
"word 'aidspandemie' not in vocabulary"
"word 'coronakrise' not in vocabulary"
"word 'aidskrise' not in vocabulary"
"word 'covid19' not in vocabulary"
******* Article number: 6 **********
******* Article number: 7 **********
"word 'afdflügel' not in vocabulary"
******* Article number: 8 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 9 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabul

******* Article number: 143 **********
"word 'coronademo' not in vocabulary"
******* Article number: 144 **********
"word 'coronakrise' not in vocabulary"
******* Article number: 145 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 146 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 147 **********
"word 'elitenverachtung' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
******* Article number: 148 **********
"word 'weiterlesenklicken' not in vocabulary"
"word 'werdenhier' not in vocabulary"
"word 'spiegelzugang' not in vocabulary"
"word 'genutztspiegel' not in vocabulary"
******* Article number: 149 **********
******* Article number: 150 **********
"word '908000' not in vocabulary"
"word 'krebsops' not in vocabulary"
******* Article number: 151 **********
"word '22500' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
"word '17' not in voc

******* Article number: 292 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 293 **********
"word 'nationalstolzdebatte' not in vocabulary"
******* Article number: 294 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 295 **********
******* Article number: 296 **********
******* Article number: 297 **********
"word 'spdwebsite' not in vocabulary"
******* Article number: 298 **********
"word 'coronapandemie' not in vocabulary"
"word 'coronakrise' not in vocabulary"
******* Article number: 299 **********
"word 'antirassismusdemo' not in vocabulary"
"word 'superspreadingevent' not in vocabulary"
******* Article number: 300 **********
"word 'laienjustiz' not in vocabulary"
******* Article number: 301 **********
"word 'piratenpolitikerin' not in vocabulary"
******* Article number: 302 **********
"word

******* Article number: 514 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 515 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 516 **********
"word '3439' not in vocabulary"
"word 'siebentageinzidenz' not in vocabulary"
"word 'todesfälleunter' not in vocabulary"
"word 'intensivbettenampel' not in vocabulary"
******* Article number: 517 **********
"word 'coronavirusverdacht' not in vocabulary"
******* Article number: 518 **********
"word 'coronapandemie' not in vocabulary"
"word 'hochzeitenso' not in vocabulary"
******* Article number: 519 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 520 **********
"word 'schulstudie' not in vocabulary"
"word 'rachenabstriche' not in vocabulary"
******* Article number: 521 **********
"word 'tlymphozyten' not in vocabulary"
"word

******* Article number: 701 **********
"word 'coronaangst' not in vocabulary"
******* Article number: 702 **********
"word 'klimaaktivisten' not in vocabulary"
******* Article number: 703 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 704 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 705 **********
******* Article number: 706 **********
******* Article number: 707 **********
"word 'coronapandemie' not in vocabulary"
"word 'faktenfinder' not in vocabulary"
"word 'ardfaktenfinder' not in vocabulary"
******* Article number: 708 **********
******* Article number: 709 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 710 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 711 

******* Article number: 809 **********
******* Article number: 810 **********
"word 'coronademo' not in vocabulary"
"word 'coronapolitik' not in vocabulary"
******* Article number: 811 **********
"word 'demoverbot' not in vocabulary"
******* Article number: 812 **********
"word 'greenpeaceaktivisten' not in vocabulary"
"word '2038' not in vocabulary"
******* Article number: 813 **********
"word '308' not in vocabulary"
"word 'maskenpflicht' not in vocabulary"
"word 'coronavirusnews' not in vocabulary"
******* Article number: 814 **********
"word 'reichstagstreppe' not in vocabulary"
******* Article number: 815 **********
"word 'coronademonstration' not in vocabulary"
******* Article number: 816 **********
"word 'coronademo' not in vocabulary"
"word 'demoblog' not in vocabulary"
******* Article number: 817 **********
"word '200' not in vocabulary"
"word 'überfordertpolitiker' not in vocabulary"
"word 'reichstagstreppe' not in vocabulary"
"word 'coronagroßdemo' not in vocabulary"
*******

******* Article number: 1045 **********
"word 'coronademonstration' not in vocabulary"
******* Article number: 1046 **********
"word 'coronainfektion' not in vocabulary"
"word 'robertkochinstitut' not in vocabulary"
"word 'covid19' not in vocabulary"
******* Article number: 1047 **********
"word 'coronademonstration' not in vocabulary"
******* Article number: 1048 **********
"word 'coronademo' not in vocabulary"
"word 'freiheitlichdemokratischen' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
******* Article number: 1049 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1050 **********
"word '160' not in vocabulary"
******* Article number: 1051 **********
"word 'coronamaßnahmen' not in vocabulary"
"word 'coronademonstrationen' not in vocabulary"
******* Article number: 1052 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' no

******* Article number: 1296 **********
"word '3439' not in vocabulary"
"word 'siebentageinzidenz' not in vocabulary"
"word 'todesfälleunter' not in vocabulary"
"word 'intensivbettenampel' not in vocabulary"
******* Article number: 1297 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1298 **********
"word 'politikerranking' not in vocabulary"
"word 'coronakrise' not in vocabulary"
******* Article number: 1299 **********
"word 'coronapandemie' not in vocabulary"
"word 'hochzeitenso' not in vocabulary"
******* Article number: 1300 **********
"word '3230' not in vocabulary"
******* Article number: 1301 **********
"word 'coronaviruspandemie' not in vocabulary"
******* Article number: 1302 **********
"word 'qanon' not in vocabulary"
"word 'verschwörungsgläubigen' not in vocabulary"
"word 'qanonanhänger' not in vocabulary"
******* Article number: 1303 **********
"word '2' not in vocabulary"
"wor

******* Article number: 1439 **********
******* Article number: 1440 **********
"word 'passiertim' not in vocabulary"
"word 'isrückkehrer' not in vocabulary"
******* Article number: 1441 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1442 **********
"word 'covid19toten' not in vocabulary"
"word '97' not in vocabulary"
"word 'coronavirustoten' not in vocabulary"
"word 'covid19' not in vocabulary"
******* Article number: 1443 **********
"word 'saibous' not in vocabulary"
"word 'coronademo' not in vocabulary"
******* Article number: 1444 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1445 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1446 **********
"word 'wollenvon' not in vocabulary"
"word 'sta

******* Article number: 1607 **********
"word '1918' not in vocabulary"
"word '1917' not in vocabulary"
******* Article number: 1608 **********
******* Article number: 1609 **********
"word 'covid19' not in vocabulary"
******* Article number: 1610 **********
"word '40000' not in vocabulary"
"word '10000' not in vocabulary"
******* Article number: 1611 **********
******* Article number: 1612 **********
"word 'frontkämpferarzt' not in vocabulary"
******* Article number: 1613 **********
******* Article number: 1614 **********
******* Article number: 1615 **********
"word 'coronamaßnahmen' not in vocabulary"
******* Article number: 1616 **********
******* Article number: 1617 **********
"word 'terrassenplätze' not in vocabulary"
"word 'restaurantsterben' not in vocabulary"
"word 'heizpilze' not in vocabulary"
******* Article number: 1618 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1619 ***

"word 'unternehmertumsein' not in vocabulary"
"word 'maskenproduktion' not in vocabulary"
"word 'whatsappnachrichten' not in vocabulary"
******* Article number: 1803 **********
"word '10' not in vocabulary"
******* Article number: 1804 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1805 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1806 **********
******* Article number: 1807 **********
******* Article number: 1808 **********
"word 'uswahlrecht' not in vocabulary"
******* Article number: 1809 **********
"word 'medikamentenklau' not in vocabulary"
******* Article number: 1810 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 1811 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in v

******* Article number: 1947 **********
******* Article number: 1948 **********
"word '30000' not in vocabulary"
******* Article number: 1949 **********
******* Article number: 1950 **********
"word 'bertelsmannstudie' not in vocabulary"
"word 'spdexperte' not in vocabulary"
******* Article number: 1951 **********
"word 'coronabeschränkungen' not in vocabulary"
******* Article number: 1952 **********
"word 'allesdichtmachen' not in vocabulary"
"word 'youtubeum' not in vocabulary"
******* Article number: 1953 **********
"word 'weiterlesenklicken' not in vocabulary"
"word 'werdenhier' not in vocabulary"
"word 'spiegelzugang' not in vocabulary"
"word 'genutztspiegel' not in vocabulary"
******* Article number: 1954 **********
"word 'coronakritik' not in vocabulary"
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 1955 **********
"word 'mdrrauswurf' not in vocabulary"
******* Article number: 1956 **********
******* Article number: 1957 **********
"word 'zeitde' not in voc

******* Article number: 2047 **********
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 2048 **********
"word 'ardsatire' not in vocabulary"
******* Article number: 2049 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2050 **********
"word 'westenindien' not in vocabulary"
******* Article number: 2051 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2052 **********
"word 'iwfchefin' not in vocabulary"
"word '2007' not in vocabulary"
******* Article number: 2053 **********
"word 'polizeimitarbeiterin' not in vocabulary"
******* Article number: 2054 **********
"word 'coronaopfern' not in vocabulary"
******* Article number: 2055 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 2056 **********
"word 'ausbildungsgehälter' not in vocabulary"
"word 'mind

******* Article number: 2167 **********
"word 'siebentageinzidenz' not in vocabulary"
******* Article number: 2168 **********
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 2169 **********
"word 'allesschlichtmachen' not in vocabulary"
******* Article number: 2170 **********
******* Article number: 2171 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2172 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2173 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2174 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2175 **********
"word 'zahlenkosmetik' not in vocabulary"
******* Article n

******* Article number: 2365 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 2366 **********
******* Article number: 2367 **********
"word 'sterbefallzahlen' not in vocabulary"
******* Article number: 2368 **********
"word 'coronakrise' not in vocabulary"
******* Article number: 2369 **********
******* Article number: 2370 **********
"word 'doppelmutante' not in vocabulary"
******* Article number: 2371 **********
******* Article number: 2372 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2373 **********
******* Article number: 2374 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2375 **********
"word 'winterwelle' not in vocabulary"
******* Article number: 2376 **********
"word 'lockdowns' not in vocabulary"
******* Article number: 2377 **********
******* Article 

"word 'fdjvergangenheit' not in vocabulary"
******* Article number: 2523 **********
"word 'stattdie' not in vocabulary"
******* Article number: 2524 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2525 **********
"word 'lockdowns' not in vocabulary"
"word 'wissenschaftsleugnung' not in vocabulary"
******* Article number: 2526 **********
"word 'unglücklichder' not in vocabulary"
"word 'ändernerzbistum' not in vocabulary"
******* Article number: 2527 **********
"word 'querdenkendemos' not in vocabulary"
"word '21000' not in vocabulary"
******* Article number: 2528 **********
"word 'coronakrise' not in vocabulary"
******* Article number: 2529 **********
"word 'biontech' not in vocabulary"
"word 'b1351' not in vocabulary"
******* Article number: 2530 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number:

"word 'genutztspiegel' not in vocabulary"
******* Article number: 2661 **********
"word 'milliardengewinn' not in vocabulary"
******* Article number: 2662 **********
"word 'tabakverbot' not in vocabulary"
******* Article number: 2663 **********
"word 'überhauptbayern' not in vocabulary"
"word 'teilimpfpflicht' not in vocabulary"
******* Article number: 2664 **********
"word 'coronalage' not in vocabulary"
******* Article number: 2665 **********
******* Article number: 2666 **********
"word 'winterwelle' not in vocabulary"
******* Article number: 2667 **********
"word 'querdenkenbewegung' not in vocabulary"
"word 'querdenkerdemo' not in vocabulary"
******* Article number: 2668 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2669 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2670 **********
"w

******* Article number: 2817 **********
"word '32' not in vocabulary"
******* Article number: 2818 **********
"word 'coronatests' not in vocabulary"
******* Article number: 2819 **********
"word 'covid19' not in vocabulary"
******* Article number: 2820 **********
"word 'csupolitiker' not in vocabulary"
"word 'coronaschnelltest' not in vocabulary"
"word '300000' not in vocabulary"
******* Article number: 2821 **********
"word 'coronahotspot' not in vocabulary"
******* Article number: 2822 **********
******* Article number: 2823 **********
"word 'miniherbergen' not in vocabulary"
******* Article number: 2824 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2825 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2826 **********
"word 'maskenpflicht' not in vocabulary"
"word 'coronaregeln' not in voca

******* Article number: 2984 **********
"word 'vätermonate' not in vocabulary"
******* Article number: 2985 **********
******* Article number: 2986 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2987 **********
"word 'ausrüstungsmangel' not in vocabulary"
******* Article number: 2988 **********
******* Article number: 2989 **********
******* Article number: 2990 **********
"word 'maskenpflicht' not in vocabulary"
******* Article number: 2991 **********
"word 'coronamaßnahmen' not in vocabulary"
******* Article number: 2992 **********
"word 'coronainfektion' not in vocabulary"
******* Article number: 2993 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 2994 **********
"word 'selbstentlastungszeuge' not in vocabulary"
"word 'wirecarduntersuchungsausschuss' not in vocabulary"
******* Article num

******* Article number: 3085 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3086 **********
"word 'querdenkerdemos' not in vocabulary"
"word 'bundesnotbremse' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
"word 'zusammengetragentagesspiegelunser' not in vocabulary"
"word '250' not in vocabulary"
******* Article number: 3087 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3088 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3089 **********
"word 'querdenkerdemos' not in vocabulary"
"word 'bundesnotbremse' not in vocabulary"
"word 'coronaproteste' not in vocabulary"
"word 'zusammengetragentagesspiegelunser' not in vocabulary"
"word '250' not in vocabulary"
******* Article number: 309

******* Article number: 3172 **********
"word 'coronademos' not in vocabulary"
"word 'querdenkerprotest' not in vocabulary"
******* Article number: 3173 **********
"word 'coronademos' not in vocabulary"
"word 'querdenkerprotest' not in vocabulary"
******* Article number: 3174 **********
"word 'siebentageinzidenz' not in vocabulary"
"word 'rkiprognose' not in vocabulary"
******* Article number: 3175 **********
"word 'querdenkendemos' not in vocabulary"
******* Article number: 3176 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3177 **********
"word 'winterwelle' not in vocabulary"
******* Article number: 3178 **********
"word 'netzwerkertelegram' not in vocabulary"
"word 'telegramgruppen' not in vocabulary"
"word 'tonlinerecherchen' not in vocabulary"
******* Article number: 3179 **********
"word 'querdenkendemos' not in vocabulary"
******* Article number: 3180 **********
"word 'querdenken

******* Article number: 3279 **********
"word 'coronapandemie' not in vocabulary"
"word 'hochzeitenso' not in vocabulary"
******* Article number: 3280 **********
"word 'coronapandemie' not in vocabulary"
"word 'hochzeitenso' not in vocabulary"
******* Article number: 3281 **********
"word 'oscarbeitrager' not in vocabulary"
"word 'sciencefictionfilm' not in vocabulary"
"word 'oscarbewerbung' not in vocabulary"
******* Article number: 3282 **********
******* Article number: 3283 **********
"word 'impfgipfel' not in vocabulary"
******* Article number: 3284 **********
"word 'coronaschulpolitik' not in vocabulary"
******* Article number: 3285 **********
"word 'coronapolitik' not in vocabulary"
******* Article number: 3286 **********
"word '130000' not in vocabulary"
******* Article number: 3287 **********
"word 'lucaapp' not in vocabulary"
"word 'kontaktnachverfolgung' not in vocabulary"
"word 'lucasystem' not in vocabulary"
******* Article number: 3288 **********
"word 'impfgipfel' not in

******* Article number: 3405 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3406 **********
"word 'pandemiebeginn' not in vocabulary"
"word '100000' not in vocabulary"
******* Article number: 3407 **********
"word 'verfassungabtreibungen' not in vocabulary"
"word 'usstaaten' not in vocabulary"
******* Article number: 3408 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werbungbesuchen' not in vocabulary"
******* Article number: 3409 **********
"word 'coronaopfer' not in vocabulary"
******* Article number: 3410 **********
"word 'triageregelung' not in vocabulary"
******* Article number: 3411 **********
"word 'coronahochburg' not in vocabulary"
******* Article number: 3412 **********
"word 'exfdppolitiker' not in vocabulary"
******* Article number: 3413 **********
"word 'maasmitarbeiterin' not in vocabulary"
******* Article number: 3414 **********
"word

******* Article number: 3514 **********
"word '19jährigen' not in vocabulary"
"word 'qosay' not in vocabulary"
******* Article number: 3515 **********
"word 'tagesschaude' not in vocabulary"
******* Article number: 3516 **********
******* Article number: 3517 **********
"word 'kinderhass' not in vocabulary"
******* Article number: 3518 **********
******* Article number: 3519 **********
"word 'querdenkendemo' not in vocabulary"
******* Article number: 3520 **********
"word 'querdenkenbewegung' not in vocabulary"
******* Article number: 3521 **********
******* Article number: 3522 **********
"word 'coronaschwindel' not in vocabulary"
"word 'verschwörungslegenden' not in vocabulary"
******* Article number: 3523 **********
"word 'genderdoppelpunkt' not in vocabulary"
"word 'gendersternchen' not in vocabulary"
******* Article number: 3524 **********
"word 'allesdichtmachen' not in vocabulary"
******* Article number: 3525 **********
"word 'querdenkerpartei' not in vocabulary"
"word 'allesdic

******* Article number: 3615 **********
******* Article number: 3616 **********
******* Article number: 3617 **********
"word 'coronawelle' not in vocabulary"
"word 'intensivpflegekräfte' not in vocabulary"
******* Article number: 3618 **********
"word 'maskenurteil' not in vocabulary"
"word 'maskenpflicht' not in vocabulary"
******* Article number: 3619 **********
******* Article number: 3620 **********
"word 'infektionsausbruch' not in vocabulary"
"word 'r1' not in vocabulary"
"word 'uspflegeheim' not in vocabulary"
******* Article number: 3621 **********
"word '200' not in vocabulary"
"word 'coronamaßnahmen' not in vocabulary"
"word 'schulnotbremse' not in vocabulary"
******* Article number: 3622 **********
"word '11644' not in vocabulary"
"word 'coronaneuinfektionen' not in vocabulary"
******* Article number: 3623 **********
******* Article number: 3624 **********
******* Article number: 3625 **********
"word 'zeitde' not in vocabulary"
"word 'purabo' not in vocabulary"
"word 'werb

******* Article number: 3716 **********
******* Article number: 3717 **********
"word 'loveparadekatastrophe' not in vocabulary"
******* Article number: 3718 **********
"word '274441' not in vocabulary"
******* Article number: 3719 **********
"word 'coronamaßnahmen' not in vocabulary"
"word 'querfrontdemonstration' not in vocabulary"
******* Article number: 3720 **********
******* Article number: 3721 **********
"word 'coronakrise' not in vocabulary"
******* Article number: 3722 **********
"word 'afdabgeordneten' not in vocabulary"
******* Article number: 3723 **********
"word 'coronademo' not in vocabulary"
"word 'demoblog' not in vocabulary"
******* Article number: 3724 **********
"word 'klimastreik' not in vocabulary"
******* Article number: 3725 **********
"word 'coronapolitik' not in vocabulary"
******* Article number: 3726 **********
"word '2012' not in vocabulary"
******* Article number: 3727 **********
"word 'coronabeschränkungen' not in vocabulary"
******* Article number: 3728

In [19]:
df.isna().sum()

expanded_url       0
domain             0
tag             3760
title              0
text               0
summary            0
keywords           0
vec_sum_avg        0
dtype: int64

In [20]:
df.sample(5)

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords,vec_sum_avg
1947,https://www.welt.de/wissenschaft/article230571...,welt.de,,Intensivstationen: „Wirklich dramatisch ist di...,Der Chef der Krankenhauskette Helios schätzt d...,Der Chef der Krankenhauskette Helios schätzt d...,"[aktuell, patienten, volle, lage, krankenhäuse...","[-0.16360557, -0.07560542, -0.12044308, -0.259..."
3017,https://www.tagesschau.de/inland/masken-entsch...,tagesschau.de,,"Entscheidungen zu Masken an Schulen ""Querdenke...","Entscheidungen zu Masken an Schulen ""Querdenke...","Entscheidungen zu Masken an Schulen ""Querdenke...","[eltern, weimar, richter, maskenpflicht, schul...","[-0.17733024, 0.25959465, -0.18276946, -0.3546..."
2083,https://www.zeit.de/politik/deutschland/2019-0...,zeit.de,,Lesen Sie zeit.de mit Werbung oder im PUR-Abo....,zeit.de mit Werbung\n\nBesuchen Sie zeit.de wi...,zeit.de mit WerbungBesuchen Sie zeit.de wie ge...,"[zeitde, werbung, purabo, tracking, lesen, gew...","[-0.07083344, 0.20493168, -0.09930655, -0.1599..."
2265,https://www.tagesschau.de/ausland/china-corona...,tagesschau.de,,China - Aktuelle Nachrichten,"Trockenheit um Peking Wasser, das nicht von se...","Trockenheit um Peking Wasser, das nicht von se...","[trockenheit, wasser, trockensten, versorgen, ...","[-0.32223552, 0.09758539, -0.24361116, -0.3794..."
1391,https://www.welt.de/politik/deutschland/articl...,welt.de,,Nawalny: Gregor Gysi verdächtigt Nord-Stream- ...,Während viele deutsche Politiker nach der Verg...,Während viele deutsche Politiker nach der Verg...,"[2gegner, russland, verdächtigt, regierung, na...","[-0.29391983, 0.29842094, -0.3905137, -0.58633..."


# 3. Modeling

In [21]:
from sklearn.cluster import KMeans

X = pd.DataFrame(df['vec_sum_avg'].to_list())
clusters = KMeans(n_clusters=2, random_state=0).fit_predict(X)

In [22]:
df['cluster'] = clusters
df.sample(5)

Unnamed: 0,expanded_url,domain,tag,title,text,summary,keywords,vec_sum_avg,cluster
2844,https://www.zeit.de/politik/ausland/2021-04/au...,zeit.de,,Lesen Sie zeit.de mit Werbung oder im PUR-Abo....,zeit.de mit Werbung\n\nBesuchen Sie zeit.de wi...,zeit.de mit WerbungBesuchen Sie zeit.de wie ge...,"[zeitde, werbung, purabo, tracking, lesen, gew...","[-0.07083344, 0.20493168, -0.09930655, -0.1599...",0
2523,https://www.tagesschau.de/ausland/europa/egmr-...,tagesschau.de,,EGMR - Aktuelle Nachrichten,Zerschlagung des Konzerns verletzte Grundrecht...,Zerschlagung des Konzerns verletzte Grundrecht...,"[trotzdem, verletzte, russische, unternehmens,...","[-0.09265971, 0.18524776, -0.22347648, -0.4096...",1
398,https://m.tagesspiegel.de/politik/anordnung-vo...,tagesspiegel.de,,Anordnung vom Gesundheitsamt: Kinder sollen be...,Ein Brief des Kommunalverbands verängstigt Elt...,In einem Schreiben des Kommunalverbands Region...,"[schreiben, anordnung, hannover, isoliert, sie...","[-0.19349097, 0.1434053, -0.14320543, -0.33538...",1
3224,http://www.tagesschau.de/faktenfinder/querdenk...,tagesschau.de,,faktenfinder - Aktuelle Nachrichten,Fake News in Deutschland Schneller als die Pol...,Fake News in Deutschland Schneller als die Pol...,"[mord, terrorzelle, faktenfinder, patrick, akt...","[-0.19024982, 0.14209354, -0.3217902, -0.26749...",1
97,https://www.tagesschau.de/eilmeldung/corona-de...,tagesschau.de,,Coronavirus - Aktuelle Nachrichten,Nach Urteil aus Karlsruhe Triage-Regelung noch...,Nach Urteil aus Karlsruhe Triage-Regelung noch...,"[gesundheitsminister, offenbar, entwurf, aktue...","[-0.12850012, 0.18998758, -0.13915099, -0.3537...",1


In [23]:
df0 = df[df.cluster==0]
df1 = df[df.cluster==1]
print("Cluster 0 size = "+str(len(df0)))
print("Cluster 1 size = "+str(len(df1)))

Cluster 0 size = 643
Cluster 1 size = 3117


In [24]:
df.profile_report()

Summarize dataset:   0%|          | 0/23 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

