# Page Classfication

The goal of this notebook is to classify pages into 2 classes - clima-related; non-clima based on cca 20-40 last posts. Initialy there were 10 pro-clima (believe in climate change) pages and 10 anti-clima pages (not believing in climate change).

In [1]:
import json
import spacy
from collections import Counter
from tqdm.notebook import tqdm
import random

### Load pages splitted by the root page class affinity

In [3]:
pos_pages = [i.strip().split('/')[-1] 
             for i in open('/home/spaceape/projects/kaca-dp/data/split_positive_uniq.csv').readlines()]
neg_pages = [i.strip().split('/')[-1] 
             for i in open('/home/spaceape/projects/kaca-dp/data/split_negative_uniq.csv').readlines()]

In [4]:
print(f"positive: {len(pos_pages)}\nnegative: {len(neg_pages)}")

positive: 1819
negative: 1936


In [5]:
pos = ['NASAClimateChange',
       'acespace', 'greenpeace.international',
       'ClimateChangeIsReal', 'climatereality',
       'ClimateChangeNews', 'NatureClimateChange',
       'ClimateChangeCauses', 'GlobalwarmingEva']

neg = ['cfact',
       'iloveco2', 'ClimateChangeLIES',
       'australianclimatemadness', 'ClimateDepot',
       'ccdispatch', 'TheCO2Coalition',
       'Global-Warming-Climate-Change-whatever-its-called-is-a-scam-241741379521',
       'Global-Climate-Scam-1392664020945768']

selected_pages = pos + neg

def append_related(source, target):
    for i in source.split(';'):
        if i.strip():
            target.append(i.strip())
    return target

with open('data/data1.csv', 'r') as f1:
    for i, line in enumerate(f1.readlines()):
        init_page = line.split(';', 1)[0].strip()
        if init_page in selected_pages:
            if line.strip():
                selected_pages.append(line.strip())
            selected_pages = append_related(line, selected_pages)

with open('data/data2-uniq.csv', 'r') as f2:
    for i, line in enumerate(f2.readlines()):
        init_page = line.split(';', 1)[0].strip()
        if init_page in selected_pages:
            if line.strip():
                selected_pages.append(line.strip())
            selected_pages = append_related(line, selected_pages)

In [6]:
selected_pages = sorted(list(set(selected_pages)))

In [7]:
len(selected_pages)

3635

In [8]:
print(f"positive: {len(pos_pages)}    negative: {len(neg_pages)}")

positive: 1819    negative: 1936


### Load posts

In [10]:
posts0 = json.load(open('/home/spaceape/projects/kaca-dp/data/data-posts-0.json'))
posts1 = json.load(open('/home/spaceape/projects/kaca-dp/data/data-posts-1.json'))

posts2_pos = json.load(open('/home/spaceape/projects/kaca-dp/data/data-posts-pos-2.json'))
posts2_neg = json.load(open('/home/spaceape/projects/kaca-dp/data/data-posts-neg-2.json'))

In [11]:
posts0.keys()

dict_keys(['cfact', 'ClimateChangeIsNatural', 'iloveco2', 'ClimateChangeLIES', 'australianclimatemadness', 'ClimateDepot', 'ccdispatch', 'TheCO2Coalition', 'Global-Warming-Climate-Change-whatever-its-called-is-a-scam-241741379521', 'Global-Climate-Scam-1392664020945768', 'greenpeace.international', 'NASAClimateChange', 'bloomberggreen', 'acespace', 'ClimateChangeIsReal', 'climatereality', 'ClimateChangeNews', 'NatureClimateChange', 'ClimateChangeCauses', 'GlobalwarmingEva'])

In [12]:
for k, v in posts0.items():
    print(f"{k}: posts: {len(v)} words-rough: {len([ii for i in v for ii in i.split()])}")

cfact: posts: 34 words-rough: 22292
ClimateChangeIsNatural: posts: 36 words-rough: 1349
iloveco2: posts: 33 words-rough: 3256
ClimateChangeLIES: posts: 37 words-rough: 1211
australianclimatemadness: posts: 38 words-rough: 4085
ClimateDepot: posts: 45 words-rough: 9188
ccdispatch: posts: 38 words-rough: 694
TheCO2Coalition: posts: 38 words-rough: 4894
Global-Warming-Climate-Change-whatever-its-called-is-a-scam-241741379521: posts: 37 words-rough: 467
Global-Climate-Scam-1392664020945768: posts: 38 words-rough: 619
greenpeace.international: posts: 40 words-rough: 1954
NASAClimateChange: posts: 38 words-rough: 2073
bloomberggreen: posts: 38 words-rough: 1218
acespace: posts: 26 words-rough: 1493
ClimateChangeIsReal: posts: 39 words-rough: 6226
climatereality: posts: 38 words-rough: 1230
ClimateChangeNews: posts: 38 words-rough: 1712
NatureClimateChange: posts: 38 words-rough: 1697
ClimateChangeCauses: posts: 37 words-rough: 867
GlobalwarmingEva: posts: 36 words-rough: 723


### Load NLP engine

In [13]:
nlp = spacy.load("en_core_web_lg")

In [14]:
### Illustration
i = nlp('the a and')
for ii in i:
    print(ii, type(ii), ii.is_stop, str(ii) in nlp.Defaults.stop_words)

the <class 'spacy.tokens.token.Token'> False True
a <class 'spacy.tokens.token.Token'> False True
and <class 'spacy.tokens.token.Token'> False True


In [15]:
def lemmatize(posts):  
    lemmatized = {k: [token.lemma_ 
                       for token in nlp(". ".join(v)) 
                       if not str(token) in nlp.Defaults.stop_words and 
                       not token.is_punct and not token.is_space] 
                   for k, v in tqdm(posts.items())}
    return lemmatized

### Remove stopwords, punctuation, spaces and lemmatize all the posts 

In [16]:
lemmatized0 = lemmatize(posts0)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
lemmatized1 = lemmatize(posts1)

In [None]:
lemmatized2_pos = lemmatize(posts2_pos)

In [None]:
lemmatized2_neg = lemmatize(posts2_neg)

In [18]:
lemmatized0

{'cfact': ['joe',
  'biden',
  'kamala',
  'harris',
  'aoc',
  'democrat',
  'party',
  'us',
  'environmentalist',
  'commit',
  'make',
  'climate',
  'change',
  'green',
  'new',
  'deal',
  'replacement',
  'fossil',
  'fuel',
  'wind',
  'solar',
  'battery',
  'biofuel',
  'power',
  'centerpiece',
  'foreign',
  'domestic',
  'policy',
  'share',
  'fact',
  'cfact.org',
  '-PRON-',
  'claim',
  'transition',
  'easy',
  'affordable',
  'ecological',
  'sustainable',
  'painless',
  'that',
  '’',
  'ideology',
  'fantasy',
  'reality',
  'wind',
  'sunshine',
  'certainly',
  'clean',
  'renewable',
  'harness',
  'power',
  'america',
  'the',
  'gnd',
  'hit',
  'american',
  'family',
  'job',
  'living',
  'standard',
  'environmental',
  'quality',
  'hard',
  'western',
  'state',
  'feel',
  'brunt',
  'fossil',
  'fuel',
  'rent',
  'royalty',
  'job',
  'tax',
  'receipt',
  'disappear',
  'drilling',
  'fracking',
  'coal',
  'mining',
  'federal',
  'land',
  'clos

### Count all the lemmas (words) per page and Sum of all words per page

In [19]:
counters0 = {k: Counter(v) for k, v in lemmatized0.items()}
counters1 = {k: Counter(v) for k, v in lemmatized1.items()}
counters_pos2 = {k: Counter(v) for k, v in lemmatized2_pos.items()}
counters_neg2 = {k: Counter(v) for k, v in lemmatized2_neg.items()}

In [20]:
count_words0 = {k: sum([len(vv.split()) for vv in v]) for k, v in lemmatized0.items()}
count_words1 = {k: sum([len(vv.split()) for vv in v]) for k, v in lemmatized1.items()}
count_words_pos2 = {k: sum([len(vv.split()) for vv in v]) for k, v in lemmatized2_pos.items()}
count_words_neg2 = {k: sum([len(vv.split()) for vv in v]) for k, v in lemmatized2_neg.items()}

In [21]:
count_words0

{'cfact': 13069,
 'ClimateChangeIsNatural': 981,
 'iloveco2': 2067,
 'ClimateChangeLIES': 859,
 'australianclimatemadness': 2613,
 'ClimateDepot': 5535,
 'ccdispatch': 708,
 'TheCO2Coalition': 3437,
 'Global-Warming-Climate-Change-whatever-its-called-is-a-scam-241741379521': 298,
 'Global-Climate-Scam-1392664020945768': 530,
 'greenpeace.international': 1337,
 'NASAClimateChange': 1553,
 'bloomberggreen': 893,
 'acespace': 986,
 'ClimateChangeIsReal': 3889,
 'climatereality': 841,
 'ClimateChangeNews': 1160,
 'NatureClimateChange': 1201,
 'ClimateChangeCauses': 591,
 'GlobalwarmingEva': 476}

## Calculate scores 
### simple score = how much is a page related to clima based on the last posts

Jake prumerne/medianove skore maji inicialni stranky, ktere uz mame olabelovane --
muze se ukazat, ze negativni stranky vykazuji obecne nizsi skore a pak se jich mene zahrne do souboru v nasledne selekci (coz pozorujeme! cca 50 neg vs. 250 pos)

In [22]:
keywords = ['climate', 'co2', 'change', 'warming', ]

In [23]:
def sort_by_score(l):
    return sorted(l, key=lambda x: x[1], reverse=True)

In [24]:
import statistics as stat

In [25]:
print((f"median word count: {stat.median(sorted([v for k, v in count_words0.items()]))}\n" + 
      f"mean word count: {sum([v for k,v in count_words0.items()]) / len(count_words0)}"))

median word count: 1073.0
mean word count: 2151.2


In [26]:
print((f"median wc pos: {stat.median(sorted([v for k, v in count_words0.items() if k in pos]))}\n" + 
      f"mean wc pos: {sum([v for k,v in count_words0.items() if k in pos]) / len([k for k,v in count_words0.items() if k in pos])}"))

median wc pos: 1160
mean wc pos: 1337.111111111111


In [27]:
print((f"median wc pos: {stat.median(sorted([v for k, v in count_words0.items() if k in neg]))}\n" + 
      f"mean wc pos: {sum([v for k,v in count_words0.items() if k in neg]) / len([k for k,v in count_words0.items() if k in neg])}"))

median wc pos: 2067
mean wc pos: 3235.1111111111113


In [28]:
# TF-IDF / BM25
def calculate_scores(keywords, counts, count_words):
    ref_doc_len = stat.median(sorted([v for k,v in count_words.items()]))
    ret = []
    for k, v in counts.items():
        tf = sum([counts[k][key] for key in keywords])
        score = tf / (0.75 * count_words[k] / ref_doc_len) if count_words[k] > 0 else 0
        ret.append((k, score)) 
    return sort_by_score(ret)

In [29]:
init_scores = calculate_scores(keywords, counters0, count_words0)
init_scores

[('climatereality', 98.66666666666667),
 ('ClimateChangeNews', 98.66666666666666),
 ('Global-Climate-Scam-1392664020945768', 94.47798742138366),
 ('NatureClimateChange', 90.53344435192895),
 ('TheCO2Coalition', 84.08340607118612),
 ('ccdispatch', 82.84934086629002),
 ('australianclimatemadness', 77.74767189692562),
 ('ClimateChangeCauses', 60.51889452904681),
 ('iloveco2', 60.21673923560716),
 ('acespace', 55.13725490196079),
 ('ClimateChangeLIES', 49.965075669383005),
 ('NASAClimateChange', 47.90384202618588),
 ('ClimateDepot', 41.87317073170732),
 ('ClimateChangeIsReal', 41.20202279934859),
 ('GlobalwarmingEva', 39.07282913165266),
 ('bloomberggreen', 33.64389697648377),
 ('greenpeace.international', 31.031662926950883),
 ('Global-Warming-Climate-Change-whatever-its-called-is-a-scam-241741379521',
  28.80536912751678),
 ('cfact', 23.53610324686918),
 ('ClimateChangeIsNatural', 7.291879034998301)]

In [30]:
print((f"median score pos: {stat.median(sorted([v for k, v in init_scores if k in pos]))}\n" + 
      f"mean score pos: {sum([v for k,v in init_scores if k in pos]) / len([k for k,v in init_scores if k in pos])}"))

median score pos: 55.13725490196079
mean score pos: 62.525920444489756


In [31]:
print((f"median score neg: {stat.median(sorted([v for k, v in init_scores if k in neg]))}\n" + 
      f"mean score neg: {sum([v for k,v in init_scores if k in neg]) / len([k for k,v in init_scores if k in neg])}"))

median score neg: 60.21673923560716
mean score neg: 60.394984918540985


### Pos stranky jsou takove, na ktere se dostanu, kdyz zacnu v nektere z pozitivnich korenovych stranek a naopak pro negativni a neg label.

In [32]:
def get_class(pos_pages, neg_pages, pages):
    clima_pos, clima_neg = [], []
    for p in pages:
        if p[0] not in (pos_pages + neg_pages):
            print("Missing page ", page)
        if p[0] in pos_pages:
            clima_pos.append(p[0])
        if p[0] in neg_pages:
            clima_neg.append(p[0])
    return clima_pos, clima_neg

In [33]:
print(f"{len(pos_pages) + len(neg_pages)}\t{len(set(pos_pages + neg_pages))}")

3755	3357


In [34]:
duplicates = (len(pos_pages) + len(neg_pages) - len(set(pos_pages + neg_pages))) / 2
print(f"There are about {duplicates} pages in both pos and neg.")

There are about 199.0 pages in both pos and neg.


In [35]:
clima, nclima = [], []
prev_cl, prev_ncl = 0, 0
THRESHOLD = 20
for counts, count_words, name in zip([counters0, counters1, counters_pos2, counters_neg2], 
                                       [count_words0, count_words1, count_words_pos2, count_words_neg2], 
                                       ['root', '1', '2-pos', '2-neg']):
    tmp_clima, tmp_nclima = [], []
    scores = calculate_scores(keywords, counts, count_words)    
    for page, sc in scores:
        if page in selected_pages:
            if sc < THRESHOLD:  # is clima / nonclima
                nclima.append((page, sc))
                tmp_nclima.append((page, sc))
            else:
                clima.append((page, sc))
                tmp_clima.append((page, sc))
            
    print(f"{name}:\t        clima: {len(clima) - prev_cl}\t\tnon-clima: {len(nclima) - prev_ncl}")
    proclima, anticlima = get_class(pos_pages, neg_pages, tmp_clima)
    print(f"\tproclima: {len(proclima)}\tanticlima: {len(anticlima)}\n")
    # proclima + anticlima != clima protoze nektere stranky jsou v pos i neg 
    # (dostanu se na ne z obou root class)
    # cisla 2-pos / 2-neg nenesou informaci o relaci. tzn nejsem si jisty, zda lze uplatnit ve smysli 2-neg
    # vidim vice anticlima stranek u 2-neg musi tam byt neco na strane algoritmu - pro neg doporucuje casteji neg
    # to nelze rici - mel jsem spatne pos/neg splity pri generovani a vztahy bude lepsi pocitat az na celem grafu potom
    prev_cl = len(clima)
    prev_ncl = len(nclima)

root:	        clima: 18		non-clima: 0
	proclima: 9	anticlima: 9

1:	        clima: 68		non-clima: 188
	proclima: 55	anticlima: 41

2-pos:	        clima: 217		non-clima: 887
	proclima: 211	anticlima: 92

2-neg:	        clima: 47		non-clima: 399
	proclima: 8	anticlima: 47



In [39]:
print(f"CLIMA: {len(clima)}\nNON-CLIMA: {len(nclima)}")

CLIMA: 350
NON-CLIMA: 1474


### Split the pages by its root page class - pro-clima / anti-clima

In [42]:
# print(f"pos: {len(proclima)}\tneg: {len(anticlima)}")

In [43]:
# len(set(proclima + anticlima))

### Save output

In [241]:
with open('to-josef', 'w') as fw:
    for i in sorted(list(set(proclima + anticlima))):
        if i.strip:
            fw.write('https://www.facebook.com/' + i.strip() + '\n')

In [57]:
## create a single column of pagenames
with open('data/data2-uniq.csv', 'r') as f:
    with open('data/to-be-scraped-neg', 'w') as fw:
        for l in f.readlines():
            p = l.split(';')[0]
            if p in clima_neg:
                for page in l.split(';')[1:]:
                    if page.strip():
                        fw.write(f"{page.strip()}\n")