In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Where the datasets live (relative to this notebook) and which file to use.
DATA_ROOT = Path("../../data")
TARGET_DATA = "climate_headlines_sentiment.csv"

# Read the raw CSV, using its first column as the row index.
raw_csv_path = DATA_ROOT / "raw" / "climate" / TARGET_DATA
data = pd.read_csv(raw_csv_path, index_col=0)
data

Unnamed: 0,Headline,Link,Content,Sentiment,Justification
0,Australia's year ahead in climate and environm...,https://www.abc.net.au/news/science/2024-01-23...,The year has barely started and extreme weath...,0.0,The headline is unclear about its direct impac...
1,Projections reveal the vulnerability of freshw...,https://news.griffith.edu.au/2024/01/09/projec...,"“Water from groundwater, rivers and rainfall ...",-0.5,The vulnerability of freshwater is concerning ...
2,"Record heat in 2023 worsened global droughts, ...",https://www.anu.edu.au/news/all-news/record-he...,2023 saw an increase in the frequency and int...,-1.0,The headline describes worsening environmental...
3,"It's not just the total rainfall ""“ why is eas...",https://www.theguardian.com/australia-news/202...,The number of storms in some regions is decre...,0.0,The headline is unclear about its stance on cl...
4,Expert Commentary: 2023 was the warmest year o...,https://www.csiro.au/en/news/all/news/2024/jan...,The European Union's Copernicus Climate Chang...,-0.5,While highlighting the reality of climate chan...
...,...,...,...,...,...
1019,"Boycott COP climate summit if Putin shows, Ukr...",https://www.politico.eu/article/boycott-climat...,The international community should shun the C...,-0.5,This is a politically charged statement and co...
1020,How climate change will impact outdoor activit...,https://news.mit.edu/2024/how-climate-change-w...,"Using the concept of “outdoor days,"" a study...",0.5,This provides valuable insights into how clima...
1021,Climate and Environment - The New York Times,https://www.nytimes.com/section/climate,Earth's warming could trigger sweeping change...,0.0,"This is a vague headline, and without more spe..."
1022,Climate Change News & Videos,https://abcnews.go.com/alerts/climatechange,Follow the latest Climate Change news stories...,0.0,"Another general headline, offering information..."


In [3]:
# Count missing values per column (the boolean mask is summed down each column).
data.isnull().sum()

Headline         0
Link             0
Content          0
Sentiment        1
Justification    1
dtype: int64

In [4]:
# Keep only rows that carry a sentiment label, then renumber the rows from 0.
has_sentiment = data["Sentiment"].notna()
data = data.loc[has_sentiment].reset_index(drop=True)

In [5]:
# Keep the labels aside before `data` is replaced by the text corpus.
target = data["Sentiment"]
# Join headline and article text with an explicit single space. A plain `+`
# glues the last word of the headline to the first word of the content whenever
# the content has no leading whitespace, which would create fused tokens in the
# vocabulary. Stripping first keeps the separator to exactly one space.
data = data["Headline"].str.strip().str.cat(data["Content"].str.strip(), sep=" ")

In [6]:
# Preview the combined headline + content text (pandas truncates the display).
data

0       Australia's year ahead in climate and environm...
1       Projections reveal the vulnerability of freshw...
2       Record heat in 2023 worsened global droughts, ...
3       It's not just the total rainfall "“ why is eas...
4       Expert Commentary: 2023 was the warmest year o...
                              ...                        
1018    Boycott COP climate summit if Putin shows, Ukr...
1019    How climate change will impact outdoor activit...
1020    Climate and Environment - The New York Times E...
1021    Climate Change News & Videos Follow the latest...
1022    Climate - BBC News Deadliest weather made wors...
Length: 1023, dtype: object

In [7]:
# Show the first document in full (row label 0 after the index reset).
data.loc[0]

"Australia's year ahead in climate and environment - ABC The year has barely started and extreme weather events are already in the headlines. Here are some more big environment issues to keep an eye on...."

In [8]:
# TF-IDF settings:
#   - strip_accents="ascii": fold accented characters to their ASCII form
#   - stop_words="english":  drop scikit-learn's built-in English stop words
#   - token_pattern:         a token is any run of ASCII letters, so digits and
#                            punctuation never become part of a token
tfidf_options = {
    "strip_accents": "ascii",
    "stop_words": "english",
    "token_pattern": r"[A-Za-z]+",
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the corpus and weight every document, then turn the
# sparse result into a dense NumPy array (one row per document).
tfidf_sparse = vectorizer.fit_transform(data)
data = tfidf_sparse.toarray()

In [9]:
# Look at the first 100 vocabulary terms as a sanity check of the tokenization.
feature_names = vectorizer.get_feature_names_out()
feature_names[:100]

array(['aarti', 'abandoned', 'abc', 'abenomics', 'ability', 'ablaze',
       'able', 'abrahm', 'abroad', 'abs', 'absence', 'absorbing',
       'acapulco', 'accelerate', 'accelerated', 'accelerating',
       'acceleration', 'access', 'accidents', 'accommodative',
       'accomplished', 'accord', 'according', 'accountability',
       'accurately', 'accus', 'accuses', 'achieve', 'achieved',
       'achievements', 'achieving', 'acknowledged', 'acquaints', 'acres',
       'act', 'acting', 'action', 'actionraising', 'actions', 'active',
       'actively', 'activism', 'activist', 'activists', 'activities',
       'activity', 'actor', 'actors', 'actual', 'actually', 'acute',
       'acutely', 'adam', 'adapt', 'adaptation', 'adapting', 'added',
       'addition', 'additional', 'additionally', 'address', 'addressed',
       'addresses', 'addressing', 'adds', 'adjustment', 'administration',
       'administrative', 'administrator', 'admissible', 'admit', 'adopt',
       'adopted', 'adopting', 'ad

In [10]:
# Shape of the dense TF-IDF matrix: (number of documents, vocabulary size).
data.shape

(1023, 3981)

In [11]:
# Peek at the dense TF-IDF matrix (NumPy abbreviates the display).
data

array([[0.        , 0.        , 0.24209752, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.22514621, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [25]:
# List the distinct sentiment scores (sorted) before encoding them as class ids.
np.unique(target)

array([-1. , -0.5,  0. ,  0.5,  1. ])

In [26]:
# Map the sentiment scores {-1, -0.5, 0, 0.5, 1} onto class ids {0, 1, 2, 3, 4}.
encoded = np.asarray(target, dtype=float) * 2 + 2

# Validate before casting: astype(np.uint8) truncates fractions and performs no
# range check, so an unexpected score (or running this cell a second time on
# already-encoded labels) would otherwise yield wrong class ids silently.
if not np.isin(encoded, np.arange(5)).all():
    raise ValueError(
        "Unexpected sentiment values; expected only -1, -0.5, 0, 0.5 or 1: "
        f"{np.unique(target)}"
    )
target = encoded.astype(np.uint8)

print(np.unique(target))
target

[0 1 2 3 4]


array([2, 1, 0, ..., 2, 2, 2], dtype=uint8)

In [27]:
# Output archive: <DATA_ROOT>/processed/climate/<dataset name>.npz
output_path = DATA_ROOT / "processed/climate" / Path(TARGET_DATA).with_suffix(".npz")
# Create the destination folder if it is missing so the export does not fail
# with FileNotFoundError on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# "xb" is exclusive binary creation: it raises FileExistsError instead of
# overwriting an archive that was exported earlier.
with open(output_path, "xb") as f:
    np.savez(
        f,
        data=data,
        target=target,
        # NOTE(review): `spatial` stores the same matrix as `data`; presumably a
        # key the downstream loader expects. Confirm, since it doubles the
        # size of the archive.
        spatial=data,
    )