# **Climate Change in American Publications**
based on https://www.dataquest.io/blog/tutorial-text-analysis-python-test-hypothesis/

In [None]:
import pandas as pd

# Load the news-article dataset from the Kaggle input directory.
articles = pd.read_csv("/kaggle/input/allthenews/articles.csv")

In [None]:
# Total number of rows loaded.
len(articles)

In [None]:
# Preview the first few rows to see which columns are available.
articles.head()

In [None]:
# Distinct values found in the 'publication' column.
articles.publication.unique()

In [None]:
# Smallest value in the 'year' column.
articles['year'].min()

In [None]:
# Largest value in the 'year' column.
articles['year'].max()

In [None]:
# Number of rows for each distinct 'year' value.
articles['year'].value_counts()

In [None]:
def clean_text(article):
    """Return a lower-cased, punctuation-free version of *article*.

    ASCII punctuation plus the typographic characters ’ — ” are deleted
    outright (so "don’t" becomes "dont"); every remaining run of non-word
    characters is then collapsed into a single space.

    Args:
        article: The raw article text as a string.

    Returns:
        The cleaned text, suitable for whitespace tokenisation.
    """
    # Escape the punctuation before placing it in a character class. Unescaped,
    # the backslash in string.punctuation escapes the following ']' and is
    # therefore never stripped itself.
    punct_class = '[' + re.escape(string.punctuation + '’—”') + ']'
    no_punct = re.sub(punct_class, "", article.lower())
    return re.sub(r'\W+', ' ', no_punct)

In [None]:
import string
import re

# Build the cleaned-text column. Values are converted to str first so that
# clean_text always receives a string.
articles['tokenized'] = articles['content'].astype(str).map(clean_text)

In [None]:
# Peek at the cleaned text produced by clean_text.
articles['tokenized'].head()

In [None]:
# Word count per article: split the cleaned text on whitespace and count the
# pieces, then show the average length.
articles['num_wds'] = articles['tokenized'].apply(lambda x: len(x.split()))
articles['num_wds'].mean()

In [None]:
# Longest article, in words.
articles['num_wds'].max()

In [None]:
# Shortest article, in words.
articles['num_wds'].min()

In [None]:
# Number of articles that contain no words at all after cleaning.
len(articles[articles['num_wds']==0])

In [None]:
# Drop articles that have no words after cleaning. An explicit copy is taken
# because new columns are assigned to this frame afterwards; assigning to a
# filtered selection of another DataFrame triggers SettingWithCopyWarning.
articles = articles[articles['num_wds'] > 0].copy()
articles['num_wds'].mean()

In [None]:
# Re-check the minimum word count now that zero-word rows have been removed.
articles['num_wds'].min()

In [None]:
# Histogram of article lengths (50 bins).
ax = articles['num_wds'].plot.hist(bins=50, figsize=(12, 10), fontsize=14)
ax.set_xlabel('Number of Words', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
ax.set_title('Article Length in Words\n', fontsize=20);

In [None]:
# Vocabulary size per article: number of distinct whitespace-separated tokens.
articles['uniq_wds'] = articles['tokenized'].map(lambda text: len(set(text.split())))
articles['uniq_wds'].head()

In [None]:
# Average number of distinct words per article.
articles['uniq_wds'].mean()

In [None]:
# Fewest distinct words in any article.
articles['uniq_wds'].min()

In [None]:
# Most distinct words in any article.
articles['uniq_wds'].max()

In [None]:
# Histogram of distinct-word counts per article (50 bins).
ax = articles['uniq_wds'].plot.hist(bins=50, figsize=(12, 10), fontsize=14)
ax.set_xlabel('Number of Unique Words', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
ax.set_title('Unique Words Per Article\n', fontsize=20);

In [None]:
import numpy as np

# Group the articles by publication; this grouping is reused by later cells.
art_grps = articles.groupby('publication')

# Bar chart of the mean distinct-word count per publication. The groupby's own
# .mean() is used instead of aggregate(np.mean), which recent pandas versions
# deprecate with a FutureWarning.
ax = art_grps['uniq_wds'].mean().plot(kind='bar', fontsize=14, figsize=(12,10))
ax.set_title('Mean Number of Unique Words per Article\n', fontsize=20)
ax.set_ylabel('Mean Number of Unique Words', fontsize=18)
ax.set_xlabel('Publication', fontsize=18);

In [None]:
from collections import Counter

# Tally every token across all articles. Only the 'tokenized' column is
# needed, so iterate over it directly.
wd_counts = Counter()
for text in articles['tokenized']:
    wd_counts.update(text.split())

In [None]:
# The 20 most frequent tokens before stop-word removal.
wd_counts.most_common(20)

In [None]:
from nltk.corpus import stopwords

# Remove English stop words from the tally. Deleting a key that is absent
# from a Counter is a no-op, so no membership check is needed.
for sw in stopwords.words('english'):
    del wd_counts[sw]

In [None]:
# The 20 most frequent tokens once stop words are removed.
wd_counts.most_common(20)

In [None]:
# Pattern for "disjoint" mentions: a word beginning with "chang" and the word
# "climate" separated by one to five other words, in either order.
disj = re.compile(r'(chang\w+\W+(?:\w+\W+){1,5}?climate)|(climate\W+(?:\w+\W+){1,5}?chang)')

In [None]:
# match() only tries the pattern at the start of the string; this one matches
# ("climate" ... "chang" with one word in between).
disj.match('climate is changing')

In [None]:
# Also matches: "change" ... "climate" with two words in between.
disj.match('change in extreme  climate')

In [None]:
# The word "climate" never appears, so this returns None.
disj.match('nothing changing here except the weather')

In [None]:
# A word beginning with "chang" and the word "climate" separated by one to
# five other words, in either order (e.g. "climate is changing").
_CC_DISJOINT_RE = re.compile(
    r'(chang\w+\W+(?:\w+\W+){1,5}?climate)|(climate\W+(?:\w+\W+){1,5}?chang)'
)


def find_cc_wds(content, cc_wds=('climate change', 'global warming', 'extreme weather', 'greenhouse gas',
                                 'clean energy', 'clean tech', 'renewable energy')):
    """Return True if *content* mentions a climate-change related topic.

    A mention is either one of the phrases in *cc_wds* occurring as a
    substring, or "climate" and a word beginning with "chang" appearing
    within a few words of each other anywhere in the text.

    Args:
        content: Text to inspect. Phrases are compared as-is, so the text
            should already be normalised the same way as the phrases
            (the defaults are lower-case without punctuation).
        cc_wds: Iterable of phrases to look for.

    Returns:
        bool: True when a phrase or a disjoint climate/change mention is found.
    """
    if any(w in content for w in cc_wds):
        return True
    # search() rather than match(): the mention may occur anywhere in the
    # article, not only at its very beginning.
    return _CC_DISJOINT_RE.search(content) is not None

In [None]:
# Flag each article whose cleaned text is recognised by find_cc_wds.
articles['cc_wds'] = articles['tokenized'].apply(find_cc_wds)
articles['cc_wds'].head()

In [None]:
# Share of all articles that were flagged.
articles['cc_wds'].sum() / len(articles)

In [None]:
# Number of flagged articles per publication.
art_grps['cc_wds'].sum()

In [None]:
# Number of articles (non-null flags) per publication.
art_grps['cc_wds'].count()

In [None]:
# Share of flagged articles within each publication.
proportions = art_grps['cc_wds'].sum() / art_grps['cc_wds'].count()
# sort_values returns a new Series rather than sorting in place, so it is
# made the cell's final expression so that the sorted result is displayed.
proportions.sort_values(ascending=True)

In [None]:
# Bar chart of the per-publication proportions, largest first.
ax = proportions.sort_values(ascending=False).plot.bar(figsize=(12, 10), fontsize=14)
ax.set_xlabel('Publication', fontsize=18)
ax.set_ylabel('Mean Proportion', fontsize=18)
ax.set_title('Mean Proportion of Climate Change Related Articles per Publication (Sorted)\n', fontsize=20);

In [None]:
# Label assigned to each publication: 'left', 'right' or 'center'.
bias_assigns = {
    'Atlantic': 'left',
    'Breitbart': 'right',
    'Business Insider': 'left',
    'Buzzfeed News': 'left',
    'CNN': 'left',
    'Fox News': 'right',
    'Guardian': 'left',
    'National Review': 'right',
    'New York Post': 'right',
    'New York Times': 'left',
    'NPR': 'left',
    'Reuters': 'center',
    'Talking Points Memo': 'left',
    'Washington Post': 'left',
    'Vox': 'left',
}
# Direct dictionary lookup per row: a publication missing from the table
# raises KeyError.
articles['bias'] = articles['publication'].apply(bias_assigns.__getitem__)

articles.head()

In [None]:
# Share of flagged articles within each bias label.
bias_groups = articles.groupby('bias')
bias_proportions = bias_groups['cc_wds'].sum() / bias_groups['cc_wds'].count()

In [None]:
# Number of articles (non-null flags) in each bias group.
bias_groups['cc_wds'].count()

In [None]:
# Bar chart of the flagged-article share for each bias label.
ax = bias_proportions.plot.bar(figsize=(12, 10), fontsize=14)
ax.set_ylabel('Proportion', fontsize=18)
ax.set_xlabel('Bias', fontsize=18)
ax.set_title('Proportion of climate change articles by Political Bias\n', fontsize=20);