In [1]:
# First we import the libraries we need

from PyPDF2 import PdfReader
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

In [2]:
# Open one of the PDFs for analysis -- in this example the IPCC AR6 report.
# The relative path is resolved against the current working directory.

pdf_path = 'IPCC_AR6.pdf'
reader = PdfReader(pdf_path)

In [3]:
# Collect the text of the required pages into a single string called 'text'.
# range(16, 45) is half-open, so this reads the zero-based page indices 16..44.

page_texts = [reader.pages[page_index].extract_text() for page_index in range(16, 45)]
# Every page is prefixed with one space, so 'text' itself also starts with a space.
text = ''.join(' ' + page_text for page_text in page_texts)

In [4]:
# Inspect the extracted text by running this cell. As the last expression in the
# cell, 'text' is displayed in full:

text



In [7]:
# Strip the bracket characters (){} from every whitespace-separated token, count how
# often each resulting word occurs, and store the counts in a dataframe called 'words_IPCC'.
# per_word keeps one entry per token (a token made only of brackets becomes an empty string).
bracket_stripper = str.maketrans('', '', '(){}')
per_word = [token.translate(bracket_stripper) for token in text.split()]
word_counts = Counter(per_word)
words_IPCC = pd.DataFrame.from_dict(word_counts, orient='index').reset_index()
words_IPCC.columns = ['word', 'freq']


In [8]:
# Inspect the result; you can also type per_word to see the underlying list of tokens:

words_IPCC

# Can you already spot some mistakes in the table? Which pre-processing steps would fix them?

Unnamed: 0,word,freq
0,5SPM,1
1,Summary,31
2,for,202
3,PolicymakersA:,1
4,Introduction,1
...,...,...
4279,closing,1
4280,secure,1
4281,liveable,1
4282,"SM16.24,",1


In [9]:
# Load the file with the sentiment scores (the ANEW lexicon). It is read as a
# tab-separated file without a header row, so the column names are supplied here.
anew_columns = ['word', 'wordnr', 'ValMN', 'ValSD', 'AroMN', 'AroSD', 'DomMN', 'DomSD', 'Frequency']
sent = pd.read_csv('ANEW.txt', sep='\t', header=None, names=anew_columns)
# Shift the mean valence down by 5 so that later cells can split words at zero.
# NOTE(review): presumably 5 is the neutral midpoint of the ANEW rating scale -- confirm.
sent['ValMN'] = sent['ValMN'] - 5

In [10]:
# Inspect the lexicon after the valence shift:

sent

Unnamed: 0,word,wordnr,ValMN,ValSD,AroMN,AroSD,DomMN,DomSD,Frequency
0,abduction,621,-2.24,2.06,5.53,2.43,3.49,2.38,1
1,abortion,622,-1.50,2.30,5.39,2.80,4.59,2.54,6
2,absurd,623,-0.74,1.82,4.36,2.20,4.73,1.72,17
3,abundance,624,1.59,2.01,5.51,2.63,5.80,2.16,13
4,abuse,1,-3.20,1.23,6.83,2.70,3.69,2.94,18
...,...,...,...,...,...,...,...,...,...
1029,yacht,1037,1.95,1.79,5.61,2.72,6.10,2.13,4
1030,yellow,545,0.61,1.94,4.43,2.05,5.47,1.58,55
1031,young,1038,1.89,2.12,5.64,2.51,5.30,2.49,385
1032,youth,1039,1.75,2.29,5.67,2.52,5.11,2.55,82


In [11]:
# Attach the sentiment scores to the word counts. The inner join keeps only the
# rows whose 'word' value appears in both dataframes (exact string match).

df_IPCCsent = words_IPCC.merge(sent, how='inner', on='word')

In [12]:
# Separate the frequency-weighted valence into a positive and a negative column.
# Rows that fall outside a column's mask are left as NaN in that column.

weighted_valence = df_IPCCsent['freq'] * df_IPCCsent['ValMN']
is_non_negative = df_IPCCsent['ValMN'] >= 0
is_negative = df_IPCCsent['ValMN'] < 0
df_IPCCsent['valpos'] = weighted_valence.where(is_non_negative)
df_IPCCsent['valneg'] = weighted_valence.where(is_negative)

In [13]:
# Normalise the summed weighted valence by the total number of words in the text:
# the denominator counts every token in per_word, not only the words found in the lexicon.

total_words = len(per_word)
negative_rows = df_IPCCsent['ValMN'] < 0
positive_rows = df_IPCCsent['ValMN'] >= 0
AR6_pm_neg = sum(df_IPCCsent.loc[negative_rows, 'valneg']) / total_words
AR6_pm_pos = sum(df_IPCCsent.loc[positive_rows, 'valpos']) / total_words

In [14]:
# Show the normalised sentiment scores as a (negative, positive) tuple:
AR6_pm_neg, AR6_pm_pos

(-0.005695568223611623, 0.0381339646929018)