# Pronoun percentages (see paper section 4.6)
According to Pennebaker (2010), the percentage of first-person pronoun use is a useful measure. Because the tagger that we used (Nini, 2015) only outputs relative frequencies per 100 tokens, we preprocess the full manifestos ourselves in order to compute the percentage of first-person singular pronouns relative to both the total number of words and the total number of pronouns.

In [None]:
import pandas as pd
import glob
import os
import re
import nltk
from nltk.tokenize import word_tokenize
nltk__word_tokenizer = word_tokenize

import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
############# LOADING IN THE FULL MANIFESTOS CORRECTLY ################ 

def get_df_full_manifestos(path):
    """Load the full manifestos matched by a glob pattern into a dataframe.

    Parameters
    ----------
    path : str
        Glob pattern that matches the manifesto ``.txt`` files.

    Returns
    -------
    pandas.DataFrame
        One row per matched file, with four columns in this order:
        ``filenames`` (base name without the trailing ``.txt``, with "2015"
        replaced by "2016"), ``year`` (the first four characters of
        ``filenames``), ``texts`` (the raw file content) and ``clean_text``
        (the content with every newline replaced by a space). When the
        pattern matches no file, an empty dataframe with the same four
        columns is returned.
    """
    names = []
    texts = []

    for filename in glob.glob(path):
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(filename, encoding="utf-8") as f:
            texts.append(f.read())

        name = os.path.basename(filename)
        # Strip only a trailing ".txt"; a ".txt" elsewhere in the name stays.
        if name.endswith(".txt"):
            name = name[:-len(".txt")]
        # change 2015 to 2016 (we made a mistake, the publication was actually in 2016)
        names.append(name.replace("2015", "2016"))

    return pd.DataFrame({
        "filenames": names,
        "year": [name[:4] for name in names],
        "texts": texts,
        # replace the \n in the text by a space
        "clean_text": [text.replace("\n", " ") for text in texts],
    })


########## PREPROCESSING functions to get pronoun percentages ############

def pos_tag(doc):
    """Tag a text with the module-level ``nlp`` pipeline.

    Returns a single string holding the ``tag_`` attribute of every token,
    separated by single spaces, in token order.
    """
    parsed = nlp(doc)
    tags = [tok.tag_ for tok in parsed]
    return " ".join(tags)

# function to obtain total number of pronouns
def get_pron_count(doc):
    """Count the pronoun tags in a whitespace-separated string of POS tags.

    Every whitespace-delimited item of ``doc`` that exactly equals one of
    the pronoun tags below adds one to the returned count.
    """
    pronoun_tags = {
        "PRP", "PRON", "PRP$", "PDAT", "PDS",
        "PIAT", "PIDAT", "PIS", "PPER", "PPOSAT",
        "PPOSS", "PRELAT", "PRELS", "PRF", "PWAT", "PWAV", "PWS", "PN",
    }
    return sum(1 for tag in doc.split() if tag in pronoun_tags)

def get_n_words(doc):
    """Return how many whitespace-separated items the text contains."""
    words = doc.split()
    return len(words)

# A standalone "i" that is not immediately followed by a literal ".e"
# (as in "i.e."). The dot must be escaped: an unescaped "." matches any
# character, which would also reject "i expect", "i ever", and so on.
_FIRST_PERSON_SG_PATTERN = re.compile(r"\bi\b(?!\.e)")


def get_first_person_sg_cnt(doc):
    """Count the occurrences of the first person singular pronoun 'I'.

    The text is lower-cased first, so both 'I' and 'i' are counted. An 'i'
    that starts the abbreviation 'i.e.' is excluded.

    Parameters
    ----------
    doc : str
        The text to search.

    Returns
    -------
    int
        Number of standalone 'i' tokens that are not part of 'i.e.'.
    """
    return len(_FIRST_PERSON_SG_PATTERN.findall(doc.lower()))

def get_percentage_of_pronouns(n_pronouns, n_first_pers_sg):
    """Express ``n_first_pers_sg`` as a percentage of ``n_pronouns``.

    ``n_pronouns`` is the denominator (the total count) and
    ``n_first_pers_sg`` the numerator; the quotient is multiplied by 100.
    """
    share = n_first_pers_sg / n_pronouns
    return share * 100

In [None]:
if __name__ == "__main__":
    # Load the manifestos (four columns), then append the derived columns
    # one after another so that they end up at positions 4 to 9.
    df = get_df_full_manifestos("/Users/lorenverreyen/desktop/Manifestos/*manifesto.txt")

    # Counts derived from the cleaned text and from its POS tags.
    df["pos_tags"] = df["clean_text"].apply(pos_tag)
    df["n_words"] = df["clean_text"].apply(get_n_words)
    df["total_n_pronouns"] = df["pos_tags"].apply(get_pron_count)
    df["n_first_pers_sg"] = df["clean_text"].apply(get_first_person_sg_cnt)

    # First person singular pronouns as a percentage of all words and of
    # all pronouns respectively.
    first_person_counts = df["n_first_pers_sg"]
    df["%_all_words"] = get_percentage_of_pronouns(df["n_words"], first_person_counts)
    df["%all_pronouns"] = get_percentage_of_pronouns(df["total_n_pronouns"], first_person_counts)

In [None]:
# Show the first rows of the dataframe built in the previous cell.
df.head()

Unnamed: 0,filenames,year,texts,clean_text,pos_tags,n_words,total_n_pronouns,n_first_pers_sg,%_all_words,%all_pronouns
0,2016_manifesto,2016,ANTI-TECH REVOLUTION:\n\nWHY AND HOW\n\n\nPREF...,ANTI-TECH REVOLUTION: WHY AND HOW PREFACE ...,JJ HYPH JJ NN : _SP WRB CC WRB _SP NN _SP EX V...,66625,2735,78,0.117073,2.85192
1,1995_manifesto,1995,The Unabomber Manifesto1\n\nIndustrial Society...,The Unabomber Manifesto1 Industrial Society a...,DT NNP NNP _SP NNP NNP CC PRP$ NN IN NNP NNP _...,33976,1769,5,0.014716,0.282646
2,2010_manifesto,2010,1. The Industrial Revolution and its consequen...,1. The Industrial Revolution and its consequen...,LS . DT NNP NNP CC PRP$ NNS VBP VBN DT NN IN D...,30735,1602,3,0.009761,0.187266


---

Glossary of the pronoun POS tags used in `get_pron_count` (taken from the spaCy glossary):

https://github.com/explosion/spaCy/blob/master/spacy/glossary.py

- "PRP": "pronoun, personal"
- "PRON": "pronoun",
-  "PRP\$": "pronoun, possessive",
- "PDAT": "attributive demonstrative pronoun",
- "PDS": "substituting demonstrative pronoun",
- "PIAT": "attributive indefinite pronoun without determiner",
- "PIDAT": "attributive indefinite pronoun with determiner",
- "PIS": "substituting indefinite pronoun",
- "PPER": "non-reflexive personal pronoun",
- "PPOSAT": "attributive possessive pronoun",
- "PPOSS": "substituting possessive pronoun",
- "PRELAT": "attributive relative pronoun",
- "PRELS": "substituting relative pronoun",
- "PRF": "reflexive personal pronoun",
- "PWAT": "attributive interrogative pronoun",
- "PWAV": "adverbial interrogative or relative pronoun",
- "PWS": "substituting interrogative pronoun",
- "PN": "pronoun",