# Extracting Gendered Constructions

TODO: Look at Laura Klein's notebook (NB) for reference.

In [30]:
import spacy, warnings; warnings.simplefilter('ignore')
import csv
import re

In [16]:
# Load spaCy's small English pipeline; `nlp` is called on the cleaned text below.
nlp = spacy.load('en_core_web_sm')

In [52]:
fname = '/scratch/group/pract-txt-mine/sbuongiorno/hansard_decades/hansard_1800.csv'

# Number of data rows kept for this exploratory run.
sample_size = 20

with open(fname, newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no error if the file is empty)
    # zip() stops as soon as range() is exhausted, so only the first
    # `sample_size` rows are read instead of loading the whole file.
    # Each row (a list of fields) is stored as its str() form; the brackets,
    # quotes and commas this introduces are stripped by the cleaning step.
    data = [str(row) for _, row in zip(range(sample_size), reader)]

In [57]:
# Regexes whose matches are deleted from every row string, applied in order.
# The upper-case rule has to run before the text is lower-cased.
#
# NOTE: the original pipeline also had r'\d{1, 3}' ("remove 1-3 digits").
# Python's `re` does not accept a space inside {m,n}, so that pattern was
# read as a digit followed by the literal text "{1, 3}" and never matched.
# The last pattern below already removes every token that contains a digit,
# so the broken step is dropped rather than repaired; output is unchanged.
cleaning_patterns = [
    r'\b[A-Z]+(?:\s+[A-Z]+)*\b',  # runs of all-upper-case words (so names)
    r'\\\\n|\\\\t|\'s',           # escaped line breaks, escaped tab breaks, and possessive "'s"
    r'[^\w\s]|_',                 # punctuation and underscores
    r'\w*\d\w*',                  # character strings that contain a digit
]

for pattern in cleaning_patterns:
    data = [re.sub(pattern, '', row) for row in data]

data = [row.lower() for row in data]

# Merge all rows into one string so it can be parsed as a single document.
data = ' '.join(data)

In [61]:
# Run the spaCy pipeline over the cleaned, joined text.
doc = nlp(data)

In [64]:
def extractTokens(spacy_doc_object):
    """Return one tuple per token of a parsed document.

    Args:
        spacy_doc_object: an iterable of tokens exposing ``text``,
            ``lemma_``, ``pos_``, ``dep_`` and ``head.text``.

    Returns:
        A list of ``(text, lemma, pos, dep, head_text)`` tuples, in token order.
    """
    return [
        (tok.text, tok.lemma_, tok.pos_, tok.dep_, tok.head.text)
        for tok in spacy_doc_object
    ]

# Token table for the parsed text. NOTE(review): the `reddit_` prefix looks
# like a leftover from another notebook; the data loaded here is a Hansard CSV.
reddit_tokens = extractTokens(doc)


In [65]:
# Preview the first ten (text, lemma, pos, dep, head text) rows.
reddit_tokens[:10]

[(' ', ' ', 'SPACE', '', 'moved'),
 ('moved', 'move', 'VERB', 'ROOT', 'moved'),
 ('that', 'that', 'SCONJ', 'mark', 'appointed'),
 ('lord', 'lord', 'PROPN', 'compound', 'walsingham'),
 ('walsingham', 'walsingham', 'PROPN', 'nsubjpass', 'appointed'),
 ('be', 'be', 'AUX', 'auxpass', 'appointed'),
 ('appointed', 'appoint', 'VERB', 'ccomp', 'moved'),
 ('chairman', 'chairman', 'NOUN', 'oprd', 'appointed'),
 ('of', 'of', 'ADP', 'prep', 'chairman'),
 ('the', 'the', 'DET', 'det', 'committee')]

In [62]:
# Print the lemma of every token, one per line, to inspect the lemmatizer output.
for word in doc:
    print(word.lemma_)

 
move
that
lord
walsingham
be
appoint
chairman
of
the
committee
of
privilege
for
the
present
session
 
the
subject
in
-PRON-
contemplation
be
such
as
not
only
that
house
but
many
thousand
of
-PRON-
majesty
subject
must
regard
with
the
utmost
concern
 
-PRON-
agree
with
the
noble
earl
as
to
the
great
importance
of
the
subject
-PRON-
be
one
therefore
which
naturally
attract
the
serious
attention
of
-PRON-
majesty
government
-PRON-
lordship
however
would
be
at
the
same
time
aware
of
the
complexity
and
intricate
nature
of
the
general
subject
the
variety
of
detail
which
-PRON-
embrace
and
the
correspondent
difficulty
of
form
adequate
regulation
 
-PRON-
be
convince
for
-PRON-
own
part
that
the
contrary
be
the
fact
and
though
die
abuse
have
not
be
flagrant
or
excessive
that
the
bank
have
yield
in
a
certain
degree
to
the
temptation
and
extend
the
quantity
of
-PRON-
note
beyond
the
proper
limit
 
at
present
-PRON-
be
  
the
period
of
-PRON-
present
bill
be
somewhat
different
from
that
of
the


In [None]:
def extractTokens(spacy_doc_object):
    """Return one tuple per token across a collection of parsed documents.

    Args:
        spacy_doc_object: an iterable of documents, each itself an iterable
            of tokens exposing ``text``, ``lemma_``, ``pos_``, ``dep_`` and
            ``head.text``.

    Returns:
        A flat list of ``(text, lemma, pos, dep, head_text)`` tuples, in
        document order and then token order.
    """
    return [
        (tok.text, tok.lemma_, tok.pos_, tok.dep_, tok.head.text)
        for parsed in spacy_doc_object
        for tok in parsed
    ]

# The previous argument, `reddit_sample_he_she_text`, is never defined in this
# notebook. This version of extractTokens iterates documents and then tokens,
# so the single parsed `doc` is wrapped in a list.
reddit_tokens = extractTokens([doc])

In [None]:
from spacy.symbols import nsubj, nsubjpass, VERB

def extractPairs(spacy_doc_object):
    """Collect "<subject> <verb lemma>" strings from parsed documents.

    A token contributes a pair when it is a nominal subject (active
    ``nsubj`` or passive ``nsubjpass``) AND its syntactic head is tagged
    as a verb.

    Args:
        spacy_doc_object: an iterable of documents, each an iterable of
            tokens exposing ``dep_``, ``text`` and ``head`` (with ``pos_``
            and ``lemma_``).

    Returns:
        A list of strings of the form ``"<subject text> <head lemma>"``,
        in the order the subjects occur.
    """
    subject_deps = ('nsubj', 'nsubjpass')
    pairs = []
    for parsed in spacy_doc_object:
        for subject in parsed:
            # The original condition `dep == nsubj or dep == nsubjpass and
            # head.pos == VERB` parsed as `nsubj or (nsubjpass and VERB)`,
            # so active subjects skipped the verb check. The membership
            # test applies the verb requirement to both kinds of subject.
            # String labels (dep_/pos_) are used, as in extractTokens.
            if subject.dep_ in subject_deps and subject.head.pos_ == 'VERB':
                pairs.append(' '.join((subject.text, subject.head.lemma_)))
    return pairs

# The previous argument, `reddit_sample_he_she_text`, is never defined in this
# notebook. extractPairs iterates documents and then tokens, so the single
# parsed `doc` is wrapped in a list.
pairs = extractPairs([doc])

In [None]:
# Lower-case every "<subject> <verb>" string so pronoun matching is case-insensitive.
lowercase_pairs = [pair.lower() for pair in pairs]

In [None]:
import re

def extract_he(pairs):
    """Return the pairs whose subject is the pronoun "he".

    Args:
        pairs: an iterable of lower-cased "<subject> <verb>" strings.

    Returns:
        A list of the strings that start with ``"he "``, in input order.
        The trailing space keeps words such as "her" or "here" from matching.
    """
    # Filter the argument itself; the previous body ignored `pairs` and
    # read the global `lowercase_pairs` instead.
    return [pair for pair in pairs if pair.startswith('he ')]

# The function defined above is `extract_he`; `extractMalePairs` does not exist.
male = extract_he(lowercase_pairs)

In [None]:
# Words that disqualify a "<subject> <verb>" pair: generic English function
# words plus parliamentary boilerplate (titles, months, procedural terms and
# common reporting verbs). Some entries are repeated; that is harmless for
# membership tests.
# Fixes: 'name' and 'neither' were previously fused into the single entry
# 'name,neither' by a misplaced comma (implicit string concatenation), and
# 'becames' was a typo for 'became'.
stopwords = ['right', 'hon', 'general', 'mr', 'shall', 'majesty', 'exchequer', 'address', 'motion', 'bill',
             'earl', 'friend', 'chancellor', 'sense', 'object', 'suppose', 'amidst', 'noble', 'lord', 'agree',
             'speech', 'kind', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september',
             'october', 'november', 'december', 'order', 'held', 'hold', 'assembly', '£', 'humble', 'about', 'above',
             'across', 'address', 'after', 'afterwards', 'again', 'against', 'agree', 'all', 'almost', 'alone', 'along',
             'already', 'also', 'although', 'always', 'am', 'amidst', 'among', 'amongst', 'amount', 'an', 'and', 'another',
             'any', 'anyhow', 'anything', 'anyway', 'anywhere', 'april', 'are', 'around', 'as', 'at', 'back', 'be', 'was',
             'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
             'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'do',
             'could', 'did', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'either', 'else', 'elsewhere',
             'empty', 'enough', 'even', 'ever', 'everyone', 'everything', 'everywhere', 'except', 'few', 'first', 'for',
             'from', 'front', 'full', 'further', 'general', 'get', 'give', 'go', 'had', 'has', 'have', 'held', 'hence',
             'here', 'hereby', 'herein', 'hold', 'however', 'if', 'in', 'indeed', 'into', 'is', 'it', 'its', 'itself',
             'just', 'keep', 'kind', 'last', 'latter', 'least', 'less', 'made', 'make', 'many', 'me', 'meanwhile', 'might',
             'mine', 'more', 'most', 'mostly', 'move', 'much', 'must', 'name', 'neither', 'never', 'nevertheless', 'next',
             'no', 'noble', 'nobody', 'none', 'nor', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'one', 'once',
             'only', 'onto', 'or', 'other', 'others', 'otherwise', 'out', 'over', 'own', 'part', 'perhaps', 'please', 'put',
             'quite', 'rather', 'really', 'regarding', 'same', 'say', 'see', 'seem', 'seemed', 'seeming', 'seems', 'sense',
             'several', 'shall', 'should', 'show', 'side', 'since', 'so', 'some', 'someone', 'something', 'sometime',
             'still', 'such', 'suppose', 'take', 'than', 'that', 'the', 'then', 'there', 'thereby', 'therefore', 'these',
             'they', 'this', 'those', 'though', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'under',
             'unless', 'until', 'up', 'upon', 'us', 'used', 'using', 'various', 'very', 'was', 'we', 'well', 'were', 'what',
             'whatever', 'when', 'when', 'whereby', 'whether', 'which', 'while', 'who', 'why', 'will', 'with', 'within',
             'without', 'would', 'yet', 'bright', 'mr.', 'hansard', 'lancashire', '[]', '£1', '000', 'russell', 'committee',
             'reading', 'learned', 'deal', 'time', 'royal', 'gentlemen', 'gentleman', 'year', 'years', 'affairs', 'affair',
             'academy', 'sir', 'thought', 'took', 'bring', 'brings', 'brought', 'forward', 'great', 'good', 'department',
             'treasury', 'second', 'take', 'taken', 'privy', 'member', 'robert', 'large', 'session', 'secretary', 'notice',
             'moment', 'think', 'believe', 'hope', 'ask', 'hear', 'beg', 'support', 'state', 'find', 'admit', 'wish',
             'refer', 'reply', 'know', 'feel', 'propose', 'understand', 'let', 'allow', 'like', 'receive', 'consider',
             'begin', 'tell', 'like', 'send', 'ought', 'come', 'intend', 'add', 'want', 'stand', 'suggest', 'remind',
             'use', 'mean', 'suggest', 's']


In [None]:
def removeStopWords(pairs, stop_words=None):
    """Drop stop words from each pair and keep only results with two words.

    Each string is split on single spaces, words found in the stop list
    are removed, and the string is kept (re-joined with a space) only if
    exactly two words remain.

    Args:
        pairs: an iterable of space-separated strings.
        stop_words: optional iterable of words to remove. Defaults to the
            module-level ``stopwords`` list.

    Returns:
        A list of the surviving two-word strings, in input order.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build the lookup set once: testing membership in a list of several
    # hundred words for every token is needlessly slow.
    blocked = set(stop_words)

    keep_pairs = []
    for pair in pairs:
        kept = [word for word in pair.split(" ") if word not in blocked]
        if len(kept) == 2:
            keep_pairs.append(" ".join(kept))
    return keep_pairs

# Keep only the "he <verb>" pairs in which neither word is a stop word.
male_no_stopwords = removeStopWords(male)

In [None]:
def countWords(clean_data):
    """Count how many times each item occurs.

    Args:
        clean_data: an iterable of hashable items (here, pair strings).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in order of first appearance.
    """
    # Imported locally so this cell does not depend on a notebook-level import.
    from collections import Counter

    # Converted back to a plain dict so callers get the same type as before.
    return dict(Counter(clean_data))
            
# Frequency table: "he <verb>" pair -> number of occurrences.
male_dictionary = countWords(male_no_stopwords)

In [None]:
# Print the first 30 entries of the frequency table as "<pair> <count>".
# Entries come out in the dictionary's own iteration order, not sorted by count.
for pair, count in list(male_dictionary.items())[:30]:
    print(pair, count)

TODO: Next, add the pandas code.