In [None]:
!pip install -U spacy[cuda-autodetect]
!python -m spacy download en_core_web_trf

In [None]:
# Colab-only: mount Google Drive so the article files under /content/drive
# can be read and the result CSVs written back.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import json
from tqdm.auto import tqdm
import re
import spacy
from spacy.matcher import Matcher

In [None]:
# Root folder (on the mounted Drive) that every input and output path is built from.
start_file_path = "/content/drive/My Drive/articles"

In [None]:
# Load the article frame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle(
    start_file_path + '/relevant_articles/df_relevant_articles.pkl'
)

In [None]:
# Request GPU execution if one is available; this is called before the
# pipeline is loaded so the model is set up on the chosen device.
spacy.prefer_gpu()
# Transformer-based English pipeline (downloaded in the install cell).
nlp = spacy.load('en_core_web_trf')

In [None]:
# Matcher token patterns, taken from the first answer at
# https://stackoverflow.com/questions/74528441/detect-passive-or-active-sentence-from-text
# Each rule is a list of per-token attribute dicts (DEP = dependency label,
# TAG = fine-grained POS tag, OP = quantifier on that token).

# Rules labelled "Passive": nsubjpass, any number of aux tokens, auxpass, then the verb.
passive_rules = [
    [
        {"DEP": "nsubjpass"},
        {"DEP": "aux", "OP": "*"},
        {"DEP": "auxpass"},
        {"TAG": "VBN"},
    ],
    [
        {"DEP": "nsubjpass"},
        {"DEP": "aux", "OP": "*"},
        {"DEP": "auxpass"},
        {"TAG": "VBZ"},
    ],
    [
        {"DEP": "nsubjpass"},
        {"DEP": "aux", "OP": "*"},
        {"DEP": "auxpass"},
        {"TAG": "RB"},
        {"TAG": "VBN"},
    ],
]

# Rules labelled "Active": nsubj followed by a verb form, optionally with
# aux or RB tokens in between.
active_rules = [
    [
        {"DEP": "nsubj"},
        {"TAG": "VBD", "DEP": "ROOT"},
    ],
    [
        {"DEP": "nsubj"},
        {"TAG": "VBP"},
        {"TAG": "VBG", "OP": "!"},
    ],
    [
        {"DEP": "nsubj"},
        {"DEP": "aux", "OP": "*"},
        {"TAG": "VB"},
    ],
    [
        {"DEP": "nsubj"},
        {"DEP": "aux", "OP": "*"},
        {"TAG": "VBG"},
    ],
    [
        {"DEP": "nsubj"},
        {"TAG": "RB", "OP": "*"},
        {"TAG": "VBG"},
    ],
    [
        {"DEP": "nsubj"},
        {"TAG": "RB", "OP": "*"},
        {"TAG": "VBZ"},
    ],
    [
        {"DEP": "nsubj"},
        {"TAG": "RB", "OP": "+"},
        {"TAG": "VBD"},
    ],
]

In [None]:
# Build the matcher on the pipeline's own vocab: the matcher and the docs it
# is run on must share the same vocab.
matcher = Matcher(nlp.vocab)

# Register both rule sets under the labels the extraction loop looks up later.
for rule_label, rule_patterns in (('Passive', passive_rules), ('Active', active_rules)):
    matcher.add(rule_label, rule_patterns)

In [None]:
# Keep only the rows whose source domain is The Guardian.
df_guardian = df.loc[df['source_domain'].eq('theguardian.com')]

In [None]:
# Accumulators for matched sentences, one list per voice label.
passive_sents, active_sents = [], []

# Row positions (within df_guardian) of articles flagged by the length check.
skipped_indices = []

In [None]:
# Collect the row positions of Guardian articles whose text exceeds the
# character limit that applies to their position in the frame.
#
# The list is rebuilt from scratch here (rather than appended to a list
# created in another cell), so re-running this cell cannot duplicate indices.
# NOTE(review): the limits presumably mirror the cut-offs used by the earlier
# runs that produced the *_upto4007 / *_from4007 files - confirm.
LENGTH_LIMIT_DEFAULT = 20_000    # limit for rows before LATE_ROWS_START
LENGTH_LIMIT_LATE_ROWS = 17_000  # stricter limit from LATE_ROWS_START onwards
LATE_ROWS_START = 4007

skipped_indices = [
    position
    # Iterate over the column directly instead of materialising a full row
    # with .iloc on every iteration.
    for position, article_text in enumerate(df_guardian['full_text'])
    if len(article_text) > (
        LENGTH_LIMIT_LATE_ROWS if position >= LATE_ROWS_START else LENGTH_LIMIT_DEFAULT
    )
]

In [None]:
# Display how many article positions were collected in skipped_indices.
len(skipped_indices)

386

In [None]:
# Parse the over-length Guardian articles (row positions in `skipped_indices`)
# and record every sentence that matches a 'Passive' or 'Active' rule.

# The subset gets its own name instead of overwriting `df_guardian`:
# re-running this cell would otherwise index into the already-reduced frame.
df_guardian_overlength = df_guardian.iloc[skipped_indices]

# Start from empty accumulators so rows left over from a previous run of this
# cell (or another batch in the same kernel) cannot end up in the output files.
passive_sents = []
active_sents = []

for full_text in tqdm(df_guardian_overlength['full_text']):

    entire_doc = nlp(full_text)
    # Use spaCy's sentence segmentation -> split(".") doesn't work for things like U.N., U.S. etc.
    sentences = [sent.text for sent in entire_doc.sents]

    for sentence in sentences:
        doc = nlp(sentence)  # each sentence is parsed again on its own
        for match_id, start, end in matcher(doc):
            string_id = nlp.vocab.strings[match_id]
            # Store the matched text rather than the Span object: a Span keeps
            # its whole parent Doc alive, which adds up over many sentences.
            # The CSV should be unchanged, since a Span is written out as its text.
            record = {'sentence': sentence, 'span': doc[start:end].text}
            if string_id == 'Passive':
                passive_sents.append(record)
            elif string_id == 'Active':
                active_sents.append(record)

# Explicit columns keep the header in the CSV even when nothing matched, so the
# files can still be read back with pd.read_csv.
passive_df = pd.DataFrame(passive_sents, columns=['sentence', 'span'])
active_df = pd.DataFrame(active_sents, columns=['sentence', 'span'])

passive_df.to_csv(f'{start_file_path}/active_vs_passive_voice/theguardian_overlength_passive_sentences.csv', index=False)
active_df.to_csv(f'{start_file_path}/active_vs_passive_voice/theguardian_overlength_active_sentences.csv', index=False)

  0%|          | 0/386 [00:00<?, ?it/s]

In [None]:
# Common path prefix of all Guardian active/passive result files.
guardianavp_start = start_file_path + '/active_vs_passive_voice/theguardian'

In [None]:
# Per-batch result files for the passive sentences (one CSV per batch label).
p_upto4007, p_from4007, p_overlength = (
    f'{guardianavp_start}_{batch}_passive_sentences.csv'
    for batch in ('upto4007', 'from4007', 'overlength')
)
p_fps = [p_upto4007, p_from4007, p_overlength]

# Same three batches for the active sentences.
a_upto4007, a_from4007, a_overlength = (
    f'{guardianavp_start}_{batch}_active_sentences.csv'
    for batch in ('upto4007', 'from4007', 'overlength')
)
a_fps = [a_upto4007, a_from4007, a_overlength]

In [None]:
# Read the per-batch CSVs and stack them into one frame per voice.
passive_df = pd.concat(list(map(pd.read_csv, p_fps)))
active_df = pd.concat(list(map(pd.read_csv, a_fps)))

In [None]:
# Row counts of the combined frames: (passive, active).
passive_df.shape[0], active_df.shape[0]

(52468, 335085)

In [None]:
# Write the combined frames next to the per-batch files, one CSV per voice.
for voice_label, voice_frame in (('passive', passive_df), ('active', active_df)):
    voice_frame.to_csv(
        f'{start_file_path}/active_vs_passive_voice/theguardian_{voice_label}_sentences.csv',
        index=False,
    )