<a href="https://colab.research.google.com/github/chutki26/dissertation-newscrawler/blob/main/DetectingActiveVsPassiveVoice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U spacy[cuda-autodetect]
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from en-core-web-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.0-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.12.0->spacy-curated-transformers

In [None]:
# Mount Google Drive at /content/drive so the article files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import json
from tqdm.auto import tqdm
import re
import spacy
from spacy.matcher import Matcher

In [None]:
# Root folder on the mounted Drive; the input pickle and the output CSV paths are built from it.
start_file_path = '/content/drive/My Drive/articles'

In [None]:
# Load the pickled articles table (used below as a DataFrame with 'source_domain' and 'full_text' columns).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle(f'{start_file_path}/relevant_articles/df_relevant_articles.pkl')

In [None]:
# Row counts per source_domain value (presumably one row per article -- confirm against the crawler output).
df['source_domain'].value_counts()

Unnamed: 0_level_0,count
source_domain,Unnamed: 1_level_1
theguardian.com,5543
independent.co.uk,3053
hindustantimes.com,2187
news18.com,1923
apnews.com,1903
dailymail.co.uk,1697
indianexpress.com,1380
bbc.com,1317
cnn.com,1172
foxnews.com,1172


In [None]:
# Ask spaCy to use the GPU if one is available; this is requested before the pipeline is loaded.
spacy.prefer_gpu()
# Load the transformer-based English pipeline downloaded in the install cell.
nlp = spacy.load('en_core_web_trf')

In [None]:
# Matcher patterns for voice detection, taken from the first answer to
# https://stackoverflow.com/questions/74528441/detect-passive-or-active-sentence-from-text
#
# Each inner list is one pattern and each dict describes one token slot, constrained by
# its dependency label ('DEP') and/or fine-grained part-of-speech tag ('TAG'). Slots match
# consecutive tokens. 'OP' changes how often a slot may match: '*' = zero or more times,
# '+' = one or more times, '!' = the slot must NOT match.

# Passive voice: a passive subject, optional auxiliaries, a passive auxiliary, then the verb.
passive_rules = [
    # ... directly followed by a past participle (VBN)
    [{'DEP': 'nsubjpass'}, {'DEP': 'aux', 'OP': '*'}, {'DEP': 'auxpass'}, {'TAG': 'VBN'}],
    # ... directly followed by a token tagged VBZ (3rd-person singular present)
    [{'DEP': 'nsubjpass'}, {'DEP': 'aux', 'OP': '*'}, {'DEP': 'auxpass'}, {'TAG': 'VBZ'}],
    # ... with exactly one adverb (RB) between the passive auxiliary and the past participle
    [{'DEP': 'nsubjpass'}, {'DEP': 'aux', 'OP': '*'}, {'DEP': 'auxpass'}, {'TAG': 'RB'}, {'TAG': 'VBN'}],
]

# Active voice: a nominal subject (nsubj) followed by one of several verb shapes.
active_rules = [
    # subject + past-tense verb (VBD) that is the root of the parse
    [{'DEP': 'nsubj'}, {'TAG': 'VBD', 'DEP': 'ROOT'}],
    # subject + non-3rd-person present verb (VBP), where the next slot must not be a VBG token
    [{'DEP': 'nsubj'}, {'TAG': 'VBP'}, {'TAG': 'VBG', 'OP': '!'}],
    # subject + optional auxiliaries + base-form verb (VB)
    [{'DEP': 'nsubj'}, {'DEP': 'aux', 'OP': '*'}, {'TAG': 'VB'}],
    # subject + optional auxiliaries + gerund/present participle (VBG)
    [{'DEP': 'nsubj'}, {'DEP': 'aux', 'OP': '*'}, {'TAG': 'VBG'}],
    # subject + optional adverbs + gerund/present participle (VBG)
    [{'DEP': 'nsubj'}, {'TAG': 'RB', 'OP': '*'}, {'TAG': 'VBG'}],
    # subject + optional adverbs + 3rd-person singular present verb (VBZ)
    [{'DEP': 'nsubj'}, {'TAG': 'RB', 'OP': '*'}, {'TAG': 'VBZ'}],
    # subject + at least one adverb + past-tense verb (VBD)
    [{'DEP': 'nsubj'}, {'TAG': 'RB', 'OP': '+'}, {'TAG': 'VBD'}],
]



In [None]:
# Build the Matcher on the pipeline's vocab (the matcher and the docs it is run on must
# share the same vocab), then register each rule set under its voice label.
matcher = Matcher(nlp.vocab)
for voice_label, voice_rules in (('Passive', passive_rules), ('Active', active_rules)):
    matcher.add(voice_label, voice_rules)

In [None]:
# Keep only the rows of the news source currently being analysed
# (change the domain below to switch to another source).
is_selected_source = df['source_domain'] == 'india.com'
df_news = df.loc[is_selected_source]

In [None]:
# Run the voice matcher over every sentence of the selected source's articles, collect
# the hits per voice label, and write one CSV per voice.

output_prefix = 'india'  # change this to the name of the news site being analysed
output_dir = f'{start_file_path}/active_vs_passive_voice'
result_columns = ['sentence', 'span']

passive_sents = []
active_sents = []
# Route each match to its result list by the label it was registered under.
sents_by_label = {'Passive': passive_sents, 'Active': active_sents}

# Iterate over the text column directly instead of indexing row by row with .iloc.
for full_text in tqdm(df_news['full_text']):

    entire_doc = nlp(full_text)
    # Use spaCy's sentence boundaries -> split(".") doesn't work for things like U.N., U.S. etc.
    sentences = [sent.text for sent in entire_doc.sents]

    for sentence in sentences:
        # NOTE(review): every sentence is parsed a second time here, in isolation, which
        # roughly doubles the transformer cost. Matching on the sentence spans of
        # `entire_doc` would avoid that, but the parse (and so the matches) could differ,
        # so the original behaviour is kept.
        doc = nlp(sentence)  # Process text with spaCy model
        for match_id, start, end in matcher(doc):
            string_id = nlp.vocab.strings[match_id]
            if string_id in sents_by_label:
                # Store the matched text, not the Span object: a Span keeps its whole
                # Doc alive in memory, and the CSV only ever contains its text anyway.
                sents_by_label[string_id].append(
                    {'sentence': sentence, 'span': doc[start:end].text}
                )

# Explicit columns keep the CSV header present even when a voice has no matches.
passive_df = pd.DataFrame(passive_sents, columns=result_columns)
active_df = pd.DataFrame(active_sents, columns=result_columns)

passive_df.to_csv(f'{output_dir}/{output_prefix}_passive_sentences.csv', index=False)
active_df.to_csv(f'{output_dir}/{output_prefix}_active_sentences.csv', index=False)