In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter

import nltk

import warnings
warnings.filterwarnings("ignore")

import string
import nltk
from nltk.corpus import stopwords

# Ensure the required NLTK resources are downloaded
nltk.download('stopwords')
  

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mansisaxena/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
DV_posts = pd.read_csv('domesticviolence_posts.csv') 

In [3]:
DV_posts.head()

Unnamed: 0,id,timestamp,title,text,speaker,reply_to,conversation_id,meta.score,meta.num_comments,meta.top_level_comment,meta.retrieved_on,meta.gilded,meta.gildings,meta.subreddit,meta.stickied,meta.permalink,meta.author_flair_text
0,o0pbq,1325565642,"My coworker is in an abusive relationship, and...","My coworker is in an abusive relationship, and...",DVsKat,,o0pbq,3,7,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/o0pbq/my_coworker...,
1,ocpar,1326308957,Why can't I bring myself to leave?,"Sorry if I ramble, I just feel Pike there is s...",dearlydistressedmmm,,ocpar,7,12,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/ocpar/why_cant_i_...,
2,onm6a,1326998843,My sister is in an abusive relationship and I ...,So I have been living with my sis for almost t...,[deleted],,onm6a,1,0,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/onm6a/my_sister_i...,
3,or4tz,1327209584,Four years ago my gay best friend beat me up. ...,I was 22; I am now 26. I'm female. My gay best...,[deleted],,or4tz,11,3,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/or4tz/four_years_...,
4,ot7hk,1327345546,Emotional abuse and blame,Currently in the middle on an intense situatio...,confusedorabused,,ot7hk,3,6,,-1,-1,,domesticviolence,False,/r/domesticviolence/comments/ot7hk/emotional_a...,


## Preprocessing

In [4]:
def preprocess_text(text):

    # Step 1: Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Step 2: Convert to lowercase
    text = text.lower()
    
    # Step 3: Remove numbers and nearby 'M', 'F', 'm', 'f', 'male', or 'female' with up to 2 additional characters
    text = re.sub(r'(male|female|[mf])?\d+(male|female|[mf])?', '', text, flags=re.IGNORECASE)
    
    # Step 4: Replace double spaces with single spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [5]:
DV_posts['processed_title'] = DV_posts['title'].apply(preprocess_text)
DV_posts['processed_text'] = DV_posts['text'].apply(preprocess_text)


In [6]:
# relevant_df['processed_text'].to_list()

## First heuristic - titles with 1st person personal pronoun

In [7]:
# Use regular expression to match whole words only

def contains_firstperson_personalpronouns(text):
    return bool(re.search(r'\b(me|mine|myself|im|ive)\b', text))

In [8]:
# Iterate through each row in df['preprocess_title']

relevant_indices1 = []

for index, row in DV_posts.iterrows():
    title = row['processed_title']
    if contains_firstperson_personalpronouns(title):
        relevant_indices1.append(index)
        


In [9]:
print(len(relevant_indices1))

relevant_df = DV_posts.loc[relevant_indices1].reset_index(drop=True)

439


In [10]:
# irrelevant_df = DV_posts.loc[~DV_posts.index.isin(relevant_indices1)]
# irrelevant_df.to_csv('ProcessedDownloadedData/domesticviolence/irrelevant.csv', index=False) 

## Second heuristic

In [11]:
from nltk.stem import WordNetLemmatizer

# Function to identify words that occur after the word "my" and convert them to their base form
def identify_personal_possessive_phrases(text):

    # Find matches for 'my <word>'
    matches = re.findall(r'\bmy\s+(\w+)\b', text, flags=re.IGNORECASE)
    
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Lemmatize each match
    lemmatized_matches = [lemmatizer.lemmatize(match.lower()) for match in matches]
    
    return lemmatized_matches

DV_posts['PPPr_title'] = DV_posts['processed_title'].apply(identify_personal_possessive_phrases)

In [12]:
DV_posts['PPPr_title']

0           [coworker]
1                   []
2             [sister]
3        [gay, friend]
4                   []
             ...      
2334                []
2335          [abuser]
2336    [relationship]
2337                []
2338                []
Name: PPPr_title, Length: 2339, dtype: object

In [13]:
flattened_list = [item for sublist in DV_posts['PPPr_title'].to_list() for item in sublist]

In [14]:
word_counter = Counter(flattened_list)
freq = sorted(word_counter.items(), key=lambda x: x[1], reverse=True)

string = ""

for word, _ in freq:
    string = string + " " + word

In [15]:
matches = """ boyfriend abusive husband abuser ex mother mom father story girlfriend dad life gf family bf head partner 
wife mum daughter son chest fiancé therapist motherinlaw violent sanity phone abuse relationship name 
grandfather hair verbally younger fault car assault current exgf breaking stepfather exhusbands place
emotionally support situation bofriend physically yr physical option doubt mentally childhood exboyfriendabuser exlover 
job instinct abused weirdly birthday grandmother real victim young narcissistic dog stepdad 
last future room landlord pastpresent limit spouse split fear year violently 
mind face fathersame sick exgirlfriend bedroom tie case kid sleep office permission thing once
trust neck exhusband dream campus throw psych child work past miserable house domestic self nook 
binge trauma confidence man narcassist ptsd fiance open """


relevant_matches = matches.split()

In [1]:
# relevant_matches

In [17]:
relevant_indices2 = []
irrelevant_indices2 = []

# Iterate through each row in df['Personal_Possessive']
for index, row in DV_posts.iterrows():
    PPPr_row = row['PPPr_title']
    if not PPPr_row:  # If there are no matches, add the row to irrelevant
        irrelevant_indices2.append(index)
        continue
    
    for word in PPPr_row:
        if word in relevant_matches:
            relevant_indices2.append(index)
        else:
            irrelevant_indices2.append(index)   


In [18]:
len(relevant_indices2), len(irrelevant_indices2)

(497, 1927)

## Combining the relevant indices and saving to DataFrames

In [19]:
# Convert lists to sets
set1 = set(relevant_indices1)
set2 = set(relevant_indices2)

# Merge sets and convert back to a list
merged_relevant_indices = list(set1.union(set2))

In [20]:
len(merged_relevant_indices)

710

In [21]:
# Create new dataframes
relevant_df = DV_posts.loc[merged_relevant_indices].reset_index(drop=True)
irrelevant_df = DV_posts.loc[~DV_posts.index.isin(merged_relevant_indices)]

In [22]:
relevant_df.to_csv('domesticviolence_relevant.csv', index=False) 
irrelevant_df.to_csv('domesticviolence_irrelevant.csv', index=False) 


In [23]:
len(relevant_df), len(irrelevant_df)

(710, 1629)