In [None]:
# final_dataset.csv is the dataset obtained by combining the TCPD-IPD questions
# with the web-scraped questions from 2019-2024.
import pandas as pd

url = "final_dataset.csv"
df = pd.read_csv(url)

In [5]:
# TEXT PRE-PROCESSING

# Blank out missing values *before* casting to str: astype(str) on its own
# turns NaN into the literal text 'nan', which is made of letters, survives
# clean_text, and would later be concatenated into full_text as a fake word.
df['answer_text'] = df['answer_text'].fillna('').astype(str)
df['subject'] = df['subject'].fillna('').astype(str)

import re

def clean_text(text):
    """Return `text` with every character that is not an ASCII letter or
    whitespace deleted (digits and punctuation are dropped, not replaced)."""
    disallowed = re.compile(r'[^a-zA-Z\s]')
    return disallowed.sub('', text)
# Keep only letters/whitespace in each answer, then blank out every occurrence
# of 'answer' (case insensitive; also hits it inside longer words).
df['answer_text'] = (
    df['answer_text']
    .apply(clean_text)
    .str.replace('answer', ' ', case=False)
)

# List of Indian states / union territories and their capitals.
# Multi-part names are listed as separate words ('Dadra', 'Nagar Haveli',
# 'Jammu', 'Kashmir') because punctuation has already been stripped from the
# text these names are matched against.
# NOTE: every entry needs its own comma -- adjacent string literals are silently
# concatenated by Python ('Dadra' 'Nagar Haveli' == 'DadraNagar Haveli').
indian_states = [
    'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana',
    'Himachal Pradesh', 'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
    'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana',
    'Tripura', 'Uttarakhand', 'Uttar Pradesh', 'West Bengal', 'Andaman and Nicobar Islands', 'Chandigarh',
    'Dadra', 'Nagar Haveli', 'Daman', 'Diu', 'NCT', 'Jammu', 'Kashmir', 'Ladakh',
    'Lakshadweep', 'Puducherry'
]

# 'Srinagar' and 'Jammu' are separate entries: a combined 'Srinagar, Jammu'
# could never match, since commas are removed from the text by clean_text.
indian_capitals = [
    'Amaravati', 'Itanagar', 'Dispur', 'Patna', 'Raipur', 'Panaji', 'Gandhinagar', 'Chandigarh', 'Shimla',
    'Ranchi', 'Bangalore', 'Thiruvananthapuram', 'Bhopal', 'Mumbai', 'Imphal', 'Shillong', 'Aizawl', 'Kohima',
    'Bhubaneshwar', 'Chandigarh', 'Jaipur', 'Gangtok', 'Chennai', 'Hyderabad', 'Agartala', 'Dehradun', 'Lucknow',
    'Kolkata', 'Port Blair', 'Chandigarh', 'Daman', 'Delhi', 'Srinagar', 'Jammu', 'Leh', 'Kavaratti', 'Puducherry'
]

# Strip Indian state and capital names out of a text
def remove_indian_places(text):
    """Return `text` with every whole-word, case-insensitive occurrence of a
    name from `indian_states` or `indian_capitals` deleted.

    Matches are replaced by the empty string, so the whitespace that
    surrounded a removed name is left in place.
    """
    place_names = indian_states + indian_capitals
    # Escape each name so it is matched literally, then OR them together
    alternation = '|'.join(re.escape(name) for name in place_names)
    place_regex = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)
    return place_regex.sub('', text)

# Remove state/capital names from every answer
df['answer_text'] = df['answer_text'].map(remove_indian_places)


#REMOVE Minister of State line
import re
def remove_prefix(text):
    """Drop a leading "(the) minister of state ..." preamble from an answer.

    If `text` starts (ignoring leading whitespace and case) with
    "minister of state", optionally preceded by "the", everything up to and
    including the first stand-alone single letter (a letter with whitespace on
    both sides, e.g. the "a" left over from "(a)") is removed. When no such
    letter exists, the whole text is removed. Any other text is returned
    unchanged.

    Leading whitespace is tolerated because the earlier step that replaces
    'answer' with a space leaves texts that began with "ANSWER" starting with
    whitespace; anchoring strictly at position 0 would never match them.
    """
    prefix = r'^\s*(the\s+)?minister\s+of\s+state\s*'
    if re.match(prefix, text, flags=re.IGNORECASE) is None:
        return text
    # Lazily consume from the start up to the first " x " single-letter marker,
    # falling back to the end of the text when there is none.
    # NOTE(review): '.' does not match a newline here, so a preamble that spans
    # several lines is left untouched -- confirm whether that is intended.
    return re.sub(r'^.*?(\s[a-zA-Z]\s|$)', '', text)

# Drop the "minister of state" preamble, trim, delete single-character words,
# lowercase, and trim again.
df['answer_text'] = (
    df['answer_text']
    .apply(remove_prefix)
    .str.strip()                                # covers the leading-space trim as well
    .str.replace(r'\b\w\b', '', regex=True)     # remove single occurring alphabets
    .str.lower()
    .str.strip()
)


# Strip boilerplate / honorific words that carry no topical meaning
def remove_words(text):
    """Return `text` with every whole-word, case-insensitive occurrence of the
    listed boilerplate words deleted (surrounding whitespace is kept)."""
    words_to_remove = (
        'no', 'yes', 'madam', 'answer', 'answers', 'question',
        'statement', 'table', 'laid', 'reply', 'ministry',
        'sushri', 'shri', 'shrimati', 'dr',
    )
    # One alternation instead of one pass per word; only complete words can
    # match, so removing them together or one after another gives the same text.
    alternation = '|'.join(re.escape(word) for word in words_to_remove)
    return re.sub(r'\b(?:{})\b'.format(alternation), '', text, flags=re.IGNORECASE)

# Remove the boilerplate words from every answer in df['answer_text']
df['answer_text'] = df['answer_text'].map(remove_words)

In [11]:
# Concatenate subject, question and answer into one lowercased text column
combined_text = df['subject'] + ' ' + df['question_text'] + ' ' + df['answer_text']
df['full_text'] = combined_text.str.lower()

# Discard rows that have no question text, then make sure full_text is str
missing_question = df['question_text'].isnull()
df = df.drop(df[missing_question].index)
df['full_text'] = df['full_text'].astype(str)


#REMOVING STOP WORDS EXCEPT GENDERED ONES
import nltk
from nltk.corpus import stopwords

# Download the stopwords list (if not already downloaded)
nltk.download('stopwords')

# nltk.word_tokenize (used by remove_stopwords) needs the Punkt tokenizer
# models; without them it raises LookupError on a fresh environment.
# Older NLTK releases look for 'punkt', newer ones for 'punkt_tab'; an
# identifier unknown to the installed version is just reported, not raised.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource, quiet=True)

# Get the stopwords list
stop_words = set(stopwords.words('english'))

# Keep feminine pronouns in the text by taking them out of the stop-word set
words_to_keep = {"she", "her", "hers", "herself", "she's", "she'd", "she'll"}
stop_words.difference_update(words_to_keep)

# Drop stop words from a text
def remove_stopwords(text):
    """Tokenize `text` with nltk.word_tokenize, discard every token whose
    lowercase form is in `stop_words`, and return the remaining tokens joined
    by single spaces."""
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stop words from every row of 'full_text'
df['full_text'] = df['full_text'].map(remove_stopwords)

In [18]:
# WORD2VEC: BIGRAMS

from gensim.models.phrases import Phrases, Phraser

# One whitespace-split token list per document
sent = list(map(str.split, df['full_text']))

# Collect phrase statistics over the corpus, wrap them in a Phraser, and apply
# it to the corpus to get the sentences used for training below
phrases = Phrases(sent, min_count=5, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]

In [19]:
# SKIP GRAM

import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()

# Leave one core free for the rest of the system, but never request fewer than
# one worker: cpu_count() - 1 is 0 on a single-core machine, which would leave
# gensim with no training threads.
skipg = Word2Vec(min_count=20,
                     window=6,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1),
                     sg=1)  # sg=1 selects skip-gram; the CBOW cell omits it

skipg.build_vocab(sentences, progress_per=10000)

# NOTE: train() updates `skipg` in place; its return value is a summary of the
# training run, not a model. The trained vectors live in `skipg.wv`.
skipg_model = skipg.train(sentences, total_examples=skipg.corpus_count, epochs=20, report_delay=1)
#159 min

#skipg.save("path/skipg-full.model")

#Identifying most similar words to women
skipg.wv.most_similar(positive=["woman"], topn=1000)

In [24]:
# CBOW
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() # Count the number of cores in a computer

# Same hyper-parameters as the skip-gram cell; `sg` is left at its default.
# Leave one core free, but never request fewer than one worker:
# cpu_count() - 1 is 0 on a single-core machine.
word_embed = Word2Vec(min_count=20,
                     window=6,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))


word_embed.build_vocab(sentences, progress_per=10000)
# NOTE: train() updates `word_embed` in place; its return value is a summary of
# the training run, not a model. The trained vectors live in `word_embed.wv`.
word_embed_model = word_embed.train(sentences, total_examples=word_embed.corpus_count, epochs=20, report_delay=1)
#48 min

#word_embed.save("path/cbow-full.model")

# Identify the words most similar to "woman"
word_embed.wv.most_similar(positive=["woman"], topn=1000)

In [16]:
# Categorise each question as pertaining to women or not, based on the words
# identified by the word embedding models.

import pandas as pd
# Work on a copy so the is_gender column is added to df_final, not to df
df_final = df.copy()

# Define the regular expression pattern.
# The pattern is one long alternation of gender-related terms, written as
# adjacent raw-string pieces (joined by Python at compile time) so that it can
# span several source lines; a single-quoted raw string cannot contain a line
# break. Each piece except the last ends with '|'.
# Corrections relative to the earlier one-string version:
#   * '...(acid(s?))\b' and '\b.*hostess...' are now separate alternatives
#     (the '|' between them was missing);
#   * 'hostess(es)?' also matches the singular 'hostess';
#   * 'immoral(\s?)traffic' allows an optional space (was '(s?)');
#   * spelling: 'midwife' (was 'midfwife'), 'kishori' (was 'ksihori').
pattern = (
    r"female.*|girl.*|\bladies\b|\bqueen(s?)\b|\bwom(a|e)n.*|\bbride.*\b|daughter.*|"
    r"\bdivorce.*\b|\b.*mother.*\b|\bhousemaid.*\b|\bhusband(s?)\b|\bmarital.*\b|"
    r"\b.*marriage.*\b|\b.*married\b|\bmarry.*\b|\bmatrimonial.*\b|\bpregnan.*\b|"
    r"sister(s?)|\bson(s?)\b|\bspouse(s?)\b|\bwidow.*\b|\bwife\b|\bwives\b|"
    r"\babduct.*\b|\babortion(s?)\b|\bacid(s?)(\s?)attack(s?)|"
    r"\b(victim(s?))(\s?)(acid(s?))\b|\b.*hostess(es)?\b|"
    r"\b.*natal.*\b|\bbreast.*\b|\bbrothel(s?)\b|\bchild(\s?)birth(s?)\b|\bchild(\s?)care(s?)\b|"
    r"\bcontracept.*\b|\bcreche|\bcsection\b|\bdomestic(\s?)violence(s?)\b|\bdowry.*\b|"
    r"\beve((-?)|(\s?))teasing\b|\b(first(s?)|second(s?)|third(s?))(\s?)trimester(s?)\b|"
    r"\bfoetus\b|\bfoetal\b|\bfolic(\s?)acid\b|\bgang((-?)|(\s?))rape(s?)\b|\b.*gender.*\b|"
    r"\bgynae.*\b|\bifa(\s?)tablet(s?)\b|"
    r"\b(human(s?)|child|children|wom(a|e)n)(\s?)trafficked\b|\bimmoral(\s?)traffic.*\b|"
    r"\b(human(s?)|child|children|wom(a|e)n)(\s?)trafficking\b|\binfanticide(s?)\b|"
    r"\binstitutional(\s?)deliver.*\b|\bkidnap.*\b|\blactati.*\b|\bmarital.*\b|\bmatern.*\b|"
    r"\bmedical(\s?)terminat.*\b|\bmenstrua.*\b|\bmolest.*\b|\bnursing.*\b|\bobstetri.*\b|"
    r"\bprostitut.*\b|\brape.*\b|\brapist(s?)\b|\breproducti.*\b|\bsati\b|\bsex.*\b|\bstalking\b|"
    r"\bsupplementation(\s?)iron\b|\biron(\s?)supplement(s?)\b|\bsurroga.*\b|tubectomy|vasectomy|"
    r"\bvictims(\s?)acid(s?)\b|a(a?)nganwadi(s?)|\baccredited(\s?)social|\banm(s?)\b|"
    r"midwife|midwives|\barsh\b|\basha(s?).*\b|\baww(s?)\b|\bbbbp\b|"
    r"\bbirth(\s?)attendant(s?)\b|creche(s?)|\bdhan(a?)(\s?)lakshmi\b|dwcra|"
    r"\binternal(\s?)(complaint(s?))(\s?)committee(s?)\b|\bigmsy\b|\bjssk\b|\bjsy\b|\bkiran\b|"
    r"\bmatritva\b|\bmatru\b|\bmilitary(\s?)nursing(\s?)service(\s?)\b|\bmwcd\b|\bncw\b|"
    r"\bnmbs\b|\bnmew\b|\bpcma\b|\bpcpndt\b|\bppmmvy\b|\bpocso\b|\bpoorna(\s?)shakti\b|"
    r"\bpriyadarshini\b|\bpwdva\b|\bsavita\b|\bshishu(\s?)suraksha\b|\bsneh(a?)\b|\bssh\b|"
    r"\bshort(\s?)stay(\s?)home(s?)\b|\bswadhar\b|\bswashakti\b|\bswayamsiddh(a?)\b|"
    r"\bujjawala\b|\bvandana\b|\bvishaka\b|\bwwh\b|\bwomen(\s?)hostel(s?)\b|\bbeti\b|"
    r"\bdai(s?)\b|\bdevdasi(s?)\b|\bjanani.*|\bkishori|\bmahila|sakhi(s?)|\banc\b|caesarean|"
    r"\bgdm\b|gestational|\bhome(\s?)deliver.*\b|intrauterine|matrimonial|miscarriage(s?)|"
    r"postpartum|ppiucd|pwlm|sabla|thyroid|uterus|awc(s?)|\bcmb\b|laqshya|\bmcp\b|\bmpv\b|"
    r"\bsakhi\b|\bshakti\b|\bsukanya\b|\bsuman\b|\bswarnima\b|\btejaswini\b|\bwifs\b|\bwhl\b|\bwosc\b"
)

# No IGNORECASE flag: the pattern is all lowercase and is meant to be searched
# against text that has already been lowercased.
compiled_pattern = re.compile(pattern)
# Tag each row: 'g' if any gender term occurs in its full_text, otherwise 'ng'
df_final['is_gender'] = [
    'ng' if compiled_pattern.search(text) is None else 'g'
    for text in df_final['full_text']
]

#Tells question count
df_final['is_gender'].value_counts()

In [21]:
# Keep only the rows tagged 'g' and write them out for topic modelling
is_gender_row = df_final['is_gender'] == 'g'
embed_gender = df_final[is_gender_row]
embed_gender.to_csv('Gender_New.csv', index=False)