In [1]:
# Sample lexicons (ideally load from your male_verb.csv etc.)
# Each set holds lower-case keywords; the detection code lower-cases tokens
# before testing membership.

# Adjectives flagged when the subject is a female / male character.
female_adjs = {
    "beautiful",
    "lovely",
    "cute",
    "obedient",
}
male_adjs = {
    "brave",
    "strong",
    "dominant",
    "bold",
}

# Role/occupation nouns flagged as female-coded / male-coded.
female_roles = {
    "nurse",
    "maid",
    "secretary",
    "housewife",
    "daughter",
}
male_roles = {
    "doctor",
    "police",
    "pilot",
    "businessman",
    "boss",
}


In [2]:
import spacy
import pandas as pd

In [3]:
# Load spaCy's small English pipeline once; the detection functions call `nlp(text)`.
nlp = spacy.load("en_core_web_sm")

In [4]:
def get_gender_from_cast(name, cast_df):
    """Return the lower-cased gender recorded for a character, or None.

    Args:
        name: Character name to look up. Surrounding whitespace and letter
            case are ignored. Non-string or blank names yield None.
        cast_df: DataFrame with a 'character' column and a 'gender' column.

    Returns:
        The gender string (stripped, lower-cased) of the first matching row
        that has a usable gender value, or None when the character is not
        listed or no matching row carries a gender.
    """
    if not isinstance(name, str):
        return None
    key = name.strip().lower()
    if not key:
        return None

    # The nullable "string" dtype keeps missing names as <NA> instead of
    # failing on .str or turning them into the literal text "nan".
    characters = cast_df['character'].astype("string").str.strip().str.lower()
    is_match = (characters == key).fillna(False)

    for value in cast_df.loc[is_match, 'gender']:
        # A missing gender (NaN/None) has no .lower(); skip it instead of crashing.
        if isinstance(value, str) and value.strip():
            return value.strip().lower()
    return None


In [5]:
def detect_script_stereotypes_advanced(script_path, cast_df):
    """Scan a script for stereotyped adjectives and gendered occupation nouns.

    The script is parsed with the module-level spaCy pipeline `nlp`. For each
    token three checks run:

    * adjectives whose governing word has a subject (nsubj/nsubjpass) to its
      left are compared with `female_adjs` / `male_adjs` according to the
      subject's gender from `get_gender_from_cast`;
    * nouns found in `female_roles` / `male_roles` are reported as
      "Gendered Occupation";
    * tokens attached as `attr` to an auxiliary are checked like adjectives,
      using the auxiliary's `nsubj`.

    Args:
        script_path: Path of a UTF-8 text file containing the script.
        cast_df: Cast table passed through to `get_gender_from_cast`.

    Returns:
        DataFrame with columns sentence, character, gender, keyword, bias_type
        (one row per finding, in document order).
    """
    # Context manager so the handle is closed even if reading fails.
    with open(script_path, 'r', encoding='utf-8') as script_file:
        text = script_file.read()
    doc = nlp(text)
    results = []

    # The cast lookup scans the whole table; remember answers per subject text
    # so repeated names are resolved only once per script.
    gender_cache = {}

    def gender_of(subject_text):
        if subject_text not in gender_cache:
            gender_cache[subject_text] = get_gender_from_cast(subject_text, cast_df)
        return gender_cache[subject_text]

    def subject_adjective_row(sent_text, token, subject_deps):
        """Build the result row for a stereotyped descriptor, or return None."""
        subjects = [w for w in token.head.lefts if w.dep_ in subject_deps]
        if not subjects:
            return None
        subj_text = subjects[0].text
        gender = gender_of(subj_text)
        keyword = token.text.lower()
        if gender == "female" and keyword in female_adjs:
            return (sent_text, subj_text, gender, token.text, "Appearance bias")
        if gender == "male" and keyword in male_adjs:
            return (sent_text, subj_text, gender, token.text, "Achievement bias")
        return None

    for sent in doc.sents:
        sent_text = sent.text.strip()

        for token in sent:
            # Check for adjective stereotype
            adjective_row = None
            if token.pos_ == "ADJ":
                adjective_row = subject_adjective_row(sent_text, token, ("nsubj", "nsubjpass"))
                if adjective_row is not None:
                    results.append(adjective_row)

            # Check for role/occupation stereotype
            if token.pos_ == "NOUN":
                role = token.text.lower()
                if role in female_roles:
                    results.append((sent_text, token.text, "female", token.text, "Gendered Occupation"))
                elif role in male_roles:
                    results.append((sent_text, token.text, "male", token.text, "Gendered Occupation"))

            # Check for copular (is/was) patterns like "She is beautiful", "He is strong"
            if token.dep_ == "attr" and token.head.pos_ == "AUX":
                copular_row = subject_adjective_row(sent_text, token, ("nsubj",))
                # An ADJ token attached as `attr` satisfies both checks; keep
                # the finding once instead of emitting an identical second row.
                if copular_row is not None and copular_row != adjective_row:
                    results.append(copular_row)

    return pd.DataFrame(results, columns=["sentence", "character", "gender", "keyword", "bias_type"])


In [9]:
# Single-movie run: load the cast table, scan the Haider script, and write
# the flagged rows to a CSV (index column omitted).
cast_df = pd.read_csv("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\wikipedia-data\\characters.csv")  # or character.csv
bias_df = detect_script_stereotypes_advanced("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\scripts-text\\Haider.txt", cast_df)
bias_df.to_csv("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\scripts-csv\\haider_biases.csv", index=False)


In [10]:
import os
import pandas as pd


In [11]:
# Reload the cast table and normalise the character names in place
# (whitespace trimmed, lower-cased) so lookups can compare lower-case keys.
cast_df = pd.read_csv("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\wikipedia-data\\characters.csv")
cast_df['character'] = cast_df['character'].str.strip().str.lower()

In [12]:
# Folder that receives one "<movie>_bias.csv" per script; created if missing.
output_dir = "C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\bias_reports"
os.makedirs(output_dir, exist_ok=True)

In [15]:
# Batch run: scan every .txt script in the folder and save one bias CSV per movie.
script_folder = "C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\scripts-text"
# os.listdir() returns entries in arbitrary order; sorting keeps the run
# (and the progress log) deterministic.
for file in sorted(os.listdir(script_folder)):
    if file.endswith(".txt"):
        script_path = os.path.join(script_folder, file)
        # splitext removes only the trailing extension, whereas
        # str.replace(".txt", "") would also delete ".txt" inside the name.
        movie_name = os.path.splitext(file)[0]

        # Detect bias
        bias_df = detect_script_stereotypes_advanced(script_path, cast_df)

        # Save results
        output_path = os.path.join(output_dir, f"{movie_name}_bias.csv")
        bias_df.to_csv(output_path, index=False)
        print(f"[✓] Bias extracted for: {movie_name}")

[✓] Bias extracted for: Haider
[✓] Bias extracted for: Highway
[✓] Bias extracted for: JabWeMet
[✓] Bias extracted for: Kaminey
[✓] Bias extracted for: Maqbool
[✓] Bias extracted for: Masaan
[✓] Bias extracted for: Neerja
[✓] Bias extracted for: Nil Battey Sannata
[✓] Bias extracted for: Pink
[✓] Bias extracted for: Queen
[✓] Bias extracted for: Raman Raghav 2_0
[✓] Bias extracted for: Rang De Basanti Script - Film Companion-min
[✓] Bias extracted for: Rockstar


In [16]:
def generate_bias_summary(bias_df, movie_name):
    """Condense one movie's bias findings into a single-row DataFrame.

    Args:
        bias_df: Findings table with 'bias_type', 'gender' and 'character' columns.
        movie_name: Label stored in the 'Movie' column.

    Returns:
        One-row DataFrame with the movie name, the number of findings, a dict
        of counts per bias type, and the number of distinct characters among
        the rows whose gender is 'male' and 'female' respectively.
    """
    from collections import Counter

    def distinct_characters(gender_label):
        # Unique 'character' values among rows tagged with this gender.
        of_gender = bias_df['gender'] == gender_label
        return bias_df.loc[of_gender, 'character'].nunique()

    record = {
        'Movie': movie_name,
        'Total Biased Lines': len(bias_df),
        'Bias Types': dict(Counter(bias_df['bias_type'])),
        'Male Characters Affected': distinct_characters('male'),
        'Female Characters Affected': distinct_characters('female'),
    }
    return pd.DataFrame([record])


In [17]:

# Collect one single-row summary per "<movie>_bias.csv" found in output_dir.
report_rows = []

# os.listdir() order is arbitrary; sorting keeps the report's row order stable.
for file in sorted(os.listdir(output_dir)):
    if file.endswith("_bias.csv"):
        bias_df = pd.read_csv(os.path.join(output_dir, file))
        # Slice off only the trailing suffix; str.replace would also remove
        # "_bias.csv" if it appeared earlier in the file name.
        movie = file[:-len("_bias.csv")]
        summary = generate_bias_summary(bias_df, movie)
        report_rows.append(summary)



In [19]:
# Final Bias Feedback Report comparing all movies
# Stack the per-movie summary frames into one table (fresh 0..n-1 index)
# and save it as the cross-movie report.
final_report = pd.concat(report_rows, ignore_index=True)
final_report.to_csv("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\Bias_Feedback_Report.csv", index=False)
print("[✓] Bias Feedback Report Generated!")


[✓] Bias Feedback Report Generated!


In [20]:
def suggest_rewrite(sentence, bias_type):
    """Suggest a rule-based rewrite of `sentence` for the given bias label.

    The first marker contained in `bias_type` selects a list of literal,
    case-sensitive substitutions that are applied in order. When no marker
    matches, the sentence is returned prefixed with "Consider revising: ".

    Args:
        sentence: The flagged sentence.
        bias_type: Bias label; matched by substring.

    Returns:
        The rewritten (or prefixed) sentence.
    """
    substitutions_by_marker = (
        ("Appearance bias", (("beautiful", "confident"), ("cute", "smart"))),
        ("Passive", (("She looks", "She asserts"),)),
        ("Action-oriented", (("He leads", "They collaborate"),)),
    )

    for marker, substitutions in substitutions_by_marker:
        if marker in bias_type:
            rewritten = sentence
            for old, new in substitutions:
                rewritten = rewritten.replace(old, new)
            return rewritten

    return "Consider revising: " + sentence



In [21]:
# Add a rewrite suggestion for every flagged row (row-wise apply).
# NOTE(review): `bias_df` is whatever frame was bound last — going by the
# execution counts, presumably the final per-movie CSV read in the summary
# loop; confirm this is the intended input.
bias_df['suggested_rewrite'] = bias_df.apply(lambda row: suggest_rewrite(row['sentence'], row['bias_type']), axis=1)


In [22]:
# Preview the first rows, including the new suggested_rewrite column.
bias_df.head()


Unnamed: 0,sentence,character,gender,keyword,bias_type,suggested_rewrite
0,Jaat \n \nRoyal family type… \nMayank \nAgar d...,boss,male,boss,Gendered Occupation,Consider revising: Jaat \n \nRoyal family type...
1,"Police mein report kara de… Haan… \nMeanwhile,...",Police,male,Police,Gendered Occupation,Consider revising: Police mein report kara de…...
2,Janardan \nJo sachcha pyaar hota hai na… shidd...,Police,male,Police,Gendered Occupation,Consider revising: Janardan \nJo sachcha pyaar...
3,She is standing with her boss Mohit in the new...,boss,male,boss,Gendered Occupation,Consider revising: She is standing with her bo...
4,The pilot track plays too loud in Jordan’s cans.,pilot,male,pilot,Gendered Occupation,Consider revising: The pilot track plays too l...


In [33]:
import os
import pandas as pd
import spacy
from openai import OpenAI

In [34]:
# spaCy pipeline used to parse the scripts.
nlp = spacy.load("en_core_web_sm")

# Never hard-code API keys in a notebook: they leak through version control
# and shared outputs. Read the key from the OPENAI_API_KEY environment
# variable instead; a missing variable fails immediately with a KeyError.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [36]:
# Load the cast table and normalise character names (trimmed, lower-cased).
cast_df = pd.read_csv("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\wikipedia-data\\characters.csv")
cast_df['character'] = cast_df['character'].str.strip().str.lower()
# character name -> gender lookup. Gender values are stored exactly as they
# appear in the CSV; if a name occurs more than once, the last row wins.
character_gender_map = dict(zip(cast_df['character'], cast_df['gender']))

In [37]:
def load_lexicon(path):
    """Read a header-less CSV and return its first column as a set of words.

    Every entry is stripped of surrounding whitespace and lower-cased before
    being added to the set, so duplicates that differ only in case collapse.

    Args:
        path: Location of the CSV file; it is read without a header row.

    Returns:
        Set of the normalised first-column values.
    """
    table = pd.read_csv(path, header=None)
    first_column = table[0]
    normalised = first_column.str.strip().str.lower()
    return set(normalised)

# Load the gendered adjective and verb lexicons from CSV.
# `female_adjs` / `male_adjs` are rebound here, replacing the small sample
# sets defined at the top of the notebook.
female_adjs = load_lexicon("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\wikipedia-data\\cleaned_female_adjectives.csv")
male_adjs = load_lexicon("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\wikipedia-data\\cleaned_male_adjectives.csv")
female_verbs = load_lexicon("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\wikipedia-data\\cleaned_female_verb.csv")
male_verbs = load_lexicon("C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\wikipedia-data\\cleaned_male_verb.csv")


In [38]:
def get_gender(name):
    """Look up a character's gender in `character_gender_map`.

    Args:
        name: Character name; surrounding whitespace and case are ignored.
            Non-string values yield None.

    Returns:
        The mapped gender, stripped and lower-cased so it can be compared
        with "male"/"female", or None when the name is unknown or the stored
        gender is missing (e.g. NaN from the CSV).
    """
    if not isinstance(name, str):
        return None
    gender = character_gender_map.get(name.strip().lower())
    # The map holds raw CSV values: normalise text, treat NaN/None as unknown.
    if not isinstance(gender, str):
        return None
    return gender.strip().lower()

In [39]:
def suggest_rewrite(sentence, bias_type):
    """Ask the chat model for a neutral rewrite of a biased sentence.

    Builds a screenwriting-consultant prompt naming `bias_type` and quoting
    `sentence`, sends it as a single user message through the module-level
    `client`, and returns the stripped text of the first choice.

    Best-effort: if the request or response handling raises, the error is
    printed and the original sentence is returned unchanged.

    Args:
        sentence: The line flagged as biased.
        bias_type: Label of the detected stereotype, embedded in the prompt.

    Returns:
        The model's rewrite, or `sentence` itself on any failure.
    """
    instructions = f"""You're a screenwriting consultant. The following line contains a stereotype: {bias_type}.
Please rewrite the line to be neutral and bias-free while preserving its meaning.

Biased Sentence: "{sentence}"
Rewrite:"""

    request = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": instructions}],
        "temperature": 0.7,
        "max_tokens": 200,
    }

    try:
        completion = client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
    except Exception as err:
        print("Rewrite Error:", err)
        return sentence

In [47]:
def _subject_gender(token):
    """Return the lower-cased gender of the character `token` describes, or None.

    Verbs take their own nsubj/nsubjpass children as subjects. Other tokens
    (adjectives) take the subject to the left of their head, e.g. the "X" in
    "X is <adj>", and when attached as `amod` the noun they modify. The first
    candidate with a textual gender in `get_gender` wins.
    """
    subject_deps = ("nsubj", "nsubjpass")
    if token.pos_ == "VERB":
        candidates = [w for w in token.children if w.dep_ in subject_deps]
    else:
        candidates = [w for w in token.head.lefts if w.dep_ in subject_deps]
        if token.dep_ == "amod":
            candidates.append(token.head)

    for candidate in candidates:
        gender = get_gender(candidate.text)
        # Normalise here so the rules work even if the map holds raw CSV values.
        if isinstance(gender, str):
            return gender.strip().lower()
    return None


def detect_and_rewrite_script(script_path):
    """Detect gender-stereotyped sentences in a script and rewrite them.

    The script is parsed with `nlp`. In each sentence the first adjective or
    verb that is listed in the lexicon matching the gender of the character
    it describes triggers one `suggest_rewrite` call; later tokens of that
    sentence are not examined.

    The gender is resolved from the token's grammatical subject (see
    `_subject_gender`). Looking up the adjective/verb itself in the cast map,
    as was done previously, only matched when the describing word happened
    to be a character name.

    Args:
        script_path: Path of a UTF-8 text file containing the script.

    Returns:
        Tuple `(new_script, summary_df)`: the list of sentences with biased
        ones replaced by their rewrite, and a DataFrame with columns
        original_sentence, token, gender, bias_type, rewrite.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(script_path, 'r', encoding='utf-8') as script_file:
        text = script_file.read()
    doc = nlp(text)

    # (part of speech, subject gender) -> (lexicon, bias label). Built per
    # call so lexicons reloaded after this definition are picked up.
    rules = {
        ("ADJ", "female"): (female_adjs, "Appearance bias (female)"),
        ("ADJ", "male"): (male_adjs, "Power bias (male)"),
        ("VERB", "female"): (female_verbs, "Passive role (female)"),
        ("VERB", "male"): (male_verbs, "Action role (male)"),
    }

    new_script = []
    summary_data = []

    for sent in doc.sents:
        sentence_text = sent.text.strip()
        rewrite_text = sentence_text

        for token in sent:
            if token.pos_ not in ("ADJ", "VERB"):
                continue

            gender = _subject_gender(token)
            rule = rules.get((token.pos_, gender))
            if rule is None:
                continue

            lexicon, bias_type = rule
            if token.text.lower() in lexicon:
                rewrite_text = suggest_rewrite(sentence_text, bias_type)
                summary_data.append((sentence_text, token.text, gender, bias_type, rewrite_text))
                # One rewrite per sentence: stop at the first finding.
                break

        new_script.append(rewrite_text)

    return new_script, pd.DataFrame(summary_data, columns=["original_sentence", "token", "gender", "bias_type", "rewrite"])


In [50]:
def process_all_scripts(
    scripts_dir="C:\\Users\\prash\\docu3c\\Bollywood-Data-master\\scripts-text",
    rewrites_dir="rewrites",
    summaries_dir="summaries",
):
    """Rewrite every .txt script in a folder and save the results.

    For each script, `detect_and_rewrite_script` is called; the rewritten
    sentences are written newline-joined to
    `<rewrites_dir>/biasless_<movie>.txt` and the findings table to
    `<summaries_dir>/<movie>_bias_summary.csv`.

    Args:
        scripts_dir: Folder containing the script text files.
        rewrites_dir: Output folder for rewritten scripts (created if missing).
        summaries_dir: Output folder for per-movie summaries (created if missing).

    The defaults reproduce the original hard-coded locations, so calling the
    function without arguments behaves as before.
    """
    os.makedirs(rewrites_dir, exist_ok=True)
    os.makedirs(summaries_dir, exist_ok=True)

    # os.listdir() order is arbitrary; sort for a reproducible run and log.
    for script_file in sorted(os.listdir(scripts_dir)):
        if not script_file.endswith(".txt"):
            continue

        # splitext removes only the trailing extension, whereas
        # str.replace(".txt", "") would also delete ".txt" inside the name.
        movie_name = os.path.splitext(script_file)[0]
        print(f"Processing {movie_name}...")

        script_path = os.path.join(scripts_dir, script_file)
        new_script, summary_df = detect_and_rewrite_script(script_path)

        # Save outputs
        rewrite_path = os.path.join(rewrites_dir, f"biasless_{movie_name}.txt")
        with open(rewrite_path, 'w', encoding='utf-8') as f_out:
            f_out.write("\n".join(new_script))

        summary_path = os.path.join(summaries_dir, f"{movie_name}_bias_summary.csv")
        summary_df.to_csv(summary_path, index=False)

    print("✅ All scripts processed and rewritten!")


In [51]:
# Run the full detect-and-rewrite pipeline over every script in the scripts folder.
process_all_scripts()

Processing Haider...
Processing Highway...
Processing JabWeMet...
Processing Kaminey...
Processing Maqbool...
Processing Masaan...
Processing Neerja...
Processing Nil Battey Sannata...
Processing Pink...
Processing Queen...
Processing Raman Raghav 2_0...
Processing Rang De Basanti Script - Film Companion-min...
Processing Rockstar...
✅ All scripts processed and rewritten!
