In [4]:
import fasttext
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
import numpy as np
from gensim.models.fasttext import load_facebook_model
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from difflib import SequenceMatcher
import shutil

marker = "###################################"

# Combined language-specific configurations
languages = {
    'en': {
        'full_name': 'English', 
        'model_path': 'models/cc.en.300.bin',
        'exclude_words': ["man", "woman", "Phil", "marv", "ole", "owld", "utd"],
        # "particuler", "legendry" -- archaic spellings
        'genders': ['man', 'woman'],
        'determiners': ['the'],
        'personhood_word':'person',
        'depersonalized_genders':['masculinity', 'femininity'],
    },
    'es': {
        'full_name': 'Spanish', 
        'model_path': 'models/cc.es.300.bin',
        'exclude_words': ["hombre", "mujer"],
        'genders': ['hombre', 'mujer'],
        'determiners': ['el', 'la'],
        'personhood_word':'persona',
        'depersonalized_genders':['masculinidad', 'femininidad'],
    },
    'de': {
        'full_name': 'German', 
        'model_path': 'models/cc.de.300.bin',
        'exclude_words': ["Mann", "Frau", "mfG", "ein"],
        'genders': ['Mann', 'Frau'],
        'determiners': ['der', 'die', 'das'],
        'personhood_word':'Individuum',
        'depersonalized_genders':['Männlichkeit', 'Weiblichkeit'],
    }
}

columns = {
    'masculine':'masculine_score',
    'feminine':'feminine_score',
}

# Auto-generating parquet_paths based on languages
parquet_paths = {lang: f"adjectives/{lang}_adjectives.parquet" for lang in languages.keys()}

targets = ['masculine', 'feminine']

nouns_df = pd.read_csv('nouns.csv')

def load_model(language, method="normal"):
    """
    Loads model for the specified language to memory.
    params:
        language: (string) two letter language code
        method: (string) 'facebook' or 'normal'
    """
    print(f'Loading model for {language} with method: {method}')
    model_path = languages[language]['model_path']
    print(model_path)
    if method == 'normal': 
        model = fasttext.load_model(model_path)
    else:
        model = load_facebook_model(model_path)
    print(f'Finished loading model for {language} with method: {method}')
    return model

models = {lang: load_model(lang, method='normal') for lang in languages.keys()}

Loading model for es with method: normal
models/cc.es.300.bin
Finished loading model for es with method: normal
Loading model for de with method: normal
models/cc.de.300.bin
Finished loading model for de with method: normal




In [96]:
# Base functions
def cossim(vec1, vec2):
    """Return cosine similarity between vec1 and vec2"""
    dot_product = sum(a*b for a, b in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum([val**2 for val in vec1]))
    magnitude2 = math.sqrt(sum([val**2 for val in vec2]))
    return dot_product / (magnitude1 * magnitude2)

def euclid(vec1, vec2):
    vec1, vec2 = np.array(vec1), np.array(vec2)
    return np.linalg.norm(vec1 - vec2)

def get(model, word):
    """Return word embedding for word as in model"""
    return model.get_word_vector(word)

def load_dataframe(file_path):
    # Check the file extension
    _, file_extension = os.path.splitext(file_path)

    # Load the file into a DataFrame based on its extension
    if file_extension == '.csv':
        df = pd.read_csv(file_path)
    elif file_extension == '.parquet':
        df = pd.read_parquet(file_path)
    else:
        raise ValueError("Unsupported file format")

    return df

def fetch_html_content(url):
    """Fetch HTML content from a given URL."""
    response = requests.get(url)
    response.raise_for_status()
    return response.text

# Wiktionary crawling
def parse_adjectives(soup):
    """Parse and extract adjectives from BeautifulSoup object."""
    mw_category_groups = soup.find_all(class_="mw-category-group")
    adjectives = []
    for group in mw_category_groups:
        li_tags = group.find_all('li')
        for li in li_tags:
            adjective = li.get_text()
            if not any(char.isdigit() for char in adjective) and " " not in adjective and "-" not in adjective and "+" not in adjective and "&" not in adjective and "'" not in adjective and "." not in adjective and "(" not in adjective:
                adjectives.append(adjective)
    return adjectives

def find_next_page_url(soup):
    """Find the URL of the next page."""
    next_page_link = soup.find("a", string="next page")
    return 'https://en.wiktionary.org' + next_page_link.get('href') if next_page_link else None

def extract_adjectives(language, url, max_pages=None):
    """Extract adjectives from Wiktionary for a given language."""
    all_adjectives = []
    page_count = 0
    while url and (max_pages is None or page_count < max_pages):
        html_content = fetch_html_content(url)
        soup = BeautifulSoup(html_content, 'html.parser')
        adjectives = parse_adjectives(soup)
        all_adjectives.extend(adjectives)
        url = find_next_page_url(soup)
        page_count += 1
    return all_adjectives

def save_adjectives_to_parquet(adjectives, language_code, file_path):
    """Save adjectives to a parquet file."""
    df = pd.DataFrame(adjectives, columns=['Adjective'])
    df['Language'] = language_code
    df.to_parquet(file_path, index=False)

def parse_nouns(soup):
    """Parse and extract nouns from BeautifulSoup object."""
    mw_category_groups = soup.find_all(class_="mw-category-group")
    nouns = []
    for group in mw_category_groups:
        li_tags = group.find_all('li')
        for li in li_tags:
            noun = li.get_text()
            if not any(char.isdigit() for char in noun) and " " not in noun and "-" not in noun and "+" not in noun and "&" not in noun and "'" not in noun and "." not in noun and "(" not in noun:
                nouns.append(noun)
    return nouns

def extract_nouns(language, url, max_pages=None):
    """Extract nouns from Wiktionary for a given language."""
    all_nouns = []
    page_count = 0
    while url and (max_pages is None or page_count < max_pages):
        html_content = fetch_html_content(url)
        soup = BeautifulSoup(html_content, 'html.parser')
        nouns = parse_nouns(soup)  # Need to implement parse_nouns similar to parse_adjectives
        all_nouns.extend(nouns)
        url = find_next_page_url(soup)
        page_count += 1
    return all_nouns

def save_nouns_to_parquet(nouns, language_code, file_path):
    """Save nouns to a parquet file with language and grammatical gender."""
    df = pd.DataFrame(nouns, columns=['Noun', 'Gender'])
    df['Language'] = language_code
    df.to_parquet(file_path, index=False)

# Populate adjective list with gender similarity data
def calculate_adjective_similarities(language_code, method=cossim):
    """Calculate gender-related similarities for adjectives in a given language."""
    # Load the Parquet file into a DataFrame
    parquet_file_path = parquet_paths[language_code]
    df = pd.read_parquet(parquet_file_path)

    # Load the word embedding model
    language_data = languages[language_code]
    model = models[language_code]

    # Initialize columns for similarities
    df['masculine_similarity'] = 0.0
    df['feminine_similarity'] = 0.0
    df['exclusive_masculine_similarity'] = 0.0
    df['exclusive_feminine_similarity'] = 0.0
    df['depersonalized_masculine_similarity'] = 0.0
    df['depersonalized_feminine_similarity'] = 0.0

    # Get target word embeddings
    masculine_target = get(model, language_data['genders'][0])
    feminine_target = get(model, language_data['genders'][1])
    neuter_target = get(model, language_data['personhood_word'])

    # Calculate cosine similarities
    for index, row in df.iterrows():
        word_vec = get(model, row['Adjective'])

        # Regular similarities
        df.at[index, 'masculine_similarity'] = method(word_vec, masculine_target)
        df.at[index, 'feminine_similarity'] = method(word_vec, feminine_target)
        df.at[index, 'neuter_similarity'] = method(word_vec, neuter_target)
        df.at[index, 'depersonalized_masculine_similarity'] = method(word_vec, get(models[language_code], language_data['depersonalized_genders'][1]))
        df.at[index, 'depersonalized_feminine_similarity'] = method(word_vec, get(models[language_code], language_data['depersonalized_genders'][0]))

    # Save the updated DataFrame back to Parquet
    df.to_parquet(parquet_file_path)

def select_top_words(language_code, method, num_rows=1000, semantic_differential_vectors='gender1-gender2'):
    # Load the Parquet file into a DataFrame
    parquet_file_path = parquet_paths[language_code]
    df = pd.read_parquet(parquet_file_path)
    model = models[language_code]

    # Access language-specific data
    language_data = languages[language_code]

    # debug lines vvvv
    # method = 'cosine_similarity'
    # semantic_differential_vectors = 'gender-Gender'
    # ^^^^ debug lines
    print('############### TOPWORDS CALLED #######################################################$$$$$$$$$$$$$$$$$$')

    # Implement the semantic differential method
    if method == 'semantic_differential':
        print('############### SEMANTIC DIFFERNTIAL #######################################################$$$$$$$$$$$$$$$$$$')
        if semantic_differential_vectors == 'gender1-gender2':
            print('############### GENDER1-GENDER2 #######################################################$$$$$$$$$$$$$$$$$$')
            df['masculine_score'] = df['masculine_similarity'] - df['feminine_similarity']
            df['feminine_score'] = df['feminine_similarity'] - df['masculine_similarity']
        elif semantic_differential_vectors == 'gender-person':
            print('############### GENDER-PERSON #######################################################$$$$$$$$$$$$$$$$$$')
            df['masculine_score'] = df['masculine_similarity'] - df['neuter_similarity']
            df['feminine_score'] = df['feminine_similarity'] - df['neuter_similarity']
        elif semantic_differential_vectors == 'gender-Gender':
            print('############### GENDER-GENDER #######################################################$$$$$$$$$$$$$$$$$$')
            df['masculine_score'] = df['masculine_similarity'] - df['depersonalized_feminine_similarity']
            df['feminine_score'] = df['feminine_similarity'] - df['depersonalized_masculine_similarity']
    if method == 'cosine_similarity':
        print('############### COSINE SIMILARITY #######################################################$$$$$$$$$$$$$$$$$$')
        df['masculine_score'] = df['masculine_similarity']
        df['feminine_score'] = df['feminine_similarity']

    # Filter out excluded words
    exclude_list = language_data['exclude_words']
    df = df[~df['Adjective'].isin(exclude_list)]

    # Initialize empty DataFrames for selected words
    selected_masculine = pd.DataFrame(columns=df.columns)
    selected_feminine = pd.DataFrame(columns=df.columns)

    while len(selected_masculine) < num_rows or len(selected_feminine) < num_rows:
        if len(selected_masculine) < num_rows:
            top_masculine = df.sort_values(by=columns['masculine'], ascending=False).head(num_rows * 2)
            top_masculine = top_masculine[~top_masculine['Adjective'].isin(selected_feminine['Adjective'])].head(num_rows - len(selected_masculine))
            selected_masculine = pd.concat([selected_masculine, top_masculine])

        if len(selected_feminine) < num_rows:
            top_feminine = df.sort_values(by=columns['feminine'], ascending=False).head(num_rows * 2)
            top_feminine = top_feminine[~top_feminine['Adjective'].isin(selected_masculine['Adjective'])].head(num_rows - len(selected_feminine))
            selected_feminine = pd.concat([selected_feminine, top_feminine])

    # Save the selected words to Parquet files
    masculine_file_path = f'adjectives/{language_code}_masculine_adjectives.parquet'
    feminine_file_path = f'adjectives/{language_code}_feminine_adjectives.parquet'
    selected_masculine.to_parquet(masculine_file_path, index=False)
    selected_feminine.to_parquet(feminine_file_path, index=False)

    return selected_masculine, selected_feminine

def duplicate_spanish_adjectives(df, association):
    """
    Adds an 'Alternate Form' column to the DataFrame of Spanish adjectives, 
    with the adjective from the 'Adjective' column having the opposite gender ending.
    """

    # df['Alternate Form'] = df['Adjective']
    print(f"Added 'Alternate Form' to Spanish {association} adjectives parquet file")

    for index, row in df.iterrows():
        adjective = row['Adjective']
        if adjective.endswith('o'):  # masculine to feminine
            df.at[index, 'Alternate Form'] = adjective[:-1] + 'a'
            print(f"Added alternate form {adjective[:-1] + 'a'} for adjective: {adjective}")
        elif adjective.endswith('a'):  # feminine to masculine
            df.at[index, 'Alternate Form'] = adjective[:-1] + 'o'
            print(f"Added alternate form {adjective[:-1] + 'o'} for adjective: {adjective}")

    masculine_file_path = f'adjectives/es_masculine_adjectives.parquet'
    feminine_file_path = f'adjectives/es_feminine_adjectives.parquet'  
    df.to_parquet(masculine_file_path if association == 'masculine' else feminine_file_path, index=False)

    return df

def calculate_similarity(model, words, target_words, ref_group_label, language, ref_association, target_group):
    """
    Calculates the cosine similarity between word vectors and target vectors.
    """
    results = []
    for word in words:
        word_vec = get(model, word)
        for target_word in target_words:
            target_vec = get(model, target_word)
            similarity = cossim(word_vec, target_vec)
            results.append({
                'LANGUAGE': language,
                'REFERENCE GROUP': ref_group_label,
                'REFERENCE ASSOCIATION': ref_association,
                'REFERENCE WORD': word,
                'TARGET GROUP': target_group,
                'TARGET WORD': target_word,
                'COSINE SIMILARITY': similarity
            })
    return pd.DataFrame(results)

def create_control_test_dataframe(lang_code, nouns_df, model):
    print(f"Creating control test dataframe for language: {lang_code}")
    control_test_data = []

    # Process nouns
    nouns_lang_df = nouns_df[nouns_df['LANGUAGE'] == lang_code]
    for noun_gender in nouns_lang_df['ASSOCIATION/GRAMMATICAL GENDER'].unique():
        print(f"Processing nouns for gender: {noun_gender}")
        nouns = nouns_lang_df[nouns_lang_df['ASSOCIATION/GRAMMATICAL GENDER'] == noun_gender]['WORD'].tolist()
        for target_group in ['genders', 'determiners']:
            print(f"Calculating similarities for nouns with {target_group}")
            target_words = languages[lang_code][target_group]
            for target_word in target_words:
                data = calculate_similarity(model, nouns, [target_word], 'nouns', lang_code, noun_gender, target_group)
                control_test_data.append(data)

    # Process adjectives for each gender association
    for adj_association in targets:
        print(f"Processing {adj_association} adjectives")
        adjectives_df = pd.read_parquet(f'adjectives/{lang_code}_{adj_association}_adjectives.parquet')
        adjectives = adjectives_df['Adjective'].tolist()
        for target_group in ['genders', 'determiners']:
            print(f"Calculating similarities for {adj_association} adjectives with {target_group}")
            target_words = languages[lang_code][target_group]
            for target_word in target_words:
                data = calculate_similarity(model, adjectives, [target_word], 'adjectives', lang_code, adj_association, target_group)
                control_test_data.append(data)

            if lang_code == 'es':
                alternate_forms = adjectives_df['Alternate Form'].dropna().tolist()
                for target_word in target_words:
                    for alternate_form in alternate_forms:
                        if alternate_form:  # Check if alternate form is not None
                            data = calculate_similarity(model, [alternate_form], [target_word], 'adjectives', lang_code, adj_association, target_group)
                            control_test_data.append(data)
    combined_data = pd.concat(control_test_data, ignore_index=True)
    print("Columns in control_data DataFrame:", combined_data.columns.tolist())
    print(f"Control test dataframe created for language: {lang_code}, Rows: {combined_data.shape[0]}")
    print(f"Control test dataframe created for language: {lang_code}, Rows: {combined_data.shape[0]}")
    return combined_data

def create_experimental_test_dataframe(lang_code, nouns_df, model, use_groupby=False):
    print(f"Creating experimental test dataframe for language: {lang_code}")
    experimental_test_data = []

    nouns_lang_df = nouns_df[nouns_df['LANGUAGE'] == lang_code]

    for adj_association in ['masculine', 'feminine']:
        adjectives_df = pd.read_parquet(f'adjectives/{lang_code}_{adj_association}_adjectives.parquet')
        adjectives = adjectives_df['Adjective'].tolist()

        if lang_code == 'es':
            alternate_forms = adjectives_df['Alternate Form'].dropna().tolist()

            for noun_gender in nouns_lang_df['ASSOCIATION/GRAMMATICAL GENDER'].unique():
                nouns = nouns_lang_df[nouns_lang_df['ASSOCIATION/GRAMMATICAL GENDER'] == noun_gender]['WORD'].tolist()
                for noun in nouns:
                    for alternate_form in alternate_forms:  # Include alternate forms in the loop
                        if alternate_form:  # Check if alternate form is not None
                            similarity = cossim(model.get_word_vector(noun), model.get_word_vector(alternate_form))
                            experimental_test_data.append({
                                'LANGUAGE': lang_code,
                                'GRAMMATICAL GENDER OF NOUN': noun_gender,
                                'NOUN': noun,
                                'ADJECTIVE': alternate_form,
                                'COSINE SIMILARITY': similarity,
                                'GENDER ASSOCIATION OF ADJECTIVE': adj_association
                            })

        for noun_gender in nouns_lang_df['ASSOCIATION/GRAMMATICAL GENDER'].unique():
            nouns = nouns_lang_df[nouns_lang_df['ASSOCIATION/GRAMMATICAL GENDER'] == noun_gender]['WORD'].tolist()
            for noun in nouns:
                for adjective in adjectives:
                    similarity = cossim(model.get_word_vector(noun), model.get_word_vector(adjective))
                    experimental_test_data.append({
                        'LANGUAGE': lang_code,
                        'GRAMMATICAL GENDER OF NOUN': noun_gender,
                        'NOUN': noun,
                        'ADJECTIVE': adjective,
                        'COSINE SIMILARITY': similarity,
                        'GENDER ASSOCIATION OF ADJECTIVE': adj_association
                    })

    combined_data = pd.DataFrame(experimental_test_data)
    
    if use_groupby:
        combined_data = combined_data.groupby(['LANGUAGE', 'GRAMMATICAL GENDER OF NOUN', 'NOUN', 'GENDER ASSOCIATION OF ADJECTIVE'])['COSINE SIMILARITY'].mean().reset_index()

    print(f"Experimental test dataframe created for language: {lang_code}, Rows: {combined_data.shape[0]}")
    return combined_data

def plot_and_save(df, title, filename, plot_type, ref_group, font_size):
    print(f"Plotting: {title}")
    if df.empty:
        print(f"No data to plot for {title}")
        return
    
    mpl.rcParams['font.size'] = font_size  # Adjust this value as needed
    plt.figure(figsize=(10, 6))
    order = None
    hue_order = None

    try:
        if ref_group == 'nouns':
            # For control tests with nouns as the reference group
            x_col = 'REFERENCE ASSOCIATION'  # Column in DataFrame
            hue_col = 'TARGET WORD'
            x_label = 'Grammatical Gender of Noun'  # Label for x-axis
            legend_title = 'Target Word'
            if 'English' not in title:  
                order = targets
        elif ref_group == 'adjectives':
            # For control tests with adjectives as the reference group
            x_col = 'REFERENCE ASSOCIATION'
            hue_col = 'TARGET WORD'
            x_label = 'Gender Association of Adjective'
            legend_title = 'Target Word'
            order = targets
        else:
            # For experimental tests
            x_col = 'GRAMMATICAL GENDER OF NOUN'
            hue_col = 'GENDER ASSOCIATION OF ADJECTIVE'
            x_label = 'Grammatical Gender of Noun'
            legend_title = 'Gender Association of Adjective'
            if 'English' not in title:  
                order = targets
            hue_order = targets 

        if plot_type == 'box':
            sns.boxplot(x=x_col, y='COSINE SIMILARITY', hue=hue_col, data=df, order=order, hue_order=hue_order)
        elif plot_type == 'strip':
            sns.stripplot(x=x_col, y='COSINE SIMILARITY', hue=hue_col, data=df, dodge=True, order=order, hue_order=hue_order)

        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Cosine Similarity')
        plt.legend(title=legend_title)
        plt.tight_layout()
        plt.savefig(filename)
        print(f"Plot saved: {filename}")
    except Exception as e:
        print(f"{marker}Error in plotting: {e}")
    finally:
        plt.close()

def plot_adjective_targets_nouns(lang_code, control_data, nouns_df, model):
    if lang_code == 'en':
        return

    def generate_plot(data_dict, plot_title, file_name):
        plot_data = pd.DataFrame(data_dict)
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
        plt.title(plot_title)
        plt.xlabel('Similarity to Gender Word')
        plt.ylabel('Avg Similarity to Gender Nouns')
        plt.legend(title='Adjective Association')
        plt.savefig(file_name)
        plt.close()

    def process_adjectives(adj_association, gender_word):
        data_dict = {'Adjective': [], 'Association': [], 'Similarity to Gender Word': [], 'Avg Similarity to Gender Nouns': []}
        adjective_data = control_data[(control_data['REFERENCE GROUP'] == 'adjectives') & (control_data['REFERENCE ASSOCIATION'] == adj_association) & (control_data['TARGET GROUP'] == 'genders')]

        for adj in adjective_data['REFERENCE WORD'].unique():
            sim_to_gender_word = cossim(model.get_word_vector(adj), model.get_word_vector(gender_word))
            avg_similarity = np.mean([cossim(model.get_word_vector(adj), model.get_word_vector(noun)) for noun in gender_nouns])

            data_dict['Adjective'].append(adj)
            data_dict['Association'].append(adj_association)
            data_dict['Similarity to Gender Word'].append(sim_to_gender_word)
            data_dict['Avg Similarity to Gender Nouns'].append(avg_similarity)
        
        return data_dict

    for target_gender in ['masculine', 'feminine']:
        for gender_word_vec in ['man', 'woman']:
            gender_word = languages[lang_code]['genders'][0] if gender_word_vec == 'man' else languages[lang_code]['genders'][1]
            gender_nouns = nouns_df[(nouns_df['LANGUAGE'] == lang_code) & (nouns_df['ASSOCIATION/GRAMMATICAL GENDER'] == target_gender)]['WORD'].tolist()

            # Regular plots
            for adj_association in ['masculine', 'feminine']:
                data_dict = process_adjectives(adj_association, gender_word)
                plot_title = f"{lang_code} Adjectives: {gender_word.capitalize()} vs {target_gender.capitalize()} Nouns"
                file_name = f"plots/scatters/{lang_code}_{gender_word_vec}_{target_gender}_adjectives_vs_nouns.png"
                generate_plot(data_dict, plot_title, file_name)

            # Additional plots for Spanish with averaged forms
            if lang_code == 'es':
                avg_data_dict = {'Adjective': [], 'Association': [], 'Similarity to Gender Word': [], 'Avg Similarity to Gender Nouns': []}
                adjs_processed = set()

                for adj_association in ['masculine', 'feminine']:
                    data_dict = process_adjectives(adj_association, gender_word)

                    for adj, sim, avg_sim in zip(data_dict['Adjective'], data_dict['Similarity to Gender Word'], data_dict['Avg Similarity to Gender Nouns']):
                        if adj.endswith('o') or adj.endswith('a'):
                            base_adj = adj[:-1]
                            alt_form = base_adj + ('o' if adj.endswith('a') else 'a')

                            if alt_form in data_dict['Adjective'] and adj not in adjs_processed:
                                alt_sim = data_dict['Similarity to Gender Word'][data_dict['Adjective'].index(alt_form)]
                                alt_avg_sim = data_dict['Avg Similarity to Gender Nouns'][data_dict['Adjective'].index(alt_form)]

                                avg_data_dict['Adjective'].append(base_adj)
                                avg_data_dict['Association'].append(adj_association)
                                avg_data_dict['Similarity to Gender Word'].append(np.mean([sim, alt_sim]))
                                avg_data_dict['Avg Similarity to Gender Nouns'].append(np.mean([avg_sim, alt_avg_sim]))

                                adjs_processed.add(adj)
                                adjs_processed.add(alt_form)
                        elif adj not in adjs_processed:
                            avg_data_dict['Adjective'].append(adj)
                            avg_data_dict['Association'].append(adj_association)
                            avg_data_dict['Similarity to Gender Word'].append(sim)
                            avg_data_dict['Avg Similarity to Gender Nouns'].append(avg_sim)

                            adjs_processed.add(adj)

                plot_title = f"{lang_code} Adjectives (Avg'd Forms): {gender_word.capitalize()} vs {target_gender.capitalize()} Nouns"
                file_name = f"plots/scatters/{lang_code}_{gender_word_vec}_{target_gender}_adjectives_avgd_forms_vs_nouns.png"
                generate_plot(avg_data_dict, plot_title, file_name)


def write_statistics_to_report(df, title, report_filename):
    print(f"Writing statistics to report: {report_filename}")
    with open(report_filename, 'w') as report_file:
        if df.empty:
            report_file.write(f"No data available for {title}\n\n")
            return

        if 'REFERENCE GROUP' in df.columns:
            group_cols = ['LANGUAGE', 'REFERENCE GROUP', 'TARGET GROUP']
        elif 'GRAMMATICAL GENDER OF NOUN' in df.columns:
            group_cols = ['LANGUAGE', 'GRAMMATICAL GENDER OF NOUN', 'GENDER ASSOCIATION OF ADJECTIVE']
        else:
            report_file.write("Unexpected DataFrame structure.\n")
            return

        stats = df.groupby(group_cols)['COSINE SIMILARITY'].describe()
        report_file.write(f"Statistics for {title}:\n{stats}\n\n")
    print(f"Report written: {report_filename}")

def find_adjective_definition(adjective):
    print(f'Finding definition for word: {adjective}')
    # Adjust URL for English definitions regardless of the adjective's language
    url = f"https://en.wiktionary.org/wiki/{adjective}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Attempt to find the definition section for adjectives
        definition_section = soup.find('span', {'class': 'mw-headline'}, text='Adjective')
        if definition_section:
            definition_list = definition_section.find_next('ol')
            if definition_list:
                first_item = definition_list.find('li')
                if first_item:
                    definition = first_item.get_text(separator=' ', strip=True).split('.')[0]
                    return definition

    except requests.HTTPError as e:
        print(f"Error retrieving page for {adjective}: {e}")
    except Exception as e:
        print(f"Error processing {adjective}: {e}")

    return "Definition not found"

def remove_adjective_duplicates(lang_code, columns):
    # Load CSV files for masculine and feminine adjectives
    masculine_csv = f'adjectives/{lang_code}_masculine_adjectives.csv'
    feminine_csv = f'adjectives/{lang_code}_feminine_adjectives.csv'
    df_masculine = pd.read_csv(masculine_csv)
    df_feminine = pd.read_csv(feminine_csv)

    # Additional handling for Spanish adjectives
    if lang_code == 'es':
        for adj in df_masculine['Adjective']:
            if adj.endswith('o'):
                adj_root = adj[:-1]
                # Check if feminine form exists
                feminine_form = adj_root + 'a'
                if feminine_form in df_feminine['Adjective'].values:
                    # Compare and remove based on score
                    masculine_score = df_masculine.loc[df_masculine['Adjective'] == adj, columns['masculine']].values[0]
                    feminine_score = df_feminine.loc[df_feminine['Adjective'] == feminine_form, columns['feminine']].values[0]
                    if masculine_score > feminine_score:
                        df_feminine = df_feminine[df_feminine['Adjective'] != feminine_form]
                    elif feminine_score > masculine_score:
                        df_masculine = df_masculine[df_masculine['Adjective'] != adj]

    # Continue with the original functionality for non-Spanish languages or non-gender inflected adjectives
    common_adjectives = set(df_masculine['Adjective']).intersection(df_feminine['Adjective'])
    for adj in common_adjectives:
        masculine_score = df_masculine.loc[df_masculine['Adjective'] == adj, columns['masculine']].values[0]
        feminine_score = df_feminine.loc[df_feminine['Adjective'] == adj, columns['feminine']].values[0]
        if masculine_score > feminine_score:
            df_feminine = df_feminine[df_feminine['Adjective'] != adj]
        elif feminine_score > masculine_score:
            df_masculine = df_masculine[df_masculine['Adjective'] != adj]

    # Save the updated CSV files
    df_masculine.to_csv(masculine_csv, index=False)
    df_feminine.to_csv(feminine_csv, index=False)
    print(f"Updated CSV files for {lang_code}: removed duplicates with lower scores.")

def create_adjective_review_csv(parquet_file):
    file_name = os.path.basename(parquet_file)
    language_code, gender = file_name.split('_')[:2]
    print(f'Creating review sheet for language: {language_code}')

    df = pd.read_parquet(parquet_file)
    df['Definition'] = df['Adjective'].apply(lambda x: find_adjective_definition(x))

    # Determine which similarity scores to include based on gender
    similarity_score_col = f"{gender.lower()}_similarity"
    score_col = f"{gender.lower()}_score"

    reviews_dir = 'adjectives'
    df_sorted = df.sort_values(columns[gender], ascending=False)

    csv_file_path = os.path.join(reviews_dir, file_name.replace('.parquet', '.csv'))
    df_sorted.to_csv(csv_file_path, index=False)
    print(f"File saved as {csv_file_path}")

def find_minimum_length(languages, genders, unallowed_words):
    min_length = float('inf')
    for lang_code in languages:
        for gender in genders:
            parquet_file = f'adjectives/{lang_code}_{gender}_adjectives.parquet'
            df = pd.read_parquet(parquet_file)
            df = df[~df['Adjective'].isin(unallowed_words)]  # Remove unallowed words
            min_length = min(min_length, len(df))
    return min_length

def remove_unwanted_adjectives(csv_file, allowed_words, unallowed_words, markers, min_length, gender):
    # Load data from CSV file
    csv_df = pd.read_csv(csv_file)

    # Extract language code from filename
    lang_code = csv_file[:2]

    # Convert 'Adjective' column to string type for consistency
    csv_df['Adjective'] = csv_df['Adjective'].astype(str)

    # Preprocess unallowed_words for Spanish adjectives
    if lang_code == 'es':
        processed_unallowed = set()
        for word in unallowed_words:
            if word.endswith('o') or word.endswith('a'):
                processed_unallowed.add(word[:-1])
            else:
                processed_unallowed.add(word)
        unallowed_words = processed_unallowed

    # Filter: Remove unallowed words
    csv_df = csv_df[~csv_df['Adjective'].apply(lambda x: x[:-1] if (x.endswith('o') or x.endswith('a')) else x).isin(unallowed_words)]

    # Filter: Remove words with unwanted markers, unless they are in allowed words
    csv_df = csv_df[(~csv_df['Definition'].apply(lambda x: any(marker in x for marker in markers)) | csv_df['Adjective'].isin(allowed_words))]

    # Sort and Trim: Keep only the top n rows based on score
    score_col = columns[gender]
    csv_df = csv_df.sort_values(by=score_col, ascending=False).head(min_length)

    # Save the updated DataFrame back to CSV and Parquet
    csv_df.to_csv(csv_file, index=False)
    parquet_file = csv_file.replace('.csv', '.parquet')
    csv_df.to_parquet(parquet_file, index=False)

    print(f"Updated DataFrame saved as {csv_file} and {parquet_file}")

    return csv_df

def create_adjective_stimulus_files(csv_file):
    # Define the directory where the copied files will be stored
    target_dir = 'adjectives/stimulus_files'
    os.makedirs(target_dir, exist_ok=True)

    # Define the path for the target file
    target_file_path = os.path.join(target_dir, os.path.basename(csv_file))

    # Copy the file
    shutil.copyfile(csv_file, target_file_path)
    print(f"Copied {csv_file} to {target_file_path}")

In [97]:
def run(procedures, nouns_file, adjective_gender_association_method, top_n_adjectives, load_method, models, plot_type, use_groupby, semantic_differential_vectors, remove_adjectives_with_markers, unallowed_words, allowed_words, font_size):
    nouns_df = load_dataframe(nouns_file)
    min_length = top_n_adjectives
    if adjective_gender_association_method == 'cosine_similarity':
        for gender in columns.keys(): columns[gender] = f'{gender}_similarity'
    elif adjective_gender_association_method == 'semantic_differential':
        for gender in columns.keys(): columns[gender] = f'{gender}_score'
    if procedures['load_models']:
        # Load models for each language
        models = {lang: load_model(lang, load_method) for lang in languages}

    # Extract and save nouns and adjectives from Wiktionary if required
    if procedures['crawl_wiktionary_nouns'] or procedures['crawl_wiktionary_adjectives']:
        for lang_code, lang_data in languages.items():
            if procedures['crawl_wiktionary_nouns']:
                # Nouns extraction and saving
                nouns_url = f'https://en.wiktionary.org/wiki/Category:{lang_data["full_name"]}_nouns'
                nouns = extract_nouns(lang_code, nouns_url)
                save_nouns_to_parquet(nouns, lang_code, f'nouns/{lang_code}_nouns.parquet')

            if procedures['crawl_wiktionary_adjectives']:
                # Adjectives extraction and saving
                adjectives_url = f'https://en.wiktionary.org/wiki/Category:{lang_data["full_name"]}_adjectives'
                adjectives = extract_adjectives(lang_code, adjectives_url)
                save_adjectives_to_parquet(adjectives, lang_code, f'adjectives/{lang_code}_adjectives.parquet')

    if procedures['calculate_adjective_similarities']:
        # Populate adjective list with gender similarity data
        for lang_code in languages.keys():
            print(f"Performing gender similarity calculations for {languages[lang_code]['full_name']}...")
            calculate_adjective_similarities(lang_code)
            print(f"Calculations completed for {languages[lang_code]['full_name']}.")

    if procedures['select_top_adjectives']:
        # Select the top n most masculine or feminine adjectives
        for lang_code in languages.keys():
            masculine, feminine = select_top_words(lang_code, num_rows=top_n_adjectives, method=adjective_gender_association_method, semantic_differential_vectors=semantic_differential_vectors)
            if lang_code == 'es':
                masculine, feminine = duplicate_spanish_adjectives(masculine, 'masculine'), duplicate_spanish_adjectives(feminine, 'feminine')
            print(f"Selected top adjectives for {languages[lang_code]['full_name']}: Masculine: {len(masculine)}, Feminine: {len(feminine)}")
    
    if procedures['remove_adjective_duplicates']:
        for lang_code in languages.keys():
            remove_adjective_duplicates(lang_code, columns)
        
    # Turn Parquet files into csv files for manual inspection or readability during communication
    if procedures['adjective_definition_review']:
        for lang_code in languages.keys():
            for gender in targets:
                create_adjective_review_csv(f'adjectives/{lang_code}_{gender}_adjectives.parquet')

    if procedures['remove_unwanted_adjectives']:
        for lang_code in ['en', 'es', 'de']:
            for gender in ['masculine', 'feminine']:
                csv_file = f'adjectives/{lang_code}_{gender}_adjectives.csv'
                parquet_file = f'adjectives/{lang_code}_{gender}_adjectives.parquet'
                min_length = find_minimum_length(languages, targets, unallowed_words)
                remove_unwanted_adjectives(csv_file, allowed_words, unallowed_words, remove_adjectives_with_markers, min_length, gender)

    if procedures['create_stimulus_files']:
        min_length = find_minimum_length(languages, targets, unallowed_words)

        for lang_code in languages:
            for gender in targets:
                csv_file = f'adjectives/{lang_code}_{gender}_adjectives.csv'
                create_adjective_stimulus_files(csv_file)

            # Iterate through each language
    for lang_code, lang_data in languages.items():
        print(f"Processing language: {lang_data['full_name']}")
        model = models[lang_code]

        # Conduct Control Tests
        if procedures['conduct_control_tests']:
            print(f"Conducting control tests for {lang_data['full_name']}")
            control_data_dir = f'control_data/{lang_code}'
            os.makedirs(control_data_dir, exist_ok=True)

            control_data = create_control_test_dataframe(lang_code, nouns_df, model)
            control_data.to_parquet(f'{control_data_dir}/{lang_code}_control_data.parquet')

            for ref_group in ['nouns', 'adjectives']:
                for target_group in ['genders', 'determiners']:
                    plot_title = f'{lang_data["full_name"]}: {ref_group.capitalize()} - {target_group.capitalize()}'
                    plot_filename = f'plots/{lang_code}/{lang_code}_{ref_group}-{target_group}_{plot_type}.png'
                    filtered_data = control_data[(control_data['REFERENCE GROUP'] == ref_group) & (control_data['TARGET GROUP'] == target_group)]
                    plot_and_save(filtered_data, plot_title, plot_filename, plot_type, ref_group, font_size)  # Include ref_group
                    report_filename = f'reports/{lang_code}_control_report.txt'
                    write_statistics_to_report(filtered_data, plot_title, report_filename)
            
            # Call the plotting function for each adjective association
            for adj_association in targets:
                control_data = create_control_test_dataframe(lang_code, nouns_df, model)
                control_data.to_parquet(f'{control_data_dir}/{lang_code}_control_data.parquet')

                # Call the plotting function
                plot_adjective_targets_nouns(lang_code, control_data, nouns_df, model)
            
        # Conduct Experimental Tests
        if procedures['conduct_experimental_tests']:
            
            print(f"Conducting experimental tests for {lang_data['full_name']}")
            test_data_dir = f'test_data/{lang_code}'
            os.makedirs(test_data_dir, exist_ok=True)

            experimental_data = create_experimental_test_dataframe(lang_code, nouns_df, model, use_groupby)
            experimental_data.to_parquet(f'{test_data_dir}/{lang_code}_test_data.parquet')

            plot_title = f'{lang_data["full_name"]}: Nouns - Adjectives'
            plot_filename = f'plots/{lang_code}/{lang_code}_nouns-adjectives_{plot_type}.png'
            plot_and_save(experimental_data, plot_title, plot_filename, plot_type, 'experimental', font_size)  # Added 'experimental' as the ref_group for experimental tests
            report_filename = f'reports/{lang_code}_experimental_report.txt'
            write_statistics_to_report(experimental_data, plot_title, report_filename)

    print("Finished processing, plotting, and generating reports for tests.")

In [110]:
# sns.set_theme()

run(
    procedures={
        'load_models':False,
        
        'crawl_wiktionary_nouns':False,
        'crawl_wiktionary_adjectives':False,

        'calculate_adjective_similarities':False,

        'select_top_adjectives':True,

        'remove_adjective_duplicates':True,
        'adjective_definition_review':False,
        'remove_unwanted_adjectives':True,

        'create_stimulus_files':True,

        'conduct_control_tests':True,
        'conduct_experimental_tests':True,
    },
    models=models,
    load_method='normal',
    # normal, facebook
    nouns_file='nouns.csv',
    top_n_adjectives=100,
    adjective_gender_association_method='cosine_similarity',
    remove_adjectives_with_markers = ["dated", "archaic", "dialectal", "rare", "ordinal number", "obsolete", "offensive"],
    # semantic differential
    # to add: projection, simple selection (cossim(reference, target)), exclusive selection (cossim(reference, (target1 - target2)))
    plot_type='box',
    # box, strip
    # to add: violin, swarm
    # to fix: boxplots respond to order, hue_order parameters w/ boxprops error; for some plots are of irregular order
    use_groupby=True,
    # for experimental tests, group adjectives for a given noun into one average cosine similarity, so instead of n(nouns)*n(adjectives) data points, you only have n(nouns) datapoints. best for strip plots to see individual points.
    semantic_differential_vectors='gender1-gender2',
    # gender1-gender2, gender-person
    unallowed_words=['lesb', 'debonair', 'vestal', 'sunamita', 'negrid', 'Brummagem', 'follable', 'untervögelt', 'schasaugert', 'Emeser', 'fünfhundertste', 'Poppersch', 'Schlänger', 'Römer', 'Latina', 'titless', 'pussy', 'foine', 'mosuo', 'fáustico', 'indio', 'rixig', 'hiborio', 'abgeschmack', 'kaki', 'klaviform', 'TK', 'antimalthusianisch', 'Danubian', 'eblaitisch', 'elfminütig', 'Fregesch', 'jakobinisch', 'Malthusianisch', 'meißenisch', 'neunminütig', 'rahn', 'vierzigminütig', 'zwölfminütig', 'Afro-Latina', 'Dianic', 'Filipina', 'lady-like', 'MAAB', 'menstruate', 'obstetrical', 'Quebecoise', 'Rubenesque', 'woman-centric', 'vinny', 'twinky', 'Welshy', 'turrible', 'mick', 'fooking', 'particuler', 'legendry', 'awsome', 'roy', 'neo-Hegelian', 'phun', 'niiice', 'Democritean', 'Hegelian', 'Rothbardian', 'gent', 'afrodescendiente', 'axumita', 'curvi', 'delhita', 'feminazi', 'madrense', 'mizrají', 'oseta', 'postparto', 'sefaradita', 'sefardí', 'sefardita', 'transgenerista', 'fuckin', 'hanbalitisch', 'antimalthusianisch', 'Malthusianisch', 'antimalthusianisch', 'malthusisch','gustiös', 'hanbalitisch','handgehoben','scheiß', 'sturm', 'terrisch','Madonna-like','smoove', 'tuff','hench','insano', 'mofo', 'cutty', 'piff', 'jake', 'propa', 'mank','LGBT','papaya', 'child-bearing', 'plus-sized', 'post-partum', 'vulval', 'ben', 'unpossible', 'antifeminist', 'LGTB', 'LGTBI', 'babylonisch', 'erzgebirgisch', 'hinreissend', 'niedersorbisch', 'Sanct', 'sasanidisch', 'saudisch', 'altniederländisch', 'bohrsch', 'britannisch', 'danubisch', 'dreiundvierzigminütig', 'drittelzahlig', 'etatmässig', 'fünfundzwanzigminütig', 'fünfunddreißigminütig', 'fünfminütig', 'Hitlersch', 'koblenzisch', 'Luthersch', 'sechzigminütig', 'südatlantisch', 'Cesarean', 'prochoice', 'almight', 'cock-sure', 'cooool', 'nooby', 'peart', 'phantastic', 'Smithian', 'barakaldarra', 'bartorosellista', 'cefeida', 'chilota', 'dailamita', 'estambulita', 'kábila', 'mazahua', 'ondarrutarra', 'ranjana', 'helle', 'zirkummediterran', 'südatlantisch', 'preggers', 'vajazzled', 'shite', 'steezy', 'tinhorn', 'widdly', 'afrotropical', 'apollardado', 'mijita', 'ladilla', 'gray-haired', 'heavy-set', 'middleaged', 'Jew,' 'moustached', 'African-American', 'childbearing', 'Filipina', 'Madonna-like', 'newly-wed', 'Shunamite', 'Syrophoenician', 'teen-age', 'teen-aged', 'transgendered', 'grown-ass', 'mustached', 'Caucasian', 'biracial', 'mixed-race', 'thirties', 'forties', 'clean-shaved', 'moustachioed', 'dark-skinned', 'teenaged', 'mustachioed', 'ape', 'Afroestadounidense', 'Birracial', 'Indígena', 'sexi', 'Extraconyugal', 'Israelita', 'untrew', 'cristiano', 'jóven', 'afroestadounidense', 'birracial', 'Emesener', 'baktrisch', 'israelita', '♥-lich', 'vierzigmonatig', 'währschaft', 'wolgadeutsch', 'amisch', 'dreiundfünfzigjährig', 'kraftwerkisch', 'malisch', 'Palmyrer', 'Portaner' ,'achtundvierzigmonatig', 'padre', 'hypoäolisch', 'schwatt', 'sechsunddreißigmonatig', 'israelita', 'dreißigmonatig', 'einunddreißigeckig', 'schwul', 'mannmännlich'],
    allowed_words=['rascal', 'transexual'],
    font_size=14
)

############### TOPWORDS CALLED #######################################################$$$$$$$$$$$$$$$$$$
############### COSINE SIMILARITY #######################################################$$$$$$$$$$$$$$$$$$
Added 'Alternate Form' to Spanish masculine adjectives parquet file
Added alternate form individua for adjective: individuo
Added alternate form humana for adjective: humano
Added alternate form chica for adjective: chico
Added alternate form anciana for adjective: anciano
Added alternate form sabia for adjective: sabio
Added alternate form jovencita for adjective: jovencito
Added alternate form campesina for adjective: campesino
Added alternate form adinerada for adjective: adinerado
Added alternate form fornida for adjective: fornido
Added alternate form sujeta for adjective: sujeto
Added alternate form desgraciada for adjective: desgraciado
Added alternate form vagabunda for adjective: vagabundo
Added alternate form guapa for adjective: guapo
Added alternate form cristiana

  selected_masculine = pd.concat([selected_masculine, top_masculine])
  selected_feminine = pd.concat([selected_feminine, top_feminine])
  df.at[index, 'Alternate Form'] = adjective[:-1] + 'a'
  df.at[index, 'Alternate Form'] = adjective[:-1] + 'o'
  selected_masculine = pd.concat([selected_masculine, top_masculine])
  selected_feminine = pd.concat([selected_feminine, top_feminine])


Updated DataFrame saved as adjectives/de_feminine_adjectives.csv and adjectives/de_feminine_adjectives.parquet
Copied adjectives/es_masculine_adjectives.csv to adjectives/stimulus_files/es_masculine_adjectives.csv
Copied adjectives/es_feminine_adjectives.csv to adjectives/stimulus_files/es_feminine_adjectives.csv
Copied adjectives/de_masculine_adjectives.csv to adjectives/stimulus_files/de_masculine_adjectives.csv
Copied adjectives/de_feminine_adjectives.csv to adjectives/stimulus_files/de_feminine_adjectives.csv
Processing language: Spanish
Conducting control tests for Spanish
Creating control test dataframe for language: es
Processing nouns for gender: feminine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing nouns for gender: masculine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing masculine adjectives
Calculating similarities for masculine adjectives with ge

  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Associ

Creating control test dataframe for language: es
Processing nouns for gender: feminine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing nouns for gender: masculine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing masculine adjectives
Calculating similarities for masculine adjectives with genders
Calculating similarities for masculine adjectives with determiners
Processing feminine adjectives
Calculating similarities for feminine adjectives with genders
Calculating similarities for feminine adjectives with determiners
Columns in control_data DataFrame: ['LANGUAGE', 'REFERENCE GROUP', 'REFERENCE ASSOCIATION', 'REFERENCE WORD', 'TARGET GROUP', 'TARGET WORD', 'COSINE SIMILARITY']
Control test dataframe created for language: es, Rows: 1300
Control test dataframe created for language: es, Rows: 1300


  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Associ

Conducting experimental tests for Spanish
Creating experimental test dataframe for language: es
Experimental test dataframe created for language: es, Rows: 178
Plotting: Spanish: Nouns - Adjectives
Plot saved: plots/es/es_nouns-adjectives_box.png
Writing statistics to report: reports/es_experimental_report.txt
Report written: reports/es_experimental_report.txt
Processing language: German
Conducting control tests for German
Creating control test dataframe for language: de
Processing nouns for gender: masculine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing nouns for gender: feminine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing masculine adjectives
Calculating similarities for masculine adjectives with genders
Calculating similarities for masculine adjectives with determiners
Processing feminine adjectives
Calculating similarities for feminine adjectives with 

  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Associ

Creating control test dataframe for language: de
Processing nouns for gender: masculine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing nouns for gender: feminine
Calculating similarities for nouns with genders
Calculating similarities for nouns with determiners
Processing masculine adjectives
Calculating similarities for masculine adjectives with genders
Calculating similarities for masculine adjectives with determiners
Processing feminine adjectives
Calculating similarities for feminine adjectives with genders
Calculating similarities for feminine adjectives with determiners
Columns in control_data DataFrame: ['LANGUAGE', 'REFERENCE GROUP', 'REFERENCE ASSOCIATION', 'REFERENCE WORD', 'TARGET GROUP', 'TARGET WORD', 'COSINE SIMILARITY']
Control test dataframe created for language: de, Rows: 1145
Control test dataframe created for language: de, Rows: 1145


  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Association', palette=['blue', 'orange'])
  sns.scatterplot(data=plot_data, x='Similarity to Gender Word', y='Avg Similarity to Gender Nouns', hue='Associ

Conducting experimental tests for German
Creating experimental test dataframe for language: de
Experimental test dataframe created for language: de, Rows: 178
Plotting: German: Nouns - Adjectives
Plot saved: plots/de/de_nouns-adjectives_box.png
Writing statistics to report: reports/de_experimental_report.txt
Report written: reports/de_experimental_report.txt
Finished processing, plotting, and generating reports for tests.


In [128]:
import pandas as pd

# Replace 'your_file.csv' with your actual file name
df = pd.read_csv('balanced_nouns.csv')

# Getting the number of rows
number_of_rows = len(df)
print("Number of rows in the CSV file:", number_of_rows)


Number of rows in the CSV file: 267


In [112]:
import csv
import math
import itertools
from collections import defaultdict

def cossim(vec1, vec2):
    """Return cosine similarity between vec1 and vec2"""
    dot_product = sum(a*b for a, b in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum([val**2 for val in vec1]))
    magnitude2 = math.sqrt(sum([val**2 for val in vec2]))
    return dot_product / (magnitude1 * magnitude2)

def get(model, word):
    """Return word embedding for word as in model"""
    return model.get_word_vector(word)

def top_cosine_similarity_pairs(csv_file, model):
    # Read adjectives from the "Adjective" column in the CSV
    adjectives = []
    with open(csv_file, newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            if 'Adjective' in row:
                adjectives.append(row['Adjective'])

    # Calculate cosine similarity for all unique pairs
    pairs = itertools.combinations(adjectives, 2)
    similarity_scores = defaultdict(float)
    for word1, word2 in pairs:
        vec1 = get(model, word1)
        vec2 = get(model, word2)
        similarity = cossim(vec1, vec2)
        similarity_scores[(word1, word2)] = similarity

    # Sort pairs by similarity score
    sorted_pairs = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

    # Get the top 5%
    top_5_percent_index = int(len(sorted_pairs) * 0.001)
    top_pairs = sorted_pairs[:top_5_percent_index]

    return top_pairs

# Assuming top_cosine_similarity_pairs and other necessary functions are already defined
# and your word embedding models are loaded into 'models'

directory = "adjectives/stimulus_files"

for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        # Select the appropriate model based on the filename
        if "de" in filename:
            model = models['de']
        elif "es" in filename:
            model = models['es']
        elif "en" in filename:
            model = models['en']
        else:
            print(f"Language model for {filename} not found.")
            continue

        file_path = os.path.join(directory, filename)
        top_pairs = top_cosine_similarity_pairs(file_path, model)

        # Print the top 5% pairs in a legible format
        print('#################################')
        print(f"Top pairs for {filename}:")
        print('#################################')
        for (word1, word2), similarity in top_pairs:
            print(f"{word1}, {word2} - Cosine Similarity: {similarity:.4f}")


#################################
Top pairs for de_feminine_adjectives.csv:
#################################
transsexuell, intersexuell - Cosine Similarity: 0.7170
krebskrank, sterbenskrank - Cosine Similarity: 0.6682
#################################
Top pairs for es_masculine_adjectives.csv:
#################################
fornido, corpulento - Cosine Similarity: 0.8345
adinerado, acaudalado - Cosine Similarity: 0.8102
#################################
Top pairs for es_feminine_adjectives.csv:
#################################
casado, divorciado - Cosine Similarity: 0.8197
inmigrante, emigrante - Cosine Similarity: 0.8195
#################################
Top pairs for de_masculine_adjectives.csv:
#################################
betrunken, besoffen - Cosine Similarity: 0.8067
alt, jung - Cosine Similarity: 0.7871
#################################
Top pairs for en_feminine_adjectives.csv:
#################################
marriageable, unmarriageable - Cosine Similarity: 0.8516
w

In [119]:
import pandas as pd
import os

directory = "adjectives/stimulus_files"


# Function to find the minimum row count
def find_min_row_count(files):
    min_count = float('inf')
    for file in files:
        df = pd.read_csv(os.path.join(directory, file))
        min_count = min(min_count, len(df))
    return min_count

# List CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# Find the minimum row count
min_row_count = find_min_row_count(csv_files)

# Truncate files based on criteria
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file))
    print(f"Processing file: {file}")
    print("Columns in DataFrame:", df.columns.tolist())

    if 'masculine' in file.lower() and 'masculine_score' in df.columns:
        df = df.nsmallest(min_row_count, 'masculine_score')
    elif 'feminine' in file.lower() and 'feminine_Score' in df.columns:
        df = df.nsmallest(min_row_count, 'feminine_Score')
    
    # Save the truncated file
    df.to_csv(os.path.join(directory, file), index=False)

# Print the final row count
print(f"Final row count for all files: {min_row_count}")


Processing file: de_feminine_adjectives.csv
Columns in DataFrame: ['Adjective', 'Language', 'masculine_similarity', 'feminine_similarity', 'exclusive_masculine_similarity', 'exclusive_feminine_similarity', 'neuter_similarity', 'depersonalized_masculine_similarity', 'depersonalized_feminine_similarity', 'masculine_score', 'feminine_score', 'Definition']
Processing file: es_masculine_adjectives.csv
Columns in DataFrame: ['Adjective', 'Language', 'masculine_similarity', 'feminine_similarity', 'exclusive_masculine_similarity', 'exclusive_feminine_similarity', 'neuter_similarity', 'depersonalized_masculine_similarity', 'depersonalized_feminine_similarity', 'masculine_score', 'feminine_score', 'Alternate Form', 'Definition']
Processing file: es_feminine_adjectives.csv
Columns in DataFrame: ['Adjective', 'Language', 'masculine_similarity', 'feminine_similarity', 'exclusive_masculine_similarity', 'exclusive_feminine_similarity', 'neuter_similarity', 'depersonalized_masculine_similarity', 'depe

In [123]:
import pandas as pd

def convert_to_feminine(adj):
    if adj.endswith('o'):
        return adj[:-1] + 'a'
    elif adj.endswith('or'):
        return adj + 'a'
    elif adj.endswith('án') or adj.endswith('ón') or adj.endswith('ín'):
        return adj[:-2] + 'ana'
    # Add more rules as necessary
    return adj  # Return as-is for adjectives that don't change

def convert_to_masculine(adj):
    if adj.endswith('a') and not adj.endswith('ista'):
        return adj[:-1] + 'o'
    # Add more rules as necessary
    return adj  # Return as-is for adjectives that don't change

def process_adjectives(file_path, gender):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Check if 'Adjective' column exists
    if 'Adjective' not in df.columns:
        raise ValueError("CSV file does not have an 'Adjective' column.")

    # Apply the conversion based on gender
    if gender == 'masculine':
        df['Masculine'] = df['Adjective']
        df['Feminine'] = df['Adjective'].apply(convert_to_feminine)
    elif gender == 'feminine':
        df['Masculine'] = df['Adjective'].apply(convert_to_masculine)
        df['Feminine'] = df['Adjective']

    # Determine output file name
    output_file = file_path.replace('stimulus_files', 'random_order').replace('.csv', '_random_order.csv')

    # Write to a new CSV file
    df[['Masculine', 'Feminine']].to_csv(output_file, index=False)
    print(f"Output saved to {output_file}")

# Example usage
process_adjectives("adjectives/stimulus_files/es_masculine_adjectives.csv", "masculine")
process_adjectives("adjectives/stimulus_files/es_feminine_adjectives.csv", "feminine")


Output saved to adjectives/random_order/es_masculine_adjectives_random_order.csv
Output saved to adjectives/random_order/es_feminine_adjectives_random_order.csv


In [131]:
import pandas as pd

def count_adjectives_by_group(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Group by 'ASSOCIATION/GRAMMATICAL GENDER' and count
    group_counts = df.groupby('ASSOCIATION/GRAMMATICAL GENDER').size()

    # Print the count for each group
    print("Counts for each 'ASSOCIATION/GRAMMATICAL GENDER' group:")
    print(group_counts)

# Example usage
count_adjectives_by_group('nouns.csv')


Counts for each 'ASSOCIATION/GRAMMATICAL GENDER' group:
ASSOCIATION/GRAMMATICAL GENDER
feminine     89
masculine    89
neuter       89
dtype: int64


In [140]:
import pandas as pd

def count_adjectives_by_language_and_gender(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Group by 'LANGUAGE' and 'ASSOCIATION/GRAMMATICAL GENDER' and count
    group_counts = df.groupby(['LANGUAGE', 'ASSOCIATION/GRAMMATICAL GENDER']).size()

    # Print the count for each group within each language
    print("Counts for each 'ASSOCIATION/GRAMMATICAL GENDER' group within each language:")
    print(group_counts)

# Example usage
count_adjectives_by_language_and_gender('nouns.csv')


Counts for each 'ASSOCIATION/GRAMMATICAL GENDER' group within each language:
LANGUAGE  ASSOCIATION/GRAMMATICAL GENDER
de        feminine                          40
          masculine                         40
en        neuter                            80
es        feminine                          40
          masculine                         40
dtype: int64


In [133]:
import pandas as pd

def equalize_adjectives(file_path, protected_translations):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Group by 'LANGUAGE' and 'ASSOCIATION/GRAMMATICAL GENDER' and count
    group_counts = df.groupby(['LANGUAGE', 'ASSOCIATION/GRAMMATICAL GENDER']).size()

    # Exclude English and find the minimum count
    target_count = group_counts.drop('en').min()

    # Filter and equalize each group
    result_df = pd.DataFrame()
    for (language, gender), count in group_counts.items():
        if language == 'en':  # Skip English
            result_df = result_df._append(df[(df['LANGUAGE'] == language) & 
                                            (df['ASSOCIATION/GRAMMATICAL GENDER'] == gender)])
            continue

        # Filter the group
        group_df = df[(df['LANGUAGE'] == language) & 
                      (df['ASSOCIATION/GRAMMATICAL GENDER'] == gender)]

        # Separate protected and non-protected items
        protected_df = group_df[group_df['TRANSLATION'].isin(protected_translations)]
        non_protected_df = group_df[~group_df['TRANSLATION'].isin(protected_translations)]

        # Reduce the non-protected items if necessary
        if count > target_count:
            num_to_keep = target_count - len(protected_df)
            non_protected_df = non_protected_df.sample(n=num_to_keep)

        # Append to the result DataFrame
        result_df = result_df._append(protected_df, ignore_index=True)
        result_df = result_df._append(non_protected_df, ignore_index=True)

    # Write to a new CSV file
    output_file = 'equalized_adjectives.csv'
    result_df.to_csv(output_file, index=False)
    print(f"Output saved to {output_file}")

# Example usage
protected_translations = [
    'toaster', 'moon', 'spoon', 'broom', 'whale', 'frog', 
    'clock', 'sun', 'fork', 'toothbrush', 'mouse', 'snail', 'cat'
]
equalize_adjectives('nouns.csv', protected_translations)


Output saved to equalized_adjectives.csv
