# Data Preprocessing and Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns

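### Data preprocessing
`convert_female_to_male(occupation)`: if an occupation is in the female form, it returns the male form. A few gendered pairs (e.g. `zakenvrouw`/`zakenman`) are mapped to a shared neutral form instead.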


In [2]:
class Preprocess:
    @staticmethod
    def convert_female_to_male(occupation):
        if occupation == 'directrice':
            return 'directeur'
        if occupation == 'bibliothecaresse':
            return 'bibliothecaris'
        if occupation == 'boerin':
            return 'boer'
        if occupation == 'advocate':
            return 'advocaat'
        if occupation == 'kunstenares':
            return 'kunstenaar'
        if occupation == 'detectiv':
            return 'detective'
        if occupation == 'fotografe':
            return 'fotograaf'
        if occupation == 'journaliste':
            return 'journalist'        
        if occupation == 'architecte':
            return occupation[:-1]
        if occupation == 'archeologe':
            return 'archeoloog'
        if occupation == 'zakenvrouw':
            return 'zakenpersoon'
        if occupation == 'zakenman':
            return 'zakenpersoon'
        if occupation == 'politieman':
            return 'politieagent'
        if occupation == 'politieagente':
            return 'politieagent'
        if occupation == 'restauranthoudser':
            return 'restauranthouder'
        if occupation == 'verpleegser':
            return 'verpleegkundiger'
        if occupation == 'bloemiste':
            return 'bloemist'
        if occupation.endswith('ess'):
            return occupation[:-3]
        elif occupation.endswith('es'):
            return occupation[:-2]
        if occupation.endswith('pser'):
            return occupation[:-4] + 'per'
        elif occupation.endswith('a'):
            return occupation[:-1] + 'us'
        elif occupation.endswith('in'):
            return occupation[:-2] + 'er'
        elif occupation.endswith('ster'):
            return occupation[:-3] + 'er'
        else:
            return occupation
    
    @staticmethod
    def preprocess(file_location):
        df = pd.read_csv(file_location, sep=';', encoding='latin-1')
        df['Story'] = df['Story'].str.lstrip()
        df['Story'] = df['Story'].str.replace('\n', '')

        df['Gender_ENG'] = 'Neutral'
        for index, row in df.iterrows():
            if row['Gender_output'] == 'man':
                df.at[index, 'Gender_ENG'] = 'Male'
            if row['Gender_output'] == 'vrouw':
                df.at[index, 'Gender_ENG'] = 'Female'

        df['Story'].str.lstrip()
        df['Occupation'] = df['Occupation'].apply(Preprocess.convert_female_to_male)

        return df

### Data analysis

In [3]:
class Analysis:
    """Counting, ranking and plotting helpers for occupation/gender data.

    All methods are static. The data frames passed in are expected to carry
    'Occupation' and 'Gender_ENG' columns (plus 'Genre' for the genre
    helpers). Plotting uses the module-level ``plt`` (matplotlib.pyplot) and
    ``sns`` (seaborn) imports.
    """

    @staticmethod
    def _save_figure(file_name):
        """Save the current figure as ``plots/<file_name>``; no-op without a name."""
        if not file_name:
            return
        import os

        file_location = f"plots/{file_name}"
        # savefig fails when the target directory is missing, so create it.
        os.makedirs(os.path.dirname(file_location), exist_ok=True)
        plt.savefig(file_location)

    @staticmethod
    def plot_top_occ_gender(df, n=5, model='GPT-3.5'):
        """Show one bar chart each for the top ``n`` female and male occupations.

        Args:
            df: Data frame with 'Gender_ENG' and 'Occupation' columns.
            n: Number of occupations per chart.
            model: Model name used in the chart title.
        """
        occupation_count = (
            df.groupby(['Gender_ENG', 'Occupation']).size().reset_index(name='Count')
        )

        def _plot_for_gender(gender, title):
            rows = occupation_count[occupation_count['Gender_ENG'] == gender]
            top_rows = rows.nlargest(n, 'Count')
            plt.figure(figsize=(10, 5))
            plt.bar(top_rows['Occupation'], top_rows['Count'])
            plt.title(f'Top {n} Occupations for {title} in {model}')
            plt.xlabel('Occupation')
            plt.ylabel('Count')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

        # Plot for Women and Men
        _plot_for_gender('Female', 'Women')
        _plot_for_gender('Male', 'Men')

    @staticmethod
    def get_top_occupations(df, n=10):
        """Return the ``n`` most frequent occupations with per-gender counts.

        Args:
            df: Data frame with 'Occupation' and 'Gender_ENG' columns.
            n: Number of occupations to keep.

        Returns:
            Data frame indexed by occupation with one column per gender value
            and a 'Total' column, sorted by 'Total' descending.
        """
        gender_counts = (
            df.groupby('Occupation')['Gender_ENG'].value_counts().unstack(fill_value=0)
        )
        gender_counts['Total'] = gender_counts.sum(axis=1)
        return gender_counts.sort_values(by='Total', ascending=False).head(n)

    @staticmethod
    def plot_top_occupations(df_top_occupations, file_name=None, title="Top", model="GPT-3.5"):
        """Plot a stacked Male/Female bar chart of the given occupations.

        Args:
            df_top_occupations: Frame as returned by ``get_top_occupations``.
            file_name: If given, the chart is also saved as ``plots/<file_name>``.
            title: Leading word of the chart title.
            model: Model name used in the chart title.
        """
        # reindex instead of [[...]] so a missing gender column is plotted as
        # zeros rather than raising KeyError.
        counts = df_top_occupations.reindex(columns=['Male', 'Female'], fill_value=0)
        counts.plot(kind='bar', stacked=True)
        plt.xlabel('Occupation')
        plt.ylabel('Number of Stories')
        plt.xticks(rotation=45, ha='right')
        plt.title(f'{title} {len(df_top_occupations)} occupations {model}')
        plt.legend(title="Gender")
        plt.tight_layout()

        Analysis._save_figure(file_name)

        plt.show()

    @staticmethod
    def odds_ratio(df, n=10):
        """Compute a male-vs-female odds ratio for every occupation.

        For each occupation the ratio is ``(a * d) / (b * c)`` with
        a = males in the occupation, b = males in other occupations,
        c = females in the occupation, d = females in other occupations,
        each increased by 0.5 so no term is zero.

        Args:
            df: Data frame with 'Occupation' and 'Gender_ENG' columns.
            n: How many entries to return at each end (default 10).

        Returns:
            Tuple ``(top, bottom)`` of ``(occupation, ratio)`` lists: the
            ``n`` largest and the ``n`` smallest ratios. Both lists are in
            ascending ratio order.
        """
        total_gender = df['Gender_ENG'].value_counts()
        total_m = total_gender.get('Male', 0)
        total_f = total_gender.get('Female', 0)

        occupation_counts = (
            df.groupby('Occupation')['Gender_ENG']
            .value_counts()
            .unstack(fill_value=0)
            .to_dict('index')
        )

        small_const = 0.5  # Add small constant to avoid division by zero
        odds_ratios = {}
        for occupation, genders in occupation_counts.items():
            males = genders.get('Male', 0)
            females = genders.get('Female', 0)
            a = males + small_const
            b = total_m - males + small_const
            c = females + small_const
            d = total_f - females + small_const
            odds_ratios[occupation] = (a * d) / (b * c)

        sorted_odds = sorted(odds_ratios.items(), key=lambda item: item[1])
        if n <= 0:
            # A [-0:] slice would return the whole list, so handle this here.
            return [], []
        return sorted_odds[-n:], sorted_odds[:n]

    @staticmethod
    def plot_odds_ratio(list_odds, file_name=None, title="top", model="GPT-3.5"):
        """Plot odds ratios as a horizontal bar chart with value labels.

        Args:
            list_odds: Non-empty list of ``(occupation, ratio)`` pairs, e.g.
                one of the lists returned by ``odds_ratio``.
            file_name: If given, the chart is also saved as ``plots/<file_name>``.
            title: Word placed in the chart title (e.g. "top" / "bottom").
            model: Model name used in the chart title.
        """
        occupations, values = zip(*list_odds)
        plt.figure(figsize=(10, 8))
        plt.barh(occupations, values, color='skyblue')
        plt.xlabel('Odds ratio')
        plt.title(f'Odds ratios {title} {len(occupations)} occupations {model}')
        # First list entry ends up at the top of the chart.
        plt.gca().invert_yaxis()

        for index, value in enumerate(values):
            plt.text(value, index, f" {value:.2f}", va='center')

        plt.tight_layout()

        Analysis._save_figure(file_name)

        plt.show()

    @staticmethod
    def genre_occ(df, n=5):
        """Return the ``n`` most frequent occupations within each genre.

        Args:
            df: Data frame with 'Genre', 'Occupation' and 'Gender_ENG' columns.
            n: Number of occupations kept per genre.

        Returns:
            Data frame with 'Genre', 'Occupation', one column per gender
            value, 'Total', and a 'Genre_Occupation' label built from the
            first three letters of the genre and the occupation.
        """
        count_gender = (
            df.groupby(['Genre', 'Occupation', 'Gender_ENG']).size().unstack(fill_value=0)
        )
        count_gender['Total'] = count_gender.sum(axis=1)
        top_n_genre = (
            count_gender.groupby('Genre', group_keys=False)
            .apply(lambda x: x.nlargest(n, 'Total'))
            .reset_index()
        )
        top_n_genre['Genre_Occupation'] = (
            top_n_genre['Genre'].str[:3] + " - " + top_n_genre['Occupation']
        )
        return top_n_genre

    @staticmethod
    def plot_genre_occ(df_genre_occ, file_name=None):
        """Plot the total story count of each genre's top occupations.

        Args:
            df_genre_occ: Frame as returned by ``genre_occ``.
            file_name: If given, the chart is also saved as ``plots/<file_name>``.
        """
        # Use the largest per-genre row count as the "Top n" in the title.
        # This avoids assuming exactly three genres and keeps n an integer.
        per_genre = df_genre_occ['Genre'].value_counts()
        n = int(per_genre.max()) if not per_genre.empty else 0

        plt.figure(figsize=(12, 8))

        sns.barplot(data=df_genre_occ, x='Genre_Occupation', y='Total', hue='Genre', dodge=False)

        plt.xticks(rotation=45, ha="right")
        plt.title('Top {} Occupations per Genre'.format(n))
        plt.xlabel('Genre and Occupation')
        plt.ylabel('Number of Stories')

        plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')

        plt.tight_layout()

        Analysis._save_figure(file_name)

        plt.show()

    @staticmethod
    def plot_dist_genre_occ(df_genre_occ):
        """Show one stacked Male/Female bar chart per genre.

        Args:
            df_genre_occ: Frame as returned by ``genre_occ``.
        """
        for genre in df_genre_occ['Genre'].unique():
            genre_data = df_genre_occ[df_genre_occ['Genre'] == genre]
            counts = genre_data.set_index('Occupation').reindex(
                columns=['Male', 'Female'], fill_value=0
            )
            # DataFrame.plot opens its own figure when no ax is passed, so the
            # size is given here instead of via a separate plt.figure() call
            # (which would only leave an empty extra figure behind).
            counts.plot(kind='bar', stacked=True, color=['orange', 'green'], figsize=(10, 6))
            plt.title(f'Top {len(genre_data)} Occupation in {genre}')
            plt.xlabel('Occupation')
            plt.ylabel('Number')
            plt.legend(title='Gender')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.show()