In [1]:
import pandas as pd
import numpy as np

In [2]:
actions = pd.read_parquet('../data/gr-work-actions.parquet')

actions['timestamp'] = pd.to_datetime(actions['first_time'], unit='s')

del actions['first_time']
del actions['last_time']

actions.sort_values('timestamp', inplace=True)
actions.set_index('timestamp', inplace=True)

In [3]:
gender = pd.read_parquet('../data/gr-work-gender.parquet')

gender.drop_duplicates(subset=['gr_item'], inplace=True)

gender = gender.set_index('gr_item')['gender']

gender[gender.str.startswith('no-')] = 'unlinked'

gender = gender.astype('category')
gender.index.name = 'item_id'

In [4]:
merged_df = actions.join(gender, on='item_id', how='left')

merged_df['gender'].fillna('unlinked', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['gender'].fillna('unlinked', inplace=True)


In [5]:
authors = pd.read_parquet('../data/gr-work-item-authors.parquet')
authors = authors.groupby('item_id', as_index=True)['author_id'].first().to_frame()

In [6]:
authors.head()

Unnamed: 0_level_0,author_id
item_id,Unnamed: 1_level_1
400000040,2553
400000062,7555163
400000081,5315
400000084,5325
400000087,5353


In [7]:
merged_df = merged_df.join(authors, on='item_id', how='left')

In [8]:
df = merged_df.copy()

In [9]:
def compute_summary(df_subset):
    return {
        'Unique Users': df_subset['user_id'].nunique(),
        'Unique Books': df_subset['item_id'].nunique(),
        'Unique Authors': df_subset['author_id'].nunique(),
        'Average Rating': df_subset.groupby('item_id')['last_rating'].mean().mean(),
    }

In [10]:
summary = {
    'Overall': compute_summary(df),
    'Male': compute_summary(df[df['gender'] == 'male']),
    'Female': compute_summary(df[df['gender'] == 'female']),
}

In [11]:
summary_df = pd.DataFrame(summary)

summary_df.loc[['Unique Users', 'Unique Books', 'Unique Authors']] = summary_df.loc[
    ['Unique Users', 'Unique Books', 'Unique Authors']
].astype(int)

summary_df.loc[['Average Rating']] = summary_df.loc[
    ['Average Rating']
].round(2)

pd.set_option('display.float_format', '{:,.2f}'.format)

print(summary_df)

                    Overall       Male     Female
Unique Users     876,145.00 824,001.00 805,003.00
Unique Books   1,522,486.00 348,302.00 231,345.00
Unique Authors   612,241.00 155,888.00  87,904.00
Average Rating         3.85       3.80       3.79


In [12]:
latex_table = summary_df.to_latex(
    index=True,
    caption="Summary Statistics by Author Gender",
    label="tab:gender_summary",
    float_format="%.2f"
)

print(latex_table)

\begin{table}
\caption{Summary Statistics by Author Gender}
\label{tab:gender_summary}
\begin{tabular}{lrrr}
\toprule
 & Overall & Male & Female \\
\midrule
Unique Users & 876145.00 & 824001.00 & 805003.00 \\
Unique Books & 1522486.00 & 348302.00 & 231345.00 \\
Unique Authors & 612241.00 & 155888.00 & 87904.00 \\
Average Rating & 3.85 & 3.80 & 3.79 \\
\bottomrule
\end{tabular}
\end{table}

