# Compare sentiments of women and men

## Frequencies

In [2]:
import pandas as pd

# read data
path_df = "/home/shared_folder/Twitter_samu/MA_researcher_wellbeing/data/timelines_sentiments.json"
df = pd.read_json(path_df, orient="records", lines=True, dtype={"user_id": str})


In [3]:
# frequency table for column "gender"
print(df["gender"].value_counts())

gender
female           1376037
male             1232720
unknown           584622
mostly_female     146537
mostly_male       103531
andy               44978
Name: count, dtype: int64


In [5]:
# keep only unique user_id
df_unique = df.drop_duplicates(subset="user_id")

In [6]:

# frequency table for column "gender"
print(df_unique["gender"].value_counts())


gender
female           7288
male             4254
unknown          2102
mostly_female     598
mostly_male       387
andy              247
Name: count, dtype: int64


In [7]:

# how many rows in df_unique
print(len(df_unique))


14876


In [8]:

# divide frequency table by total number of rows
print(df_unique["gender"].value_counts() / len(df_unique))


gender
female           0.489917
male             0.285964
unknown          0.141301
mostly_female    0.040199
mostly_male      0.026015
andy             0.016604
Name: count, dtype: float64


## Compare mean sentiment scores

In [11]:
# aggregate gender, pos_sent and neg_sent by user_id, keep gender
df_grouped = df.groupby("user_id").agg({
    "gender": "first",
    "pos_sent": "mean",
    "neg_sent": "mean"
})

# head
print(df_grouped.head())

                     gender  pos_sent  neg_sent
user_id                                        
1000022414401695744    male  0.458170  0.137387
100005968            female  0.915317  0.023297
1000098663547506688    male  0.320237  0.255474
1000105711089606656  female  0.919061  0.004058
1000115004740390912  female  0.865056  0.055644


In [12]:
mean_sentiments_pos = df_grouped.groupby("gender")["pos_sent"].mean()
print(mean_sentiments_pos)

gender
andy             0.592234
female           0.605245
male             0.481527
mostly_female    0.590133
mostly_male      0.522459
unknown          0.529786
Name: pos_sent, dtype: float64


In [13]:
mean_sentiments_neg = df_grouped.groupby("gender")["neg_sent"].mean()
print(mean_sentiments_neg)

gender
andy             0.108312
female           0.114349
male             0.150111
mostly_female    0.128031
mostly_male      0.140710
unknown          0.142321
Name: neg_sent, dtype: float64


In [20]:
# median pos sentiment scores
median_sentiments_pos = df_grouped.groupby("gender")["pos_sent"].median()
print(median_sentiments_pos)

gender
andy             0.611665
female           0.612919
male             0.468117
mostly_female    0.592636
mostly_male      0.513700
unknown          0.523277
Name: pos_sent, dtype: float64


In [21]:
# median neg sentiment scores
median_sentiments_neg = df_grouped.groupby("gender")["neg_sent"].median()
print(median_sentiments_neg)

gender
andy             0.088647
female           0.090698
male             0.135446
mostly_female    0.111146
mostly_male      0.127340
unknown          0.113961
Name: neg_sent, dtype: float64


In [15]:
# filter out missing values
df_filtered = df_grouped.dropna(subset=["pos_sent", "neg_sent"])

In [16]:
# check for normality
from scipy.stats import shapiro

# normality test for positive sentiment scores
stat_female_pos, p_val_female_pos = shapiro(df_filtered[df_filtered["gender"] == "female"]["pos_sent"])
print(f"Shapiro-Wilk test for normality of positive sentiment scores for females: statistic = {stat_female_pos}, p-value = {p_val_female_pos}")

Shapiro-Wilk test for normality of positive sentiment scores for females: statistic = 0.9896383881568909, p-value = 1.0853484073231942e-22




In [17]:
# normality test for negative sentiment scores
stat_female_neg, p_val_female_neg = shapiro(df_filtered[df_filtered["gender"] == "female"]["neg_sent"])
print(f"Shapiro-Wilk test for normality of negative sentiment scores for females: statistic = {stat_female_neg}, p-value = {p_val_female_neg}")

Shapiro-Wilk test for normality of negative sentiment scores for females: statistic = 0.8570480942726135, p-value = 0.0


In both tests, the p value is <.001. This indicates that the data is not normally distributed. Hence, instead of a t-test, we use Mann-Whitney U test.

In [18]:
# Mann-Whitney U test for sentiment scores
from scipy.stats import mannwhitneyu

# Mann-Whitney U test for positive sentiment scores
stat_pos, p_val_pos = mannwhitneyu(df_filtered[df_filtered["gender"] == "female"]["pos_sent"], df_filtered[df_filtered["gender"] == "male"]["pos_sent"])
print(f"Mann-Whitney U test for positive sentiment scores: statistic = {stat_pos}, p-value = {p_val_pos}")

Mann-Whitney U test for positive sentiment scores: statistic = 20996369.0, p-value = 3.592049984911082e-222


In [19]:
# Mann-Whitney U test for negative sentiment scores
stat_neg, p_val_neg = mannwhitneyu(df_filtered[df_filtered["gender"] == "female"]["neg_sent"], df_filtered[df_filtered["gender"] == "male"]["neg_sent"])
print(f"Mann-Whitney U test for negative sentiment scores: statistic = {stat_neg}, p-value = {p_val_neg}")

Mann-Whitney U test for negative sentiment scores: statistic = 12262428.0, p-value = 1.7030620756088935e-78


In [31]:
# rank-biserial correlations
from scipy.stats import rankdata

# rank-biserial correlation for positive sentiment scores
r_pos = 1 - (2 * stat_pos) / (len(df_filtered) * (len(df_filtered)))
print(f"Rank-biserial correlation for positive sentiment scores: {r_pos}")


Rank-biserial correlation for positive sentiment scores: 0.8102539867877426


In [32]:
# rank-biserial correlation for negative sentiment scores
r_neg = 1 - 2 * (stat_neg / (len(df_filtered) * (len(df_filtered) + 1)))
print(f"Rank-biserial correlation for negative sentiment scores: {r_neg}")

Rank-biserial correlation for negative sentiment scores: 0.8891833714056772


In [47]:
# Sample sizes
n_female = len(df_filtered[df_filtered["gender"] == "female"]["pos_sent"])
n_male = len(df_filtered[df_filtered["gender"] == "male"]["pos_sent"])

# Calculate Rank-Biserial Correlation
U = stat_pos  # Mann-Whitney U statistic

# Calculate r
r_pos = 1 - ((2 * U) / (n_female * n_male))

print(f"Rank-Biserial Correlation for positive sentiment scores: {r_pos}")

Rank-Biserial Correlation for positive sentiment scores: -0.3544667329308968


In [48]:
# Sample sizes
n_female = len(df_filtered[df_filtered["gender"] == "female"]["neg_sent"])
n_male = len(df_filtered[df_filtered["gender"] == "male"]["neg_sent"])

# Calculate Rank-Biserial Correlation
U = stat_neg  # Mann-Whitney U statistic

# Calculate r
r_neg = 1 - ((2 * U) / (n_female * n_male))

print(f"Rank-Biserial Correlation for negative sentiment scores: {r_neg}")

Rank-Biserial Correlation for negative sentiment scores: 0.2089560442112467
