In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
from tabulate import tabulate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import statsmodels.api as sm

# Cap DataFrame/Series rendering at 10 rows so cell outputs stay short
pd.set_option("display.max_rows", 10)

## Data Import & Parsing
Import the LIWC-processed data files for each year range and concatenate them into a single DataFrame.

In [1]:
# Forward slashes replace the original backslash separators: "\R" is not a
# valid Python escape sequence, and forward slashes resolve on Windows as
# well as on POSIX systems.
df1 = pd.read_csv('liwc_Data/RC_2014_18_counts_liwc.csv')
df2 = pd.read_csv('liwc_Data/RC_2019_counts_liwc.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat([df1, df2], ignore_index=True)

NameError: name 'pd' is not defined

## Data cleaning
Map the original column headings to new, descriptive column headings using a dictionary.

In [None]:
# Original single-letter headings paired, in order, with their descriptive names
original_headings = 'ABCDEFGHIJ'
descriptive_headings = [
    'parent_id',
    'author',
    'subreddit',
    'created_utc',
    'controversiality',
    'subreddit_id',
    'distinguished',
    'id',
    'removal',
    'utterance',
]
column_mapping = dict(zip(original_headings, descriptive_headings))

# Apply the mapping to the DataFrame's columns (modifies df in place)
df.rename(columns=column_mapping, inplace=True)

In [None]:
# Group the DataFrame by subreddit
grouped = df.groupby('subreddit')

# Map each subreddit name to its own DataFrame of rows.
# Each group is stored as an independent copy because later cells add and
# overwrite columns on these frames; writing to a slice of `df` can raise
# pandas' chained-assignment warning.
subreddit_dfs = {
    subreddit: group_df.copy()
    for subreddit, group_df in grouped
}


In [None]:
# LIWC columns summarised in this table; their order fixes the column order below
summary_columns = ['i', 'ppron', 'we', 'Tone', 'posemo', 'negemo']

# One row per subreddit: the subreddit name followed by the mean of each column
table_data = [
    [subreddit] + [np.mean(group_df[column]) for column in summary_columns]
    for subreddit, group_df in subreddit_dfs.items()
]

# Headers name the statistic that is actually computed here (means).
# The previous headers read "SD of ..." and mislabelled this table.
headers = ['Subreddit', 'Mean of i', 'Mean of ppron', 'Mean of we', 'Mean of tone', 'Mean of posemo', 'Mean of negemo']

# Print the table
print(tabulate(table_data, headers=headers, floatfmt=".4f"))

In [None]:
# LIWC columns whose spread is reported, in table-column order
sd_columns = ('i', 'ppron', 'we', 'Tone', 'posemo', 'negemo')

# Build one row per subreddit: name first, then the standard deviation of each column
table_data = []
for subreddit, group_df in subreddit_dfs.items():
    table_data.append([subreddit, *(np.std(group_df[column]) for column in sd_columns)])

# Column titles for the printed table
headers = ['Subreddit', 'SD of i', 'SD of ppron', 'SD of we', 'SD of tone', 'SD of posemo', 'SD of negemo']

# Render the table with four decimal places
print(tabulate(table_data, headers=headers, floatfmt=".4f"))

In [None]:
# LIWC columns whose minimum is reported, in table-column order
min_columns = ('i', 'ppron', 'we', 'Tone', 'posemo', 'negemo')

# Build one row per subreddit: name first, then the minimum of each column
table_data = []
for subreddit, group_df in subreddit_dfs.items():
    row = [subreddit]
    row.extend(np.min(group_df[column]) for column in min_columns)
    table_data.append(row)

# Column titles for the printed table
headers = ['Subreddit', 'Min of i', 'Min of ppron', 'Min of we', 'Min of tone', 'Min of posemo', 'Min of negemo']

# Render the table with four decimal places
print(tabulate(table_data, headers=headers, floatfmt=".4f"))

In [None]:
# LIWC columns whose maximum is reported, in table-column order
max_columns = ('i', 'ppron', 'we', 'Tone', 'posemo', 'negemo')


def _max_row(name, frame):
    """Return [name, max of each reported column] for one subreddit frame."""
    return [name] + [np.max(frame[column]) for column in max_columns]


# One row per subreddit
table_data = [_max_row(subreddit, group_df) for subreddit, group_df in subreddit_dfs.items()]

# Column titles for the printed table
headers = ['Subreddit', 'Max of i', 'Max of ppron', 'Max of we', 'Max of tone', 'Max of posemo', 'Max of negemo']

# Render the table with four decimal places
print(tabulate(table_data, headers=headers, floatfmt=".4f"))

## Chi2 Tests: Depression vs Yoga
These tests are mainly used to check whether our findings replicate those of related work and converge on the same markers (positive emotion words, negative emotion words,
and pronoun usage).


In [None]:
# Chi-square test for significance between r/yoga and r/depression across
# all LIWC category columns (every column of `df` from position 10 onward).
categories = df.columns[10:]
subreddits = ["depression", "yoga"]

# Column total of each category within each subreddit:
# {subreddit: {category: total}}
data = {
    subreddit: {
        category: subreddit_dfs[subreddit][category].sum()
        for category in categories
    }
    for subreddit in subreddits
}

# Transpose so rows are subreddits and columns are categories
contingency_table = pd.DataFrame(data).T

# Only the p-value is reported; the other return values are discarded
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"All categories, Chi-square p-value: {p}")
contingency_table

In [None]:
# Chi-square test for significance between r/yoga and r/depression for the
# emotion-word categories we are interested in.
categories = ["Tone", "posemo", "negemo"]
subreddits = ["depression", "yoga"]

# Column total of each category within each subreddit:
# {subreddit: {category: total}}
data = {
    subreddit: {
        category: subreddit_dfs[subreddit][category].sum()
        for category in categories
    }
    for subreddit in subreddits
}

# Transpose so rows are subreddits and columns are categories
contingency_table = pd.DataFrame(data).T

# Only the p-value is reported; the other return values are discarded
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"Category: Tone, posemo, negemo | Chi-square p-value: {p}")
contingency_table

In [None]:
# Chi-square test for significance between r/yoga and r/depression for the
# personal-pronoun categories we are interested in.
categories = ["ppron", "i", "we"]
subreddits = ["depression", "yoga"]

# Column total of each category within each subreddit:
# {subreddit: {category: total}}
data = {
    subreddit: {
        category: subreddit_dfs[subreddit][category].sum()
        for category in categories
    }
    for subreddit in subreddits
}

# Transpose so rows are subreddits and columns are categories
contingency_table = pd.DataFrame(data).T

# Only the p-value is reported; the other return values are discarded
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"Category: ppron, i, we | Chi-square p-value: {p}")
contingency_table

In [None]:
# Use a dedicated name for the subreddit frame. Rebinding `df` here would
# replace the full dataset that other cells read through `df`.
depression_df = subreddit_dfs['depression']

# Convert UTC timestamp (seconds) to pandas datetime objects
depression_df['created_utc'] = pd.to_datetime(depression_df['created_utc'], unit='s')

# Extract the year from the timestamp
depression_df['year'] = depression_df['created_utc'].dt.year

# Group by year and calculate the mean 'i' score
mean_scores_by_year = depression_df.groupby('year')['i'].mean()

# Line plot of the yearly means on an explicit figure/axes pair
fig, ax = plt.subplots()
mean_scores_by_year.plot(kind='line', marker='o', linestyle='-', ax=ax)
ax.set_title('Mean Depression SubReddit 1st Person Singular Pronoun Frequencies Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Mean of 1st Person Singular Pronoun Frequencies')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Use a dedicated name for the subreddit frame. Rebinding `df` here would
# replace the full dataset that other cells read through `df`.
yoga_df = subreddit_dfs['yoga']

# Convert UTC timestamp (seconds) to pandas datetime objects
yoga_df['created_utc'] = pd.to_datetime(yoga_df['created_utc'], unit='s')

# Extract the year from the timestamp
yoga_df['year'] = yoga_df['created_utc'].dt.year

# Group by year and calculate the mean 'i' score
mean_scores_by_year = yoga_df.groupby('year')['i'].mean()

# Line plot of the yearly means on an explicit figure/axes pair
fig, ax = plt.subplots()
mean_scores_by_year.plot(kind='line', marker='o', linestyle='-', ax=ax)
ax.set_title('Mean Yoga SubReddit 1st Person Singular Pronoun Frequencies Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Mean of 1st Person Singular Pronoun Frequencies')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Use a dedicated name for the subreddit frame. Rebinding `df` here would
# replace the full dataset that other cells read through `df`.
depression_df = subreddit_dfs['depression']

# Convert UTC timestamp (seconds) to pandas datetime objects
depression_df['created_utc'] = pd.to_datetime(depression_df['created_utc'], unit='s')

# Extract the year from the timestamp
depression_df['year'] = depression_df['created_utc'].dt.year

# Group by year and calculate the mean 'posemo' score
mean_scores_by_year = depression_df.groupby('year')['posemo'].mean()

# Line plot of the yearly means on an explicit figure/axes pair
fig, ax = plt.subplots()
mean_scores_by_year.plot(kind='line', marker='o', linestyle='-', ax=ax)
ax.set_title('Mean Depression SubReddit Positive Emotion Frequencies Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Mean of Positive Emotion Frequencies')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Use a dedicated name for the subreddit frame. Rebinding `df` here would
# replace the full dataset that other cells read through `df`.
yoga_df = subreddit_dfs['yoga']

# Convert UTC timestamp (seconds) to pandas datetime objects
yoga_df['created_utc'] = pd.to_datetime(yoga_df['created_utc'], unit='s')

# Extract the year from the timestamp
yoga_df['year'] = yoga_df['created_utc'].dt.year

# Group by year and calculate the mean 'posemo' score
mean_scores_by_year = yoga_df.groupby('year')['posemo'].mean()

# Line plot of the yearly means on an explicit figure/axes pair
fig, ax = plt.subplots()
mean_scores_by_year.plot(kind='line', marker='o', linestyle='-', ax=ax)
ax.set_title('Mean Yoga SubReddit Positive Emotion Frequencies Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Mean of Positive Emotion Frequencies')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Use a dedicated name for the subreddit frame. Rebinding `df` here would
# replace the full dataset that other cells read through `df`.
depression_df = subreddit_dfs['depression']

# Convert UTC timestamp (seconds) to pandas datetime objects
depression_df['created_utc'] = pd.to_datetime(depression_df['created_utc'], unit='s')

# Extract the year from the timestamp
depression_df['year'] = depression_df['created_utc'].dt.year

# Group by year and calculate the mean 'negemo' score
mean_scores_by_year = depression_df.groupby('year')['negemo'].mean()

# Line plot of the yearly means on an explicit figure/axes pair
fig, ax = plt.subplots()
mean_scores_by_year.plot(kind='line', marker='o', linestyle='-', ax=ax)
ax.set_title('Mean Depression SubReddit Negative Emotion Frequencies Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Mean of Negative Emotion Frequencies')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Use a dedicated name for the subreddit frame. Rebinding `df` here would
# replace the full dataset that other cells read through `df`.
yoga_df = subreddit_dfs['yoga']

# Convert UTC timestamp (seconds) to pandas datetime objects
yoga_df['created_utc'] = pd.to_datetime(yoga_df['created_utc'], unit='s')

# Extract the year from the timestamp
yoga_df['year'] = yoga_df['created_utc'].dt.year

# Group by year and calculate the mean 'negemo' score.
# This cell previously averaged 'posemo' while labelling the plot as negative
# emotion; the column now matches the title and axis label.
mean_scores_by_year = yoga_df.groupby('year')['negemo'].mean()

# Line plot of the yearly means on an explicit figure/axes pair
fig, ax = plt.subplots()
mean_scores_by_year.plot(kind='line', marker='o', linestyle='-', ax=ax)
ax.set_title('Mean Yoga SubReddit Negative Emotion Frequencies Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Mean of Negative Emotion Frequencies')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

# List the available subreddit keys (used to pick the province subreddits)
print(subreddit_dfs.keys())

In [None]:
# Subreddit key -> display name used in plot titles
provinces = {
    'Manitoba': 'Manitoba',
    'NovaScotia': 'Nova Scotia',
    'PEI': 'Prince Edward Island',
    'Quebec': 'Quebec',
    'alberta': 'Alberta',
    'britishcolumbia': 'British Columbia',
    'newbrunswickcanada': 'New Brunswick',
    'newfoundland': 'Newfoundland',
    'ontario': 'Ontario',
    'saskatchewan': 'Saskatchewan',
}
# LIWC column -> description used in plot titles and y-axis labels
frequencies = {
    'i': '1st Person Singular Pronoun Frequencies',
    'posemo': 'Positive Emotion Frequencies',
    'negemo': 'Negative Emotion Frequencies',
}

# Iterate over provinces and frequencies
for province_key, province_name in provinces.items():
    # Use a dedicated name for the subreddit frame. Rebinding `df` here would
    # replace the full dataset that other cells read through `df`.
    province_df = subreddit_dfs[province_key]

    # The timestamp conversion does not depend on the marker being plotted,
    # so it is done once per province rather than once per plot.
    province_df['created_utc'] = pd.to_datetime(province_df['created_utc'], unit='s')
    province_df['year'] = province_df['created_utc'].dt.year

    for freq_key, freq_name in frequencies.items():
        # Group by year and calculate the mean score for this marker
        mean_scores_by_year = province_df.groupby('year')[freq_key].mean()

        # Line plot of the yearly means on an explicit figure/axes pair
        fig, ax = plt.subplots()
        mean_scores_by_year.plot(kind='line', marker='o', linestyle='-', ax=ax)
        ax.set_title(f'Mean {province_name} SubReddit {freq_name} Over Time')
        ax.set_xlabel('Year')
        ax.set_ylabel(f'Mean of {freq_name}')
        ax.grid(True)
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()
        plt.show()
        # Release the figure; this loop creates one figure per province/marker pair
        plt.close(fig)

## Logistic Regression: Province Subreddits
Using the markers that we tested in the last section, we will train a logistic regression model on the r/depression and r/yoga subreddit data and use it to make predictions on the r/{province} subreddits.

In [None]:
# Define the markers identified from chi-square tests
markers = ["Tone", "posemo", "negemo", "ppron", "i", "we"]

# Per-subreddit totals of each marker for the regression: maps subreddit name
# to a float array holding the column sum of every marker, in `markers` order.
# The array length follows `markers` rather than a hard-coded 6.
subreddit_freq = {
    subreddit: np.array(
        [np.sum(group_df[marker]) for marker in markers],
        dtype=float,
    )
    for subreddit, group_df in subreddit_dfs.items()
}


In [None]:
def create_train_binary_prediction_target_pairs(subreddit_dfs, markers):
    """Build the training predictor/target arrays from r/depression and r/yoga.

    Each post is compared, marker by marker, against the mean of that marker
    in r/depression. A post's target is 1 when its above/below-mean pattern
    matches the expected pattern for every marker, otherwise 0.

    Args:
        subreddit_dfs: mapping of subreddit name to DataFrame; must contain
            the keys 'depression' and 'yoga', each with the marker columns.
        markers: sequence of marker column names, drawn from
            'Tone', 'posemo', 'negemo', 'ppron', 'i', 'we'.

    Returns:
        (train_predictors, train_targets): two 1-D float arrays of equal
        length, r/depression posts first and r/yoga posts second.
        Predictors are 1 for r/depression posts and 0 for r/yoga posts;
        targets are 1 where the post matches the expected marker pattern.

    Raises:
        KeyError: if `markers` contains a name with no expected direction.
    """
    # Expected direction of each marker relative to the r/depression mean:
    # more negative-emotion words and personal pronouns (above the mean),
    # fewer positive-emotion words and a lower Tone score (not above the mean).
    expected_above_mean = {
        'Tone': False,
        'posemo': False,
        'negemo': True,
        'ppron': True,
        'i': True,
        'we': True,
    }
    markers = list(markers)

    depression_df = subreddit_dfs['depression']
    yoga_df = subreddit_dfs['yoga']

    # Thresholds and expected pattern are built in the same order as `markers`
    # so they stay aligned with the matrix columns whatever order is passed in.
    marker_means = np.array([depression_df[marker].mean() for marker in markers])
    markers_present = np.array([expected_above_mean[marker] for marker in markers])

    # Per-post, per-marker flags: is the value above the r/depression mean?
    depression_mask = depression_df[markers].to_numpy() > marker_means
    yoga_mask = yoga_df[markers].to_numpy() > marker_means

    # A post is a positive target only if every marker matches the expected pattern
    depression_targets = np.all(depression_mask == markers_present, axis=1).astype(float)
    yoga_targets = np.all(yoga_mask == markers_present, axis=1).astype(float)
    train_targets = np.hstack((depression_targets, yoga_targets))

    # Predictor is the subreddit indicator: 1 for r/depression, 0 for r/yoga
    train_predictors = np.hstack((
        np.ones(depression_targets.shape),
        np.zeros(yoga_targets.shape),
    ))

    return train_predictors, train_targets


# Build the training arrays: predictor = subreddit indicator (1 for r/depression,
# 0 for r/yoga), target = whether the post matches the expected marker pattern.
train_predictors, train_targets = create_train_binary_prediction_target_pairs(subreddit_dfs, markers)

# GOOD INDICATION (author's note from an earlier run, not recomputed here):
# 125 targets are true for yoga and 18,487 are true for depression

In [None]:
def create_test_binary_prediction_target_pairs(subreddit_dfs, subreddit, markers):
    """Build the evaluation predictor/target arrays for one subreddit.

    Each post in `subreddit` is compared, marker by marker, against the mean
    of that marker in r/depression. A post's target is 1 when its
    above/below-mean pattern matches the expected pattern for every marker,
    otherwise 0.

    Args:
        subreddit_dfs: mapping of subreddit name to DataFrame; must contain
            'depression' (source of the thresholds) and `subreddit`.
        subreddit: key of the subreddit to build test pairs for.
        markers: sequence of marker column names, drawn from
            'Tone', 'posemo', 'negemo', 'ppron', 'i', 'we'.

    Returns:
        (predictors, targets): two 1-D float arrays with one entry per post.
        Predictors are all 0; targets are 1 where the post matches the
        expected marker pattern.

    Raises:
        KeyError: if `markers` contains a name with no expected direction.
    """
    # Expected direction of each marker relative to the r/depression mean:
    # more negative-emotion words and personal pronouns (above the mean),
    # fewer positive-emotion words and a lower Tone score (not above the mean).
    expected_above_mean = {
        'Tone': False,
        'posemo': False,
        'negemo': True,
        'ppron': True,
        'i': True,
        'we': True,
    }
    markers = list(markers)

    depression_df = subreddit_dfs['depression']

    # Thresholds and expected pattern are built in the same order as `markers`
    # so they stay aligned with the matrix columns whatever order is passed in.
    marker_means = np.array([depression_df[marker].mean() for marker in markers])
    markers_present = np.array([expected_above_mean[marker] for marker in markers])

    # Per-post, per-marker flags: is the value above the r/depression mean?
    subreddit_mask = subreddit_dfs[subreddit][markers].to_numpy() > marker_means

    # A post is a positive target only if every marker matches the expected pattern
    targets = np.all(subreddit_mask == markers_present, axis=1).astype(float)

    # Every post gets the same predictor value (0)
    predictors = np.zeros(targets.shape)  # TODO: should this be random

    return predictors, targets

In [None]:
# Response vector and design matrix (intercept column added to the predictors)
Y = train_targets
X = sm.add_constant(train_predictors)

# Fit the logistic regression and show the coefficient table
model = sm.Logit(Y, X)
logit_result = model.fit()
print(logit_result.summary())


In [None]:
# Evaluate the fitted model on every subreddit in subreddit_dfs
# (this includes the r/depression and r/yoga frames used for training).
# NOTE(review): the test predictors are all 0 for every post, so the model
# is given identical input for each post within a subreddit — confirm this
# is the intended evaluation design.
for subreddit in subreddit_dfs:
    test_predictors, test_targets = create_test_binary_prediction_target_pairs(subreddit_dfs, subreddit, markers)

    # has_constant='add' always prepends the intercept column, so the design
    # matrix keeps the same two columns the model was fitted with. The default
    # ('skip') leaves the input without an intercept when it is already a
    # non-zero constant column.
    test_predictors = sm.add_constant(test_predictors, has_constant='add')
    predictions = logit_result.predict(test_predictors)

    # Convert probabilities to binary predictions (0 or 1)
    binary_predictions = np.round(predictions, decimals=0)
    print(f"Results for {subreddit}:")
    print(classification_report(test_targets, binary_predictions, zero_division=0))