In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn as sk
import nltk
import random
from gensim.models import Word2Vec
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Path of the full comment dump that the later cells read from
data_path = "text_comments.csv"

# Fetch the VADER lexicon, then build the shared sentiment analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

We first explore the data in `text_comments.csv` by printing some rows.

In [None]:
# Read the CSV lazily, chunk_size rows at a time, instead of loading it whole
chunk_size = 40000
tf_chunks = pd.read_csv(data_path, chunksize=chunk_size)

# Peek at the first 10 rows (this consumes them from the reader)
preview_rows = tf_chunks.get_chunk(10)
print(preview_rows)

We explore how many comments a subreddit could have.

In [None]:
# Pull the next full chunk from the reader and count comments per subreddit,
# keeping missing subreddit values as their own bucket
first_chunk = next(tf_chunks)
first_chunk.subreddit.value_counts(dropna=False)

Next, we introduce the sentiment analysis tool called VADER (Valence Aware Dictionary and sEntiment Reasoner). The sentiment scores are represented as a dictionary with the following keys:

    'neg': Negative sentiment score (proportion of the text that is negative)
    'neu': Neutral sentiment score (proportion of the text that is neutral)
    'pos': Positive sentiment score (proportion of the text that is positive)
    'compound': Compound sentiment score (a normalized, weighted composite score that represents the overall sentiment of the text)


In [None]:
# Example sentence to score (swap in any text)
text = "I really enjoyed working with my team. They are so helpful and supportive."

# Build an analyzer and score the sentence; the result is a dict with the
# 'neg', 'neu', 'pos' and 'compound' keys described above
sid = SentimentIntensityAnalyzer()
sentiment_scores = sid.polarity_scores(text)

print("Sentiment Scores:", sentiment_scores)

Now, we take a 1-in-100 sample of the dataset (10 rows from every 1,000-row chunk). (Do not run this; we have already made the sample.)

In [None]:
# Number of rows read from the source CSV per chunk
chunk_size = 1000

# Number of rows drawn from each chunk (10 of 1000 -> a 1-in-100 sample)
sample_size = 10

# Destination of the down-sampled dataset
output_csv_path = 'sample.csv'

# True until the first sampled chunk has been written. It controls both the
# header row and whether the output file is created fresh or appended to.
first_chunk = True

# Chunked reader over the full dataset
csv_reader = pd.read_csv(data_path, chunksize=chunk_size, encoding='utf-8')

# Sample each chunk and write the sampled rows to the output CSV file
for i, chunk in enumerate(csv_reader):
    try:
        # The final chunk may hold fewer than sample_size rows, and sample()
        # raises when asked for more rows than exist, so cap the request.
        n_rows = min(sample_size, len(chunk))
        sampled_chunk = chunk.sample(n=n_rows, random_state=42)  # Adjust random_state as needed

        # Overwrite on the first write so that re-running the cell does not
        # append a second copy (and a second header line) to an existing file.
        sampled_chunk.to_csv(
            output_csv_path,
            mode='w' if first_chunk else 'a',
            index=False,
            header=first_chunk,
        )

        # Only flipped after a successful write, so a failed first chunk still
        # leaves the header to be written by the next one.
        first_chunk = False
    except Exception as e:
        # Bind the exception so the message can actually be formatted; report
        # it and carry on with the remaining chunks.
        print(f"Error in chunk {i + 1}: {e}")

We explore the sample dataset.

In [None]:
# Load the pre-built sample and preview its first 15 rows
df = pd.read_csv('sample.csv')
df.head(n=15)

We find and remove "deleted" users or "removed" comments and any null values, and calculate the resulting number of rows.

In [None]:
# Flag rows whose author or body is empty, missing, or a deletion placeholder
author_is_garbage = df['author'].isna() | df['author'].isin(['', '[deleted]'])
body_is_garbage = df['body'].isna() | df['body'].isin(['', '[removed]'])
garbage_rows = author_is_garbage | body_is_garbage

# Number of flagged rows
garbage_count = int(garbage_rows.sum())
garbage_count

In [None]:
# Number of rows in the sample before the garbage rows are dropped
len(df)

In [None]:
# Keep only the rows that were not flagged, then report how many remain
keep_rows = ~garbage_rows
df = df[keep_rows]
len(df)

We group by `author` and concatenate the body together to produce one aggregated string to be fed into the sentiment analyzer.

In [None]:
# One row per author: all of that author's comment bodies joined by spaces
bodies_by_author = df.groupby('author')['body']
grouped_df = bodies_by_author.agg(' '.join).reset_index()
grouped_df.head(10)

We create a function that takes a text string and produces a sentiment score.

In [None]:
def sentiment(text: str) -> float:
  """Return the 'compound' polarity score that the shared analyzer `sid` gives `text`."""
  scores = sid.polarity_scores(text)
  return scores['compound']

We apply the function to generate the sentiment score for a sample of 10 users, and print the entries out to check whether the generated scores make sense for each text body. With the exception of `JakeFitzy7` and `proximateprose`, the sentiment scores seem to make sense. The sentiment analyzer was able to get 8/10 right, which is better than average (at least for this sample). At first glance, it seems to work better for true positives compared to true negatives.

In [None]:
# Score a reproducible sample of 10 authors
demo_sample = grouped_df.sample(n=10, random_state=42)
demo_sample['sentiment'] = demo_sample['body'].apply(sentiment)

# Print each sampled author with their text and score for a manual sanity check
for _, row in demo_sample.iterrows():
    print(f"Author: {row['author']}\nBody: {row['body']}\nSentiment: {row['sentiment']}\n")

GS **Part**

In [None]:
# Get Bigger Random Sample to train community embeddings
# Estimate of the full CSV's row count: the sample kept 10 of every 1000 rows,
# so the sample size is scaled back up by 100.
# NOTE(review): `df` has already had its garbage rows removed at this point, so
# this presumably underestimates the real row count — confirm.
full_size = len(df)*100
# Presumably the number of rows to keep for training (see the slice below).
train_size = 400000
# Positional column indices handed to read_csv(usecols=...) in the next cell.
# NOTE(review): confirm which columns 3, 4 and 6 are.
meta_cols = [3,4,6]

print(full_size)

np.random.seed(0) # fix seed
# Candidate line numbers to skip. Starts at 1, presumably so that line 0 (the
# header) is never skipped — confirm.
to_skip = np.arange(1, full_size)
np.random.shuffle(to_skip)
# Keep only the first full_size-train_size+1 shuffled line numbers as the skip list.
# NOTE(review): if full_size < train_size the bound goes negative and the slice
# changes meaning — confirm the sizes before relying on this.
to_skip = to_skip[:full_size-train_size+1]
to_skip




In [None]:
# Read the full CSV again, skipping the line numbers in `to_skip` and loading
# only the columns listed in `meta_cols`
train_df = pd.read_csv(
    data_path,
    usecols=meta_cols,
    skiprows=to_skip,
)


Identify unique authors and subreddits, and observe distribution of posts

In [None]:

# Prefix author names with a symbol so that they stay distinguishable from
# subreddit names when both are used as tokens in the same embedding
sign = '/'

# Build a new frame instead of assigning into `df` in place (`df` is a filtered
# slice of the sample), and only prefix names that do not carry the symbol yet,
# so re-running this cell does not stack prefixes ('//name').
# NOTE(review): assumes no raw author name already starts with '/' — confirm.
has_sign = df['author'].str.startswith(sign)
df = df.assign(author=df['author'].where(has_sign, sign + df['author']))

# Number of comment rows per author and per subreddit, as one-column frames
# whose single column is named 'count'
uniqueAuthors = df.groupby('author').size().to_frame('count')
uniqueReddits = df.groupby('subreddit').size().to_frame('count')

#train_uniqueAuthors = train_df.groupby('author').agg({'score': 'size'}).rename(columns={'score': 'count'})
#train_uniqueReddits = train_df.groupby('subreddit').agg({'score': 'size'}).rename(columns={'score': 'count'})

In [None]:

print(df.head(5))


def _plot_top_counts(counts, title, xlabel, top_n=200):
    """Log-log plot of the `top_n` largest entries of `counts['count']`.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame with a numeric 'count' column, indexed by the labelled entity.
    title : str
        Figure title.
    xlabel : str
        Label for the x axis.
    top_n : int
        How many of the largest entries to plot.
    """
    # Sort BEFORE slicing: slicing first would plot whichever rows happen to
    # come first in index order, not the most active ones. Selecting only the
    # 'count' column keeps the plot working after other columns are added to
    # the frame by later cells.
    top = counts[['count']].sort_values(by='count', ascending=False).head(top_n)

    plt.loglog(top)
    plt.title(title)
    plt.ylabel('number of contributions')
    plt.xlabel(xlabel)
    plt.xticks(rotation=90)
    plt.grid()
    plt.show()


_plot_top_counts(uniqueAuthors, 'Author Contributions', 'author')
_plot_top_counts(uniqueReddits, 'Subreddit Activity', 'subreddit')



Generate Pairs of D

In [None]:
# One row per (community, user) pair, i.e. (ci, uj) for every user j who
# commented in community ci: how many comments, and the earliest timestamp
pairs = df.groupby(['subreddit', 'author'])
communityUsers = pairs.agg(
    score=('score', 'size'),
    created_utc=('created_utc', 'min'),
)
communityUsers.head(5)

In [None]:
# Values passed to Word2Vec below as `alpha` and `vector_size`
alpha = .18
size = 150

# Every (subreddit, author) pair in the grouped index is one training example
train_tuples = communityUsers.index
print(train_tuples)

In [None]:
# Fit embeddings on the (subreddit, author) index pairs; each pair is handed to
# Word2Vec as one training item.
# NOTE(review): gensim's constructor presumably already builds the vocabulary
# and trains when it is given a corpus, in which case the explicit train() call
# below adds 2 further epochs on top — confirm this is intended.
# NOTE(review): no min_count is passed, so gensim's default presumably drops
# rare tokens from the vocabulary — confirm (a later cell filters out
# subreddits missing from the embedding).
model = Word2Vec(train_tuples, alpha=alpha, vector_size=size)
model.train(train_tuples, total_examples=len(train_tuples), epochs=2)

In [None]:
# Collect the normalised vectors of every vocabulary token into one frame
wvs = model.wv.get_normed_vectors()
keys = model.wv.key_to_index.keys()
all_embeddings = pd.DataFrame(wvs, index=keys)

# Author tokens carry the `sign` prefix; keep only the remaining (subreddit) rows
is_author = all_embeddings.index.str.startswith(sign)
communityEmbeddings = all_embeddings[~is_author]
print(communityEmbeddings)

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

# Project the subreddit embeddings onto three principal components
reducer = PCA(n_components=3)
wv_pca = reducer.fit_transform(communityEmbeddings)
pca_df = pd.DataFrame(wv_pca, columns=['x', 'y', 'z'], index=communityEmbeddings.index)
print(pca_df)

# 3-D scatter of the first 200 subreddits, each point labelled with its name
shown = pca_df.iloc[:200]
fig = px.scatter_3d(shown, x='x', y='y', z='z', text=shown.index)
fig.show(renderer='colab')


In [None]:
# Calculate COM for all users: the mean embedding of the subreddits an author
# commented in (one term per comment row).
# 0 is the "not computed" placeholder that the GS cell tests for. The column is
# created with object dtype because the computed values are lists, which cannot
# be written into an integer column without an implicit dtype change.
uniqueAuthors['com'] = pd.Series(0, index=uniqueAuthors.index, dtype=object)

print(len(uniqueReddits))
print(communityEmbeddings.index)

# Filter out subreddits not in embedding, so every .loc lookup below succeeds
df1 = df[df.subreddit.isin(communityEmbeddings.index)]
print(df1)

# .to_list() makes each aggregated value a plain Python list, one per author
temp = df1.groupby('author').agg({'subreddit':
        lambda srs: communityEmbeddings.loc[srs].mean().to_list()})

# Authors with no comment in an embedded subreddit keep the 0 placeholder
uniqueAuthors.loc[temp.index, 'com'] = temp.subreddit

print(uniqueAuthors)

In [None]:
# Calculate GS Score


#print(sample)
#print(uniqueAuthors.loc[sample.index.get_level_values('author'), 'com'], communityEmbeddings.loc[sample.index.get_level_values('subreddit')])
#sample['dot'] = np.dot(uniqueAuthors.loc[sample.index.get_level_values('author'), 'com'],
 #                      communityEmbeddings.loc[sample.index.get_level_values('subreddit')])




# Authors whose `com` is still the 0 placeholder were never assigned a vector;
# leave them out
com_computed = uniqueAuthors['com'].ne(0)
curatedAuthors = uniqueAuthors[com_computed]

# compute dot products
def get_gs(srs):
  """Score one author's subreddit activity against that author's `com` vector.

  Parameters
  ----------
  srs : pandas.Series
      The `subreddit` values of a single author's rows in `df1`. Its first
      index label is used to look the author up in `df1`.

  Returns
  -------
  float
      Mean dot product between each listed subreddit's embedding and the
      author's `com` vector, divided by the Euclidean norm of `com`.
  """
  # Every row of the group belongs to the same author, so resolve the author
  # and their `com` vector once rather than once per subreddit.
  author = df1.loc[srs.index[0]].author
  com = np.asarray(curatedAuthors.loc[author, 'com'])

  # One embedding row per comment, in group order (repeated subreddits included)
  vectors = communityEmbeddings.loc[srs].to_numpy()

  gs = (vectors @ com).sum() / srs.size
  return gs / np.linalg.norm(com)

# -1 is the initial placeholder for authors without a GS value
uniqueAuthors['gs'] = -1

# One GS value per author, computed from that author's subreddit column
gs = df1.groupby('author')['subreddit'].agg(get_gs)
print(gs)

gs.plot.hist(bins=20)

# The two bare string literals below are expressions, not executed code: the
# first holds scratch code from an earlier attempt, the second is an empty
# string that ends the cell.
"""
# This is a partial calculation (we still need to divide by com norm and )
temp = df1.groupby('author').agg(
    {'subreddit':
       lambda srs: (1. / srs.nunique()) \
       * srs.apply(lambda v: np.dot(v, uniqueAuthors[df1.loc[srs.index].author)) \
        })
uniqueAuthors.loc[temp.index, 'gs'] = temp.subreddit


communityEmbeddings.loc[df1.loc[df1.author == x.index].subreddit].apply(
        lambda v:
        np.dot(v, uniqueAuthors.loc[x.index].com)"""


""




In [None]:
# Histogram of the GS values over 50 evenly spaced bin edges from 0 to 2
plt.hist(gs, bins=np.linspace(0, 2, num=50))

In [None]:
# Overwrite the 'gs' column with the per-author values (aligned on author)
uniqueAuthors['gs'] = gs

In [None]:
print(curatedAuthors.com.values)

# Norm of every curated author's `com` vector (np.linalg.norm with its defaults)
norms = curatedAuthors.com.apply(np.linalg.norm)


In [None]:

print(temp, norms)

# No further division by `norms` here: get_gs already divides by the norm of
# `com`, and `temp` holds the per-author `com` lists rather than partial
# scores, so `gs` is printed as computed.
print(gs)

# Per-author sentiment: join all of an author's comment bodies into one string
# and score it, mirroring how `grouped_df` was built. The comment text is read
# from the `body` column, as in the cleaning cells.
ads = df1.groupby('author')['body'].agg(' '.join).apply(sentiment)

In [None]:
# A new column has to be created with item assignment. Attribute assignment
# (`uniqueAuthors.ad = ...`) only sets a Python attribute on the frame, so the
# `uniqueAuthors['ad']` lookups below would raise KeyError.
uniqueAuthors['ad'] = ads

plt.scatter(uniqueAuthors['ad'], uniqueAuthors['gs'])
plt.xlabel('ad')
plt.ylabel('gs')
plt.title('Scatter Plot of ad vs gs')
plt.show()

# Correlation between the two per-author scores
correlation = uniqueAuthors['ad'].corr(uniqueAuthors['gs'])

print(f"Correlation between 'ad' and 'gs': {correlation}")