In [None]:
#import packages

import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb

from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import scipy.stats as stats

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()

from pylab import rcParams

import os

%matplotlib inline

### Submission Data

In [None]:
# Folder whose *.csv files are read by the submission-loading cell below.
# The path is relative, so it is resolved against the notebook's working directory.
directory = 'TerraLuna'

In [None]:
def process_data(path):
    """Read one submissions CSV and keep only the columns used downstream.

    Parameters
    ----------
    path : str
        Location of a CSV file. When the file has rows it must contain the
        columns ``created_utc`` (epoch seconds), ``id``, ``subreddit``,
        ``selftext``, ``title`` and ``author``.

    Returns
    -------
    pandas.DataFrame
        For a file with rows: the columns ``id, subreddit, selftext, title,
        date, author`` (in that order), where ``date`` is the calendar date
        derived from ``created_utc``. A file without rows is returned exactly
        as it was read, i.e. with all of its original columns.
    """
    raw = pd.read_csv(path)
    if raw.empty:
        # Nothing to derive a date from; hand the frame back untouched.
        return raw

    posted_on = pd.to_datetime(raw['created_utc'], unit='s').dt.date
    wanted = ['id', 'subreddit', 'selftext', 'title', 'date', 'author']
    return raw.assign(date=posted_on)[wanted]

In [None]:
# Load every submissions CSV found in `directory` into a single frame.
# The per-file frames are collected first and concatenated once; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# and that order decides which row drop_duplicates keeps further down.
submission_frames = [
    process_data(os.path.join(directory, filename))
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".csv") and os.path.isfile(os.path.join(directory, filename))
]
if not submission_frames:
    raise FileNotFoundError(f"No .csv files found in {directory!r}")

df_terraluna = pd.concat(submission_frames)

# Drop unnecessary column.
# process_data already trims non-empty files to six columns, so 'Unnamed: 0'
# is only present when an empty file was passed through unchanged.
# errors='ignore' keeps this step from raising KeyError in the usual case.
df_terraluna = df_terraluna.drop(columns='Unnamed: 0', errors='ignore')

# Replace the per-file row numbers with one continuous index
df_terraluna = df_terraluna.reset_index(drop=True)

# Remove submissions whose body is the literal placeholder '[removed]'
df_terraluna = df_terraluna[df_terraluna.selftext != '[removed]']

# Drop dupes (the same submission id can appear in more than one file)
df_terraluna = df_terraluna.drop_duplicates(subset=['id'])

In [None]:
len(df_terraluna)

In [None]:
df_terraluna=df_terraluna.fillna('')

In [None]:
# Combine title and selftext

df_terraluna['all_text'] = df_terraluna.selftext.astype(str) + ' ' + df_terraluna.title.astype(str)
df_terraluna.head()

In [None]:
# Make text lower
df_terraluna['all_text'] = df_terraluna['all_text'].astype(str).str.lower()
df_terraluna.head()

In [None]:
# count how many submissions per day

pd.pivot_table(df_terraluna, index='date', values='all_text', columns='subreddit', aggfunc='count')

### Comments Data

In [None]:
# Read the comments export, drop rows whose text is the '[removed]' placeholder,
# derive the bare parent id and switch to snake_case column names.
df_terraluna_comments = (
    pd.read_csv('TerraLuna comments/terraluna_comments.csv', index_col=0, lineterminator='\n')
    # Clean Removed Comments
    .loc[lambda frame: frame['Comment'] != '[removed]']
    # Cleaned parent ID: everything after the first three characters
    # (presumably the Reddit "t1_"/"t3_" type prefix - confirm against the export)
    .assign(parent_id_clean=lambda frame: frame['Parent ID'].str[3:])
    # errors="raise": fail loudly if the export is missing any expected column
    .rename(
        columns={
            "Submission ID": "submission_id",
            "Parent ID": "parent_id",
            "Comment ID": "comment_id",
            "Comment": "comment",
            "Author": 'author',
        },
        errors="raise",
    )
)

In [None]:
len(df_terraluna_comments)

In [None]:
df_terraluna_comments.head()

In [None]:
# Fill NA
df_terraluna_comments=df_terraluna_comments.fillna('')

# Make text lower
df_terraluna_comments['comment'] = df_terraluna_comments['comment'].astype(str).str.lower()

# Get dates
df_terraluna_lookup = df_terraluna[['id','date', 'subreddit']]
df_terraluna_lookup_date = df_terraluna_lookup.set_index("id").loc[:, "date"]
df_terraluna_lookup_subreddit = df_terraluna_lookup.set_index("id").loc[:, "subreddit"]
df_terraluna_comments=df_terraluna_comments.assign(date=df_terraluna_comments.submission_id.map(df_terraluna_lookup_date))
df_terraluna_comments=df_terraluna_comments.assign(subreddit=df_terraluna_comments.submission_id.map(df_terraluna_lookup_subreddit))
df_terraluna_comments.head()

### Combine Data

In [None]:
df_terraluna.head()

In [None]:
# Get columns for main data

df_terraluna_combine = df_terraluna[['id', 'subreddit', 'all_text', 'date', 'author']]
df_terraluna_combine.head()

In [None]:
df_terraluna_comments.head()

In [None]:
# Get columns for comments

df_terraluna_comments = df_terraluna_comments.rename(columns={'comment': 'all_text', 'comment_id':'id'})
df_terraluna_comments_combine = df_terraluna_comments[['id', 'subreddit', 'all_text', 'date', 'author', 'parent_id_clean', 'submission_id']]
df_terraluna_comments_combine.head()

In [None]:
# Combine data

df_all_combined = pd.concat([df_terraluna_combine, df_terraluna_comments_combine])
df_all_combined = df_all_combined.reset_index()
df_all_combined = df_all_combined.drop(['index'], axis = 1)
df_all_combined.head()

In [None]:
# Get parent author

# Lookup table: post id -> author.
# De-duplicate on `id` alone. Series.map needs a uniquely-labelled lookup index;
# dropping duplicates on the (id, author) pair still leaves a repeated id when
# the same id occurs with two different author values, and the map then raises.
df_all_combined_lookup = (
    df_all_combined[['id', 'author']]
    .drop_duplicates(subset='id')
    .set_index("id")["author"]
)

# parent_id_clean is NaN for submissions (the column only exists on the comment
# side of the concat), so their parent_author stays NaN as well.
df_all_combined = df_all_combined.assign(
    parent_author=df_all_combined.parent_id_clean.map(df_all_combined_lookup)
)

df_all_combined.head()

In [None]:
# Add type of post

df_all_combined['type'] = np.nan

df_all_combined['type'] = np.where(df_all_combined['parent_id_clean'].isna(), 'submission', 'comment')

In [None]:
df_all_combined.head()

### EDA

In [None]:
# Daily post counts, split by subreddit and by submission/comment
df_all_combined.pivot_table(index='date', columns=['subreddit', 'type'], values='all_text', aggfunc='count')

### Preprocess Data

In [None]:
# NLTK Package and Regex Tokenizer

import nltk
from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of word characters.
# Raw string: in a normal literal '\w' is an invalid escape sequence, which
# Python reports as a DeprecationWarning (3.6+) and a SyntaxWarning (3.12+).
regexp = RegexpTokenizer(r'\w+')

In [None]:
# Replace words meant to be together into one word

# regex=False matches the text literally. Under regex matching (the default
# before pandas 2.0) the '.' in 'crypto.com' matches any character.
df_all_combined['all_text'] = df_all_combined['all_text'].str.replace('crypto.com', 'cryptocom', regex=False)
# This also turns "do kwon’s" into "dokwon’s". A separate replacement for the
# possessive form could never match after this line, so it has been removed.
df_all_combined['all_text'] = df_all_combined['all_text'].str.replace('do kwon', 'dokwon', regex=False)

# Tokenize all_text

df_all_combined['text_token'] = df_all_combined['all_text'].apply(regexp.tokenize)

# Get English Stopwords
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words("english")

# Extend the list with custom stopwords
my_stopwords = ['https', 'nan', 'removed', 'amp', 'x200b', 'com', 'www', '000']
stopwords.extend(my_stopwords)

# Membership is tested once per token of every document; a set makes each
# test O(1) instead of a scan through the whole list.
stopword_set = set(stopwords)

df_all_combined['text_token'] = df_all_combined['text_token'].apply(
    lambda tokens: [token for token in tokens if token not in stopword_set]
)
df_all_combined.head()

In [None]:
# Only keep words with length more than 2

df_all_combined['text_string'] = df_all_combined['text_token'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))
df_all_combined.head()

In [None]:
nltk.download('punkt')

In [None]:
# Create list of all words
all_words = ' '.join([word for word in df_all_combined['text_string']])

# Tokenize all words
tokenized_words = nltk.tokenize.word_tokenize(all_words)

In [None]:
# Get distribution of words

from nltk.probability import FreqDist

fdist = FreqDist(tokenized_words)
fdist.most_common(500)

In [None]:
# Only get words with occurence of 5 or more

df_all_combined['text_string_fdist'] = df_all_combined['text_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 5 ]))
df_all_combined.head()


In [None]:
# Download for Lemmatization

nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer

wordnet_lem = WordNetLemmatizer()

# get lemmatized words
df_all_combined['text_string_lem'] = df_all_combined['text_string_fdist'].apply(wordnet_lem.lemmatize)

# check if the columns are equal
df_all_combined['is_equal']=(df_all_combined['text_string_fdist']==df_all_combined['text_string_lem'])
df_all_combined.head()

In [None]:
# show level count

df_all_combined.is_equal.value_counts()

In [None]:
all_words_lem = ' '.join([word for word in df_all_combined['text_string_lem']])

In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

words = nltk.word_tokenize(all_words_lem)
fd = FreqDist(words)

In [None]:
fd.most_common(500)

In [None]:
# Obtain top 10 words
top_10 = fd.most_common(10)

# Create pandas series to make plotting easier
fdist = pd.Series(dict(top_10))

In [None]:
import seaborn as sns
sns.set_theme(style="ticks")

sns.barplot(y=fdist.index, x=fdist.values, color='blue');

In [None]:
df_all_combined.head()

### Sentiment Analysis

In [None]:
# Fetch the 'vader_lexicon' resource through NLTK's downloader
# (presumably the lexicon SentimentIntensityAnalyzer loads - confirm in the NLTK docs)
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# One analyzer instance, reused for every row scored below
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Submission Data

# Using Polarity
df_all_combined['polarity'] = df_all_combined['text_string_lem'].apply(lambda x: analyzer.polarity_scores(x))
df_all_combined.tail()

In [None]:
# Change data structure
df_all_combined = pd.concat(
    [df_all_combined, 
     df_all_combined['polarity'].apply(pd.Series)], axis=1)
df_all_combined.head()

In [None]:
df_all_combined['compound']

In [None]:
# Create new sentiment variable
df_all_combined['sentiment'] = df_all_combined['compound'].apply(lambda x: 'positive' if x >0 else 'neutral' if x==0 else 'negative')
df_all_combined.head()

In [None]:
# Number of submissions/comments by sentiment
sns.countplot(y='sentiment', 
             data=df_all_combined, 
             order=['positive', 'neutral', 'negative'], 
             palette=['#b2d8d8',"#008080", '#db3d13']
             );

In [None]:
# Number of submissions by sentiment
sns.countplot(y='sentiment', 
             data=df_all_combined[df_all_combined.type=='submission'], 
             order=['positive', 'neutral', 'negative'], 
             palette=['#b2d8d8',"#008080", '#db3d13']
             );

In [None]:
# Number of comments by sentiment
sns.countplot(y='sentiment', 
             data=df_all_combined[df_all_combined.type=='comment'], 
             order=['positive', 'neutral', 'negative'], 
             palette=['#b2d8d8',"#008080", '#db3d13']
             );

In [None]:
# count how many submissions per day
pd.pivot_table(df_all_combined, index='date', values='all_text', columns=['sentiment', 'type'] , aggfunc='count')

### Network Analysis

In [None]:
# Get all edges 

df_all_combined_networkx_edges = df_all_combined[['author', 'parent_author']]
df_all_combined_networkx_edges = df_all_combined_networkx_edges[df_all_combined_networkx_edges.parent_author.notna()]

df_all_combined_networkx_edges = df_all_combined_networkx_edges[df_all_combined_networkx_edges.author!='ccModBot']
df_all_combined_networkx_edges = df_all_combined_networkx_edges[df_all_combined_networkx_edges.parent_author!='ccModBot']

df_all_combined_networkx_edges = df_all_combined_networkx_edges[df_all_combined_networkx_edges.author!='AutoModerator']
df_all_combined_networkx_edges = df_all_combined_networkx_edges[df_all_combined_networkx_edges.parent_author!='AutoModerator']

df_all_combined_networkx_edges = df_all_combined_networkx_edges[df_all_combined_networkx_edges.author!='']
df_all_combined_networkx_edges = df_all_combined_networkx_edges[df_all_combined_networkx_edges.parent_author!='']

df_all_combined_networkx_edges.head()

In [None]:
# Get all nodes 

df_all_combined_networkx_nodes = df_all_combined[['author']]
df_all_combined_networkx_nodes = df_all_combined_networkx_nodes.drop_duplicates()

df_all_combined_networkx_nodes = df_all_combined_networkx_nodes[df_all_combined_networkx_nodes.author!='ccModBot']
df_all_combined_networkx_nodes = df_all_combined_networkx_nodes[df_all_combined_networkx_nodes.author!='AutoModerator']
df_all_combined_networkx_nodes = df_all_combined_networkx_nodes[df_all_combined_networkx_nodes.author!='']

df_all_combined_networkx_nodes.head()

In [None]:
print(len(df_all_combined_networkx_edges))
print(len(df_all_combined_networkx_nodes))

In [None]:
# Try add weights from frequency of edges

df_all_combined_networkx_edges['weight'] = df_all_combined_networkx_edges.groupby(['author', 'parent_author'])['author'].transform('size')
df_all_combined_networkx_edges.head()


In [None]:
import networkx as nx

# Create an empty graph
G = nx.Graph()

In [None]:
# Add nodes to the graph
for author in df_all_combined_networkx_nodes.author:
    G.add_node(author)

In [None]:
# Add edges using pandas, to incorporate weights

G = nx.from_pandas_edgelist(df_all_combined_networkx_edges, 'author', 'parent_author',
                            create_using=nx.Graph(), edge_attr='weight')

In [None]:
# Print the number of nodes and edges in the graph
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())

In [None]:
# Centrality measure 1: degree centrality of every node in G.
# No weight argument is passed in any of the four centrality calls in this
# section, so the edge weights are presumably ignored - confirm in the networkx docs.
degree_centrality = nx.degree_centrality(G)

# Sort the (node, score) pairs by score, highest first, and print the first five
print("Top 5 nodes based on degree centrality:")
sorted_degree_centrality = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)
for node, centrality in sorted_degree_centrality[:5]:
    print(f"{node}: {centrality}")

In [None]:
# Centrality measure 2: closeness centrality of every node in G
closeness_centrality = nx.closeness_centrality(G)

# Sort the (node, score) pairs by score, highest first, and print the first five
print("Top 5 nodes based on closeness centrality:")
sorted_closeness_centrality = sorted(closeness_centrality.items(), key=lambda x: x[1], reverse=True)
for node, centrality in sorted_closeness_centrality[:5]:
    print(f"{node}: {centrality}")

In [None]:
# Centrality measure 3: betweenness centrality of every node in G
betweenness_centrality = nx.betweenness_centrality(G)

# Sort the (node, score) pairs by score, highest first, and print the first five
print("Top 5 nodes based on betweenness centrality:")
sorted_betweenness_centrality = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)
for node, centrality in sorted_betweenness_centrality[:5]:
    print(f"{node}: {centrality}")

In [None]:
# Centrality measure 4: eigenvector centrality of every node in G
eigenvector_centrality = nx.eigenvector_centrality(G)

# Sort the (node, score) pairs by score, highest first, and print the first five.
# sorted_eigenvector_centrality is read again by the "Total" cells further down.
print("Top 5 nodes based on eigen centrality:")
sorted_eigenvector_centrality = sorted(eigenvector_centrality.items(), key=lambda x: x[1], reverse=True)
for node, centrality in sorted_eigenvector_centrality[:5]:
    print(f"{node}: {centrality}")

### Subgraphs

In [None]:
# Plot only the node with the highest eigenvector centrality together with its
# direct neighbours

sorted_eigenvector_centrality_2 = sorted(eigenvector_centrality, key=eigenvector_centrality.get, reverse=True)
top_nodes = sorted_eigenvector_centrality_2[:1]  # Select top node only

# The top node(s) plus everything directly connected to them.
# A subgraph of the top node alone would be a single dot with no edges.
top_nodes_and_neighbors = set(top_nodes)
for top_node in top_nodes:
    top_nodes_and_neighbors.update(G.neighbors(top_node))

subgraph = G.subgraph(top_nodes_and_neighbors)

# Every drawing call below targets `subgraph`. Previously `subgraph` was
# computed but all calls drew the full graph G, contradicting the cell's purpose.
# NOTE(review): the weights are group sizes from the edge-weight cell
# (presumably always >= 1), so with a 0.5 threshold `esmall` stays empty -
# confirm, then pick a more meaningful threshold if the split matters.
elarge = [(u, v) for (u, v, d) in subgraph.edges(data=True) if d["weight"] > 0.5]
esmall = [(u, v) for (u, v, d) in subgraph.edges(data=True) if d["weight"] <= 0.5]

pos = nx.spring_layout(subgraph, seed=7)  # positions for all nodes - seed for reproducibility

# nodes
nx.draw_networkx_nodes(subgraph, pos, node_size=700)

# edges
nx.draw_networkx_edges(subgraph, pos, edgelist=elarge, width=6)
nx.draw_networkx_edges(
    subgraph, pos, edgelist=esmall, width=6, alpha=0.5, edge_color="b", style="dashed"
)

# node labels
nx.draw_networkx_labels(subgraph, pos, font_size=20, font_family="sans-serif")
# edge weight labels
edge_labels = nx.get_edge_attributes(subgraph, "weight")
nx.draw_networkx_edge_labels(subgraph, pos, edge_labels)

ax = plt.gca()
ax.margins(0.08)
plt.axis("off")
plt.tight_layout()
plt.show()

### Split Data into Five Phases

In [None]:
import datetime

In [None]:
df_all_combined_phase1 = df_all_combined[df_all_combined.date<=datetime.datetime.strptime('2022-05-06', '%Y-%m-%d').date()]
df_all_combined_phase2 = df_all_combined[(df_all_combined.date>=datetime.datetime.strptime('2022-05-07', '%Y-%m-%d').date()) & (df_all_combined.date<=datetime.datetime.strptime('2022-05-09', '%Y-%m-%d').date())]
df_all_combined_phase3 = df_all_combined[(df_all_combined.date>=datetime.datetime.strptime('2022-05-10', '%Y-%m-%d').date()) & (df_all_combined.date<=datetime.datetime.strptime('2022-05-13', '%Y-%m-%d').date())]
df_all_combined_phase4 = df_all_combined[(df_all_combined.date>=datetime.datetime.strptime('2022-05-14', '%Y-%m-%d').date()) & (df_all_combined.date<=datetime.datetime.strptime('2022-05-28', '%Y-%m-%d').date())]
df_all_combined_phase5 = df_all_combined[(df_all_combined.date>=datetime.datetime.strptime('2022-05-29', '%Y-%m-%d').date())]

In [None]:
# Number of submissions/comments by sentiment, one chart per phase.
# The five charts used to be copy-pasted cells without titles, so the figures
# could not be told apart; each one is now labelled with its phase and dates.
phase_frames = {
    'Phase 1 (up to 2022-05-06)': df_all_combined_phase1,
    'Phase 2 (2022-05-07 to 2022-05-09)': df_all_combined_phase2,
    'Phase 3 (2022-05-10 to 2022-05-13)': df_all_combined_phase3,
    'Phase 4 (2022-05-14 to 2022-05-28)': df_all_combined_phase4,
    'Phase 5 (from 2022-05-29)': df_all_combined_phase5,
}

for phase_title, phase_df in phase_frames.items():
    fig, ax = plt.subplots()
    sns.countplot(y='sentiment',
                  data=phase_df,
                  order=['positive', 'neutral', 'negative'],
                  palette=['#b2d8d8', "#008080", '#db3d13'],
                  ax=ax)
    ax.set(title=f'Sentiment - {phase_title}', xlabel='Number of posts', ylabel='Sentiment')

plt.show()

In [None]:
# Get all edges 

df_all_combined_networkx_edges_phase1 = df_all_combined_phase1[['author', 'parent_author']]
df_all_combined_networkx_edges_phase1 = df_all_combined_networkx_edges_phase1[df_all_combined_networkx_edges_phase1.parent_author.notna()]

df_all_combined_networkx_edges_phase1 = df_all_combined_networkx_edges_phase1[df_all_combined_networkx_edges_phase1.author!='ccModBot']
df_all_combined_networkx_edges_phase1 = df_all_combined_networkx_edges_phase1[df_all_combined_networkx_edges_phase1.parent_author!='ccModBot']

df_all_combined_networkx_edges_phase1 = df_all_combined_networkx_edges_phase1[df_all_combined_networkx_edges_phase1.author!='AutoModerator']
df_all_combined_networkx_edges_phase1 = df_all_combined_networkx_edges_phase1[df_all_combined_networkx_edges_phase1.parent_author!='AutoModerator']

df_all_combined_networkx_edges_phase1 = df_all_combined_networkx_edges_phase1[df_all_combined_networkx_edges_phase1.author!='']
df_all_combined_networkx_edges_phase1 = df_all_combined_networkx_edges_phase1[df_all_combined_networkx_edges_phase1.parent_author!='']

# Get all nodes 

df_all_combined_networkx_nodes_phase1 = df_all_combined_phase1[['author']]
df_all_combined_networkx_nodes_phase1 = df_all_combined_networkx_nodes_phase1.drop_duplicates()

df_all_combined_networkx_nodes_phase1 = df_all_combined_networkx_nodes_phase1[df_all_combined_networkx_nodes_phase1.author!='ccModBot']
df_all_combined_networkx_nodes_phase1 = df_all_combined_networkx_nodes_phase1[df_all_combined_networkx_nodes_phase1.author!='AutoModerator']
df_all_combined_networkx_nodes_phase1 = df_all_combined_networkx_nodes_phase1[df_all_combined_networkx_nodes_phase1.author!='']

# Try add weights from frequency of edges

df_all_combined_networkx_edges_phase1['weight'] = df_all_combined_networkx_edges_phase1.groupby(['author', 'parent_author'])['author'].transform('size')
df_all_combined_networkx_edges_phase1.head()

print(len(df_all_combined_networkx_edges_phase1))
print(len(df_all_combined_networkx_nodes_phase1))


In [None]:
# Get all edges 

df_all_combined_networkx_edges_phase2 = df_all_combined_phase2[['author', 'parent_author']]
df_all_combined_networkx_edges_phase2 = df_all_combined_networkx_edges_phase2[df_all_combined_networkx_edges_phase2.parent_author.notna()]

df_all_combined_networkx_edges_phase2 = df_all_combined_networkx_edges_phase2[df_all_combined_networkx_edges_phase2.author!='ccModBot']
df_all_combined_networkx_edges_phase2 = df_all_combined_networkx_edges_phase2[df_all_combined_networkx_edges_phase2.parent_author!='ccModBot']

df_all_combined_networkx_edges_phase2 = df_all_combined_networkx_edges_phase2[df_all_combined_networkx_edges_phase2.author!='AutoModerator']
df_all_combined_networkx_edges_phase2 = df_all_combined_networkx_edges_phase2[df_all_combined_networkx_edges_phase2.parent_author!='AutoModerator']

df_all_combined_networkx_edges_phase2 = df_all_combined_networkx_edges_phase2[df_all_combined_networkx_edges_phase2.author!='']
df_all_combined_networkx_edges_phase2 = df_all_combined_networkx_edges_phase2[df_all_combined_networkx_edges_phase2.parent_author!='']

# Get all nodes 

df_all_combined_networkx_nodes_phase2 = df_all_combined_phase2[['author']]
df_all_combined_networkx_nodes_phase2 = df_all_combined_networkx_nodes_phase2.drop_duplicates()

df_all_combined_networkx_nodes_phase2 = df_all_combined_networkx_nodes_phase2[df_all_combined_networkx_nodes_phase2.author!='ccModBot']
df_all_combined_networkx_nodes_phase2 = df_all_combined_networkx_nodes_phase2[df_all_combined_networkx_nodes_phase2.author!='AutoModerator']
df_all_combined_networkx_nodes_phase2 = df_all_combined_networkx_nodes_phase2[df_all_combined_networkx_nodes_phase2.author!='']

# Try add weights from frequency of edges

df_all_combined_networkx_edges_phase2['weight'] = df_all_combined_networkx_edges_phase2.groupby(['author', 'parent_author'])['author'].transform('size')
df_all_combined_networkx_edges_phase2.head()

print(len(df_all_combined_networkx_edges_phase2))
print(len(df_all_combined_networkx_nodes_phase2))


In [None]:
# Get all edges 

df_all_combined_networkx_edges_phase3 = df_all_combined_phase3[['author', 'parent_author']]
df_all_combined_networkx_edges_phase3 = df_all_combined_networkx_edges_phase3[df_all_combined_networkx_edges_phase3.parent_author.notna()]

df_all_combined_networkx_edges_phase3 = df_all_combined_networkx_edges_phase3[df_all_combined_networkx_edges_phase3.author!='ccModBot']
df_all_combined_networkx_edges_phase3 = df_all_combined_networkx_edges_phase3[df_all_combined_networkx_edges_phase3.parent_author!='ccModBot']

df_all_combined_networkx_edges_phase3 = df_all_combined_networkx_edges_phase3[df_all_combined_networkx_edges_phase3.author!='AutoModerator']
df_all_combined_networkx_edges_phase3 = df_all_combined_networkx_edges_phase3[df_all_combined_networkx_edges_phase3.parent_author!='AutoModerator']

df_all_combined_networkx_edges_phase3 = df_all_combined_networkx_edges_phase3[df_all_combined_networkx_edges_phase3.author!='']
df_all_combined_networkx_edges_phase3 = df_all_combined_networkx_edges_phase3[df_all_combined_networkx_edges_phase3.parent_author!='']

# Get all nodes 

df_all_combined_networkx_nodes_phase3 = df_all_combined_phase3[['author']]
df_all_combined_networkx_nodes_phase3 = df_all_combined_networkx_nodes_phase3.drop_duplicates()

df_all_combined_networkx_nodes_phase3 = df_all_combined_networkx_nodes_phase3[df_all_combined_networkx_nodes_phase3.author!='ccModBot']
df_all_combined_networkx_nodes_phase3 = df_all_combined_networkx_nodes_phase3[df_all_combined_networkx_nodes_phase3.author!='AutoModerator']
df_all_combined_networkx_nodes_phase3 = df_all_combined_networkx_nodes_phase3[df_all_combined_networkx_nodes_phase3.author!='']

# Try add weights from frequency of edges

df_all_combined_networkx_edges_phase3['weight'] = df_all_combined_networkx_edges_phase3.groupby(['author', 'parent_author'])['author'].transform('size')
df_all_combined_networkx_edges_phase3.head()

print(len(df_all_combined_networkx_edges_phase3))
print(len(df_all_combined_networkx_nodes_phase3))


In [None]:
# Get all edges 

df_all_combined_networkx_edges_phase4 = df_all_combined_phase4[['author', 'parent_author']]
df_all_combined_networkx_edges_phase4 = df_all_combined_networkx_edges_phase4[df_all_combined_networkx_edges_phase4.parent_author.notna()]

df_all_combined_networkx_edges_phase4 = df_all_combined_networkx_edges_phase4[df_all_combined_networkx_edges_phase4.author!='ccModBot']
df_all_combined_networkx_edges_phase4 = df_all_combined_networkx_edges_phase4[df_all_combined_networkx_edges_phase4.parent_author!='ccModBot']

df_all_combined_networkx_edges_phase4 = df_all_combined_networkx_edges_phase4[df_all_combined_networkx_edges_phase4.author!='AutoModerator']
df_all_combined_networkx_edges_phase4 = df_all_combined_networkx_edges_phase4[df_all_combined_networkx_edges_phase4.parent_author!='AutoModerator']

df_all_combined_networkx_edges_phase4 = df_all_combined_networkx_edges_phase4[df_all_combined_networkx_edges_phase4.author!='']
df_all_combined_networkx_edges_phase4 = df_all_combined_networkx_edges_phase4[df_all_combined_networkx_edges_phase4.parent_author!='']

# Get all nodes 

df_all_combined_networkx_nodes_phase4 = df_all_combined_phase4[['author']]
df_all_combined_networkx_nodes_phase4 = df_all_combined_networkx_nodes_phase4.drop_duplicates()

df_all_combined_networkx_nodes_phase4 = df_all_combined_networkx_nodes_phase4[df_all_combined_networkx_nodes_phase4.author!='ccModBot']
df_all_combined_networkx_nodes_phase4 = df_all_combined_networkx_nodes_phase4[df_all_combined_networkx_nodes_phase4.author!='AutoModerator']
df_all_combined_networkx_nodes_phase4 = df_all_combined_networkx_nodes_phase4[df_all_combined_networkx_nodes_phase4.author!='']

# Try add weights from frequency of edges

df_all_combined_networkx_edges_phase4['weight'] = df_all_combined_networkx_edges_phase4.groupby(['author', 'parent_author'])['author'].transform('size')
df_all_combined_networkx_edges_phase4.head()

print(len(df_all_combined_networkx_edges_phase4))
print(len(df_all_combined_networkx_nodes_phase4))


In [None]:
# Get all edges 

df_all_combined_networkx_edges_phase5 = df_all_combined_phase5[['author', 'parent_author']]
df_all_combined_networkx_edges_phase5 = df_all_combined_networkx_edges_phase5[df_all_combined_networkx_edges_phase5.parent_author.notna()]

df_all_combined_networkx_edges_phase5 = df_all_combined_networkx_edges_phase5[df_all_combined_networkx_edges_phase5.author!='ccModBot']
df_all_combined_networkx_edges_phase5 = df_all_combined_networkx_edges_phase5[df_all_combined_networkx_edges_phase5.parent_author!='ccModBot']

df_all_combined_networkx_edges_phase5 = df_all_combined_networkx_edges_phase5[df_all_combined_networkx_edges_phase5.author!='AutoModerator']
df_all_combined_networkx_edges_phase5 = df_all_combined_networkx_edges_phase5[df_all_combined_networkx_edges_phase5.parent_author!='AutoModerator']

df_all_combined_networkx_edges_phase5 = df_all_combined_networkx_edges_phase5[df_all_combined_networkx_edges_phase5.author!='']
df_all_combined_networkx_edges_phase5 = df_all_combined_networkx_edges_phase5[df_all_combined_networkx_edges_phase5.parent_author!='']

# Get all nodes 

df_all_combined_networkx_nodes_phase5 = df_all_combined_phase5[['author']]
df_all_combined_networkx_nodes_phase5 = df_all_combined_networkx_nodes_phase5.drop_duplicates()

df_all_combined_networkx_nodes_phase5 = df_all_combined_networkx_nodes_phase5[df_all_combined_networkx_nodes_phase5.author!='ccModBot']
df_all_combined_networkx_nodes_phase5 = df_all_combined_networkx_nodes_phase5[df_all_combined_networkx_nodes_phase5.author!='AutoModerator']
df_all_combined_networkx_nodes_phase5 = df_all_combined_networkx_nodes_phase5[df_all_combined_networkx_nodes_phase5.author!='']

# Try add weights from frequency of edges

df_all_combined_networkx_edges_phase5['weight'] = df_all_combined_networkx_edges_phase5.groupby(['author', 'parent_author'])['author'].transform('size')
df_all_combined_networkx_edges_phase5.head()

print(len(df_all_combined_networkx_edges_phase5))
print(len(df_all_combined_networkx_nodes_phase5))


In [None]:
# Create an empty graph
G_phase1 = nx.Graph()

# Add nodes to the graph
for author in df_all_combined_networkx_nodes_phase1.author:
    G_phase1.add_node(author)
    
# Add edges using pandas, to incorporate weights

G_phase1 = nx.from_pandas_edgelist(df_all_combined_networkx_edges_phase1, 'author', 'parent_author',
                            create_using=nx.Graph(), edge_attr='weight')

# Print the number of nodes and edges in the graph
print("Number of nodes:", G_phase1.number_of_nodes())
print("Number of edges:", G_phase1.number_of_edges())

# degree_centrality_phase1 = nx.degree_centrality(G_phase1)
# closeness_centrality_phase1 = nx.closeness_centrality(G_phase1)
# betweenness_centrality_phase1 = nx.betweenness_centrality(G_phase1)
eigenvector_centrality_phase1 = nx.eigenvector_centrality(G_phase1)

In [None]:
# Create an empty graph
G_phase2 = nx.Graph()

# Add nodes to the graph
for author in df_all_combined_networkx_nodes_phase2.author:
    G_phase2.add_node(author)
    
# Add edges using pandas, to incorporate weights

G_phase2 = nx.from_pandas_edgelist(df_all_combined_networkx_edges_phase2, 'author', 'parent_author',
                            create_using=nx.Graph(), edge_attr='weight')

# Print the number of nodes and edges in the graph
print("Number of nodes:", G_phase2.number_of_nodes())
print("Number of edges:", G_phase2.number_of_edges())

# degree_centrality_phase2 = nx.degree_centrality(G_phase2)
# closeness_centrality_phase2 = nx.closeness_centrality(G_phase2)
# betweenness_centrality_phase2 = nx.betweenness_centrality(G_phase2)
eigenvector_centrality_phase2 = nx.eigenvector_centrality(G_phase2)

In [None]:
# Create an empty graph
G_phase3 = nx.Graph()

# Add nodes to the graph
for author in df_all_combined_networkx_nodes_phase3.author:
    G_phase3.add_node(author)
    
# Add edges using pandas, to incorporate weights

G_phase3 = nx.from_pandas_edgelist(df_all_combined_networkx_edges_phase3, 'author', 'parent_author',
                            create_using=nx.Graph(), edge_attr='weight')

# Print the number of nodes and edges in the graph
print("Number of nodes:", G_phase3.number_of_nodes())
print("Number of edges:", G_phase3.number_of_edges())

# degree_centrality_phase3 = nx.degree_centrality(G_phase3)
# closeness_centrality_phase3 = nx.closeness_centrality(G_phase3)
# betweenness_centrality_phase3 = nx.betweenness_centrality(G_phase3)
eigenvector_centrality_phase3 = nx.eigenvector_centrality(G_phase3)

In [None]:
# Create an empty graph
G_phase4 = nx.Graph()

# Add nodes to the graph
for author in df_all_combined_networkx_nodes_phase4.author:
    G_phase4.add_node(author)
    
# Add edges using pandas, to incorporate weights

G_phase4 = nx.from_pandas_edgelist(df_all_combined_networkx_edges_phase4, 'author', 'parent_author',
                            create_using=nx.Graph(), edge_attr='weight')

# Print the number of nodes and edges in the graph
print("Number of nodes:", G_phase4.number_of_nodes())
print("Number of edges:", G_phase4.number_of_edges())

# degree_centrality_phase4 = nx.degree_centrality(G_phase4)
# closeness_centrality_phase4 = nx.closeness_centrality(G_phase4)
# betweenness_centrality_phase4 = nx.betweenness_centrality(G_phase4)
eigenvector_centrality_phase4 = nx.eigenvector_centrality(G_phase4)

In [None]:
# Create an empty graph
G_phase5 = nx.Graph()

# Add nodes to the graph
for author in df_all_combined_networkx_nodes_phase5.author:
    G_phase5.add_node(author)
    
# Add edges using pandas, to incorporate weights

G_phase5 = nx.from_pandas_edgelist(df_all_combined_networkx_edges_phase5, 'author', 'parent_author',
                            create_using=nx.Graph(), edge_attr='weight')

# Print the number of nodes and edges in the graph
print("Number of nodes:", G_phase5.number_of_nodes())
print("Number of edges:", G_phase5.number_of_edges())

# degree_centrality_phase5 = nx.degree_centrality(G_phase5)
# closeness_centrality_phase5 = nx.closeness_centrality(G_phase5)
# betweenness_centrality_phase5 = nx.betweenness_centrality(G_phase5)
eigenvector_centrality_phase5 = nx.eigenvector_centrality(G_phase5)

### Get Nodes with High EigenCentrality

In [None]:
# For every phase: (node, score) pairs ordered by eigenvector centrality, highest first
(
    sorted_eigenvector_centrality_phase1,
    sorted_eigenvector_centrality_phase2,
    sorted_eigenvector_centrality_phase3,
    sorted_eigenvector_centrality_phase4,
    sorted_eigenvector_centrality_phase5,
) = (
    sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for scores in (
        eigenvector_centrality_phase1,
        eigenvector_centrality_phase2,
        eigenvector_centrality_phase3,
        eigenvector_centrality_phase4,
        eigenvector_centrality_phase5,
    )
)

#### Total

In [None]:
print("Top 5 nodes based on eigencentrality, total:")
for node, centrality in sorted_eigenvector_centrality[:5]:
    print("User '" + node + "' posted " + str(len(df_all_combined[df_all_combined.author==node])) + 
          " times, with eigencentrality score of: " + str(centrality))


In [None]:
for node, centrality in sorted_eigenvector_centrality[:5]:
    print("All submissions/comments posted by user '" + node + "' has net sentiment value of: " 
          + f"{np.average(df_all_combined[df_all_combined.author==node].pos - df_all_combined[df_all_combined.author==node].neg):.1%}")


#### LDA

In [None]:
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from gensim.models.coherencemodel import CoherenceModel

In [None]:
def get_num_topics(bow_corpus, dictionary, texts=None, max_topics=10, passes=10, random_state=None):
    """Return the LDA topic count with the highest c_v coherence.

    One ``LdaModel`` is trained for every topic count from 1 to ``max_topics``
    and scored with a ``c_v`` ``CoherenceModel``. The count with the best
    score is returned; on a tie the smallest count wins.

    Parameters
    ----------
    bow_corpus : list of list of (int, int)
        Bag-of-words documents, e.g. produced with ``dictionary.doc2bow``.
        Must be re-iterable.
    dictionary : gensim.corpora.Dictionary
        Id <-> token mapping that ``bow_corpus`` was built with.
    texts : list of list of str, optional
        Tokenized documents behind ``bow_corpus``. When omitted they are
        rebuilt from ``bow_corpus`` and ``dictionary`` instead of being read
        from a notebook-level variable, so the score always refers to the
        corpus that was actually passed in. The rebuilt documents lose the
        original word order; ``c_v`` counts co-occurrences in a sliding
        window, so pass the real token lists if documents are long and exact
        scores matter.
    max_topics : int, default 10
        Largest topic count to try (inclusive).
    passes : int, default 10
        Training passes for each candidate model.
    random_state : int or None, default None
        Seed forwarded to ``LdaModel``; set it to make the selection
        reproducible.

    Returns
    -------
    int
        The selected number of topics.

    Raises
    ------
    ValueError
        If ``max_topics`` is smaller than 1.
    """
    if max_topics < 1:
        raise ValueError("max_topics must be at least 1")

    if texts is None:
        # Expand every (token_id, count) pair back into `count` copies of the
        # token so the coherence model sees the same documents as the LDA model.
        texts = [
            [dictionary[token_id] for token_id, count in doc for _ in range(int(count))]
            for doc in bow_corpus
        ]

    coherence_dict = {}
    for num_topics in range(1, max_topics + 1):
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary,
                             passes=passes, random_state=random_state)

        coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

        # Remember the score of this candidate topic count.
        coherence_dict[num_topics] = coherence_model.get_coherence()

    best_num_topics = max(coherence_dict, key=coherence_dict.get)
    return best_num_topics

In [None]:
def lda_by_user(df_all_combined, node, centrality=None):
    """Fit an LDA topic model on one user's posts and print a short report.

    The report shows how often the user posted, their eigencentrality score,
    the average of ``pos - neg`` over their posts, and the top words of every
    topic. The number of topics is chosen once with ``get_num_topics`` and
    reused for the model, the headline and the topic listing.

    Parameters
    ----------
    df_all_combined : pandas.DataFrame
        Submissions/comments. The columns ``author``, ``text_string_lem``,
        ``pos`` and ``neg`` are read.
    node : str
        Author name to report on.
    centrality : float, optional
        Eigencentrality score to print. When omitted, the notebook-level
        variable ``centrality`` is used, which is what call sites of the form
        ``for node, centrality in ...: lda_by_user(df, node)`` rely on.

    Returns
    -------
    None
        Everything is printed.
    """
    user_rows = df_all_combined[df_all_combined.author == node]

    if user_rows.empty:
        # Possible for a node that has no row as `author` in this frame
        # (e.g. one that only appears as a reply target); there is nothing
        # to model and LdaModel would raise on an empty corpus.
        print("User '" + node + "' has no submissions/comments in this data; skipping LDA.\n")
        return

    if centrality is None:
        # Backward compatibility: fall back to the loop variable that callers
        # leave in the notebook's global namespace.
        centrality = globals().get("centrality")

    # TerraLuna related words are collapsed into a single word/category.
    replacement_value = "terraluna"
    to_be_replaced = {'luna', 'terra', 'ust', 'classic', 'lunc', 'anchor'}

    # Tokens dropped from every document.
    to_be_deleted = {'amp', 'com', 'x200b', 'www', '000', '100', 'd0aqga2jlmypxcg', 'swjsh8'}

    # Lower-case and whitespace-tokenize, then replace first and delete second.
    tokenized_corpus = []
    for doc in user_rows.text_string_lem:
        tokens = [
            replacement_value if token in to_be_replaced else token
            for token in doc.lower().split()
        ]
        tokenized_corpus.append([token for token in tokens if token not in to_be_deleted])

    # Create a dictionary from the tokenized corpus
    dictionary = corpora.Dictionary(tokenized_corpus)

    if len(dictionary) == 0:
        # Every document was empty after cleaning; LDA cannot be trained.
        print("User '" + node + "' has no usable words after preprocessing; skipping LDA.\n")
        return

    # Convert the tokenized corpus to a bag-of-words representation
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_corpus]

    # Select the topic count once: the search trains many models and is not
    # deterministic, so a second call could disagree with the fitted model.
    num_topics_final = get_num_topics(bow_corpus, dictionary)

    lda_model = LdaModel(bow_corpus, num_topics=num_topics_final, id2word=dictionary, passes=10)

    print("User '" + node + "' posted " + str(len(user_rows)) +
          " times, with eigencentrality score of: " + str(centrality) + "\n")

    net_sentiment = np.average(user_rows.pos - user_rows.neg)
    print("All submissions/comments posted by user '" + node + "' has net sentiment value of: "
          + f"{net_sentiment:.1%}" + "\n")

    print("For user " + node + " here are the " + str(num_topics_final) +
          " topics of the submissions/comments posted by them:\n")

    # Print the topics and their corresponding words
    for topic_id in range(num_topics_final):
        print(f"Topic {topic_id + 1}:")
        for word, prob in lda_model.show_topic(topic_id):
            print(f"{word}: {prob}")
        print()

In [None]:
# Topic report for the first five entries of the overall centrality ranking.
# Keep the loop variable name `centrality`: `lda_by_user` is called without the
# score and reads it from this notebook-level variable when printing.
for node, centrality in sorted_eigenvector_centrality[:5]:
    lda_by_user(df_all_combined, node)

#### Top Users by Phases

In [None]:
# Topic report for the five most central phase-1 nodes, using phase-1 posts only.
# Keep the loop variable name `centrality`: `lda_by_user` is called without the
# score and reads it from this notebook-level variable when printing.
for node, centrality in sorted_eigenvector_centrality_phase1[:5]:
    lda_by_user(df_all_combined_phase1, node)

In [None]:
# Topic report for the five most central phase-2 nodes, using phase-2 posts only.
# Keep the loop variable name `centrality`: `lda_by_user` is called without the
# score and reads it from this notebook-level variable when printing.
for node, centrality in sorted_eigenvector_centrality_phase2[:5]:
    lda_by_user(df_all_combined_phase2, node)

In [None]:
# Topic report for the five most central phase-3 nodes, using phase-3 posts only.
# Keep the loop variable name `centrality`: `lda_by_user` is called without the
# score and reads it from this notebook-level variable when printing.
for node, centrality in sorted_eigenvector_centrality_phase3[:5]:
    lda_by_user(df_all_combined_phase3, node)

In [None]:
# Topic report for the five most central phase-4 nodes, using phase-4 posts only.
# Keep the loop variable name `centrality`: `lda_by_user` is called without the
# score and reads it from this notebook-level variable when printing.
for node, centrality in sorted_eigenvector_centrality_phase4[:5]:
    lda_by_user(df_all_combined_phase4, node)

In [None]:
# Topic report for the five most central phase-5 nodes, using phase-5 posts only.
# Keep the loop variable name `centrality`: `lda_by_user` is called without the
# score and reads it from this notebook-level variable when printing.
for node, centrality in sorted_eigenvector_centrality_phase5[:5]:
    lda_by_user(df_all_combined_phase5, node)