# Ethereum

# Data collection 

In [1]:
import zstandard as zstd
import shutil
import os

def decompress_zst(input_file, output_file, max_samples):
    with open(input_file, 'rb') as compressed_file:
        decomp = zstd.ZstdDecompressor()
        with open(output_file, 'wb') as output:
            total_samples = 0
            for chunk in decomp.read_to_iter(compressed_file):
                output.write(chunk)
                total_samples += chunk.count(b'\n')
                if total_samples >= max_samples:
                    break

# List of input files
input_files = ['/Users/shivanipaunikar/Downloads/Project- crypto/EC.zst', '/Users/shivanipaunikar/Downloads/Project- crypto/ES.zst']

# Output directory where decompressed files will be stored
output_directory = '/Users/shivanipaunikar/Downloads/Project- crypto/output'

# Maximum number of samples you want to use
max_samples = 5000

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Loop through each input file and decompress
for input_file in input_files:
    # Generate the output file path by removing the .zst extension and adding .json
    output_file = os.path.join(output_directory, os.path.splitext(os.path.basename(input_file))[0] + '.json')
    
    print(f"Processing: {input_file}")
    
    # Decompress the input file, limiting to max_samples, and save it to the output file path
    decompress_zst(input_file, output_file, max_samples)
    
    print(f"Completed: {input_file} -> {output_file}")

print("All files processed.")


Processing: /Users/shivanipaunikar/Downloads/Project- crypto/EC.zst
Completed: /Users/shivanipaunikar/Downloads/Project- crypto/EC.zst -> /Users/shivanipaunikar/Downloads/Project- crypto/output/EC.json
Processing: /Users/shivanipaunikar/Downloads/Project- crypto/ES.zst
Completed: /Users/shivanipaunikar/Downloads/Project- crypto/ES.zst -> /Users/shivanipaunikar/Downloads/Project- crypto/output/ES.json
All files processed.


In [2]:
import pandas as pd
import json
import os

# Define the output directory where the JSON files were decompressed
output_directory = '/Users/shivanipaunikar/Downloads/Project- crypto/output'

# Function to fix JSON syntax errors
def fix_json_syntax(input_file, output_file):
    with open(input_file, 'r') as input_json, open(output_file, 'w') as output_json:
        for line in input_json:
            try:
                json_obj = json.loads(line)
                corrected_line = json.dumps(json_obj) + '\n'
                output_json.write(corrected_line)
            except json.JSONDecodeError as e:
                # Handle JSON syntax errors here or simply skip problematic lines
                print(f"Skipping line due to JSON syntax error: {line.strip()}")
                continue

# Specify your input and output file paths for fixing 'ES.json' syntax
input_file_path_bs = os.path.join(output_directory, 'ES.json')
output_file_path_bs = os.path.join(output_directory, 'ES_fixed.json')

# Call the function to fix JSON syntax errors in 'Bs.json'
fix_json_syntax(input_file_path_bs, output_file_path_bs)

# Specify your input and output file paths for fixing 'Bc.json' syntax
input_file_path_bc = os.path.join(output_directory, 'EC.json')
output_file_path_bc = os.path.join(output_directory, 'EC_fixed.json')

# Call the function to fix JSON syntax errors in 'Bc.json'
fix_json_syntax(input_file_path_bc, output_file_path_bc)

# Define a function to read JSON data line by line
def read_json_lines(file_path):
    with open(file_path, 'r') as file:
        for line in file:
            yield json.loads(line)

# Use os.path.join to construct file paths
submission_data = list(read_json_lines(output_file_path_bs))  
comment_data = list(read_json_lines(output_file_path_bc)) 

# Create DataFrames
submission_df = pd.DataFrame(submission_data)
comment_df = pd.DataFrame(comment_data)

# Filter for Bitcoin-related data (you may need to adjust the filter)
Ethereum_submissions = submission_df[submission_df['title'].str.contains('Ethereum', case=False, na=False)]
Ethereum_comments = comment_df[comment_df['body'].str.contains('Ethereum', case=False, na=False)]

# Sample 5000 observations with replacement
Ethereum_submissions_sample = Ethereum_submissions.sample(n=5000, replace=True)
Ethereum_comments_sample = Ethereum_comments.sample(n=5000, replace=True)

# Export to CSV
Ethereum_submissions_sample.to_csv('Ethereum_submissions_sample.csv', index=False)
Ethereum_comments_sample.to_csv('Ethereum_comments_sample.csv', index=False)


Skipping line due to JSON syntax error: {"link_flair_text":null,"thumbnail":"default","quarantine":false,"downs":0,"num_comments":22,"domain":"otlw.co","distinguished":null,"secure_media":null,"title":"Building a decentralized educational system","id":"3jq91b","is_self":false,"hide_score":false,"author_flair_css_class":null,"subreddit":"ethereum","media_embed":{},"media":null,"edited":false,"created":1441476416,"created_utc":"1441447616","from_id":null,"s
Skipping line due to JSON syntax error: {"author":"mrkellis","created_utc":"1400540199","removal_reason":null,"distinguished":null,"subreddit":"ethereum","link_id":"t3_25y4le","downs":0,"retrieved_on":1433868840,"edited":false,"body":"&gt; All traffic between Ethereum nodes is encrypted via a public key\n\nIs the key being rotated PFS style? Can Ethereum use something like [Noise](https://github.com/trevp


# Summary stats 

In [3]:
# Load the CSV files
Ethereum_submissions_df = pd.read_csv('Ethereum_submissions_sample.csv')
Ethereum_comments_df = pd.read_csv('Ethereum_comments_sample.csv')


In [4]:
# Display the columns of the DataFrame
print("Columns in Ethereum_submissions_df:")
print(Ethereum_submissions_df.columns)

# Display a sample of the DataFrame
print("Sample data in Ethereum_submissions_df:")
print(Ethereum_submissions_df.head())

# Repeat the same for Ethereum_comments_df if needed
print("Columns in Ethereum_comments_df:")
print(Ethereum_comments_df.columns)

print("Sample data in Ethereum_comments_df:")
print(Ethereum_comments_df.head())


Columns in Ethereum_submissions_df:
Index(['quarantine', 'edited', 'score', 'stickied', 'subreddit_id', 'created',
       'from_kind', 'secure_media', 'url', 'from_id', 'is_self', 'author',
       'from', 'id', 'archived', 'name', 'link_flair_css_class', 'gilded',
       'over_18', 'saved', 'author_flair_css_class', 'num_comments',
       'retrieved_on', 'ups', 'media', 'selftext', 'permalink',
       'link_flair_text', 'secure_media_embed', 'title', 'distinguished',
       'thumbnail', 'downs', 'hide_score', 'domain', 'subreddit',
       'media_embed', 'author_flair_text', 'created_utc', 'post_hint',
       'preview', 'locked'],
      dtype='object')
Sample data in Ethereum_submissions_df:
   quarantine edited  score  stickied subreddit_id     created  from_kind  \
0       False  False     22     False     t5_2zf9m  1437513314        NaN   
1       False  False      6     False     t5_2zf9m  1424877407        NaN   
2       False  False      3     False     t5_2zf9m  1438186037       

In [5]:
import pandas as pd

# Load the data into DataFrames (assuming you already have them loaded)
Ethereum_submissions_df = pd.read_csv('Ethereum_submissions_sample.csv')
Ethereum_comments_df = pd.read_csv('Ethereum_comments_sample.csv')

# Convert 'created_utc' columns to datetime objects
Ethereum_submissions_df['created_utc'] = pd.to_datetime(Ethereum_submissions_df['created_utc'], unit='s')
Ethereum_comments_df['created_utc'] = pd.to_datetime(Ethereum_comments_df['created_utc'], unit='s')

# Calculate the range of dates available
date_range_submissions = Ethereum_submissions_df['created_utc'].min(), Ethereum_submissions_df['created_utc'].max()
date_range_comments = Ethereum_comments_df['created_utc'].min(), Ethereum_comments_df['created_utc'].max()

print("Range of dates available in Ethereum_submissions_df:", date_range_submissions)
print("Range of dates available in Ethereum_comments_df:", date_range_comments)

# Calculate the number of posts, comments, and authors for each day
posts_per_day_submissions = Ethereum_submissions_df.groupby(Ethereum_submissions_df['created_utc'].dt.date).size()
comments_per_day = Ethereum_comments_df.groupby(Ethereum_comments_df['created_utc'].dt.date).size()
unique_authors_per_day_submissions = Ethereum_submissions_df.groupby(Ethereum_submissions_df['created_utc'].dt.date)['author'].nunique()

# Display the results
print("\nNumber of posts per day in Ethereum_submissions_df:")
print(posts_per_day_submissions)
print("\nNumber of comments per day in Ethereum_comments_df:")
print(comments_per_day)
print("\nNumber of unique authors per day in Ethereum_submissions_df:")
print(unique_authors_per_day_submissions)


Range of dates available in Ethereum_submissions_df: (Timestamp('2014-01-04 01:42:51'), Timestamp('2015-09-04 17:38:42'))
Range of dates available in Ethereum_comments_df: (Timestamp('2014-01-05 19:26:46'), Timestamp('2014-05-19 17:59:11'))

Number of posts per day in Ethereum_submissions_df:
created_utc
2014-01-04     1
2014-01-11    10
2014-01-12    12
2014-01-13     2
2014-01-14    23
              ..
2015-08-31    22
2015-09-01    17
2015-09-02    29
2015-09-03     7
2015-09-04    27
Length: 505, dtype: int64

Number of comments per day in Ethereum_comments_df:
created_utc
2014-01-05     4
2014-01-10     9
2014-01-11    23
2014-01-12    20
2014-01-13    35
              ..
2014-05-15    21
2014-05-16    16
2014-05-17    11
2014-05-18    21
2014-05-19    35
Length: 130, dtype: int64

Number of unique authors per day in Ethereum_submissions_df:
created_utc
2014-01-04     1
2014-01-11     2
2014-01-12     5
2014-01-13     1
2014-01-14     4
              ..
2015-08-31     7
2015-09-01

# Topic Modeling

In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import models
import gensim.corpora as corpora
import gensim

In [7]:
# Load the data into DataFrames (assuming you already have them loaded)
Ethereum_submissions_df = pd.read_csv('Ethereum_submissions_sample.csv')
Ethereum_comments_df = pd.read_csv('Ethereum_comments_sample.csv')

# Convert 'created_utc' columns to datetime objects
Ethereum_submissions_df['created_utc'] = pd.to_datetime(Ethereum_submissions_df['created_utc'], unit='s')
Ethereum_comments_df['created_utc'] = pd.to_datetime(Ethereum_comments_df['created_utc'], unit='s')

# Combine comments and submissions text
Ethereum_text = Ethereum_submissions_df['selftext'].dropna().tolist() + Ethereum_comments_df['body'].dropna().tolist()


In [None]:
# Text preprocessing
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

Ethereum_text = [preprocess_text(text) for text in Ethereum_text]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivanipaunikar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shivanipaunikar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)


In [None]:
# Apply TF-IDF vectorization
Ethereum_tfidf = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in Ethereum_text])

In [None]:
pip install gensim matplotlib pyLDAvis


In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt

# Assuming you have a list of preprocessed tokens called 'bitcoin_text'
# You can replace 'bitcoin_text' with your actual data
dictionary = Dictionary([tokens for tokens in bitcoin_text])
corpus = [dictionary.doc2bow(tokens) for tokens in bitcoin_text]

# Perform LDA topic modeling
lda_model_bitcoin = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=2)


In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Assuming you have a list of preprocessed tokens called 'Ethereum_text'
# You can replace 'Ethereum_text' with your actual data
dictionary = Dictionary([tokens for tokens in Ethereum_text])
corpus = [dictionary.doc2bow(tokens) for tokens in Ethereum_text]

# Perform LDA topic modeling
lda_model_Ethereum = LdaModel(corpus=corpus, id2word=dictionary, num_topics=12, passes=2)


In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

# Assuming you have a list of preprocessed tokens called 'Ethereum_text'
# You can replace 'Ethereum_text' with your actual data
dictionary = Dictionary([tokens for tokens in Ethereum_text])
corpus = [dictionary.doc2bow(tokens) for tokens in Ethereum_text]

# Initialize lists to store coherence scores for different topic numbers
coherence_scores = []
num_topics_list = range(2, 21)  # You can adjust the range of topic numbers

# Iterate through different numbers of topics
for num_topics in num_topics_list:
    lda_model_Ethereum = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=2)
    
    # Calculate the coherence score
    coherence_model = CoherenceModel(model=lda_model_Ethereum, texts=Ethereum_text, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    coherence_scores.append(coherence_score)

# Plot the coherence scores
plt.figure(figsize=(10, 6))
plt.plot(num_topics_list, coherence_scores, marker='o')
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.title("Coherence Score vs. Number of Topics")
plt.grid(True)
plt.show()


In [None]:
# Print the topics generated by the LDA model
topics = lda_model_Ethereum.print_topics(num_words=12)  # You can adjust the number of words per topic
for topic in topics:
    print(topic)


# Sentiment Analysis


In [None]:
from textblob import TextBlob

# Combine Ethereum submissions and comments text into a single list
Ethereum_text = Ethereum_submissions_df['selftext'].dropna().tolist() + Ethereum_comments_df['body'].dropna().tolist()

# Preprocess the text data (if you haven't done this already)
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)  # Join tokens into a single string

Ethereum_text = [preprocess_text(text) for text in Ethereum_text]


In [None]:
# Define a function to get sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity

    if polarity > 0.1:
        return "positive"
    elif polarity < -0.1:
        return "negative"
    else:
        return "neutral"


In [None]:
# Apply sentiment analysis to your preprocessed text data
sentiments = [get_sentiment(text) for text in Ethereum_text]


In [None]:
# Create a DataFrame to store sentiments (optional)
sentiments_df = pd.DataFrame({'Sentiment': sentiments})

In [None]:
# Print or analyze the sentiment results as needed
print(sentiments_df['Sentiment'].value_counts())  # Display sentiment counts


In [None]:
# Print the first few comments with their sentiments
for i, comment in enumerate(Ethereum_comments_df['body'].dropna()):
    if i >= 5:  # Print the first 5 comments
        break
    print(f"Comment {i + 1}:")
    print("Text:", comment)
    print("Sentiment:", sentiments[i])
    print()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Assuming you have already performed sentiment analysis and have a DataFrame 'sentiments_df'
# with a 'Sentiment' column containing sentiment labels (positive, negative, neutral)

# Count the occurrences of each sentiment label
sentiment_counts = sentiments_df['Sentiment'].value_counts()

# Create a bar plot
plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar', color=['green', 'red', 'blue'], alpha=0.7)
plt.title('Sentiment Analysis of Ethereum Comments')
plt.xlabel('Sentiment')
plt.ylabel('Count')

# Annotate the bars with counts
for i, count in enumerate(sentiment_counts):
    plt.text(i, count, str(count), ha='center', va='bottom', fontsize=12)

# Show the plot
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


# Emotion Analysis


In [None]:
pip install transformers torch


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "michellejieli/emotion_text_classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [None]:
import torch

def get_emotion(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    emotion_labels = ["anger", "fear", "joy", "sadness", "surprise"]
    return emotion_labels[predicted_class]


In [None]:
Ethereum_comments_df['emotion'] = Ethereum_comments_df['cleaned_text'].apply(get_emotion)


In [None]:
import matplotlib.pyplot as plt

# Count the occurrences of each emotion label
emotion_counts = Ethereum_comments_df['emotion'].value_counts()

# Create a bar plot
plt.figure(figsize=(8, 6))
emotion_counts.plot(kind='bar', color=['red', 'blue', 'green', 'purple', 'orange'], alpha=0.7)
plt.title('Emotional Analysis of Ethereum Comments')
plt.xlabel('Emotion')
plt.ylabel('Count')

# Annotate the bars with counts
for i, count in enumerate(emotion_counts):
    plt.text(i, count, str(count), ha='center', va='bottom', fontsize=12)

# Show the plot
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
