**Imports**

In [16]:
from influencers import top_100_crypto_influencers
import pandas as pd
import numpy as np
import yfinance as yf

**Load `sentiment_tweets.csv` and show the tweets posted during the study period by users who are among the top 100 most influential crypto accounts on Twitter**

In [17]:
# Read the sentiment-annotated tweets dataset from the CSV file
df_sentiment = pd.read_csv('datasets/sentiment_tweets.csv')

# Build the set of tweet authors once, so each membership test below is a
# hash lookup instead of a full scan of the 'user_name' column per influencer
present_users = set(df_sentiment['user_name'])

# Collect the tweets of every influencer who appears in the dataset,
# one dataframe per influencer, in the order of 'top_100_crypto_influencers'
filtered_data = [
    df_sentiment[df_sentiment['user_name'] == influencer]
    for influencer in top_100_crypto_influencers
    if influencer in present_users
]

# Concatenate all the filtered data into one dataframe with a fresh 0..n-1 index.
# If no influencer tweeted, fall back to an empty frame that still has
# df_sentiment's columns, so later lookups such as influencers_df['user_name']
# keep working instead of raising KeyError on a column-less DataFrame.
influencers_df = (
    pd.concat(filtered_data, ignore_index=True)
    if filtered_data
    else df_sentiment.iloc[0:0].copy()
)

# Display the filtered dataframe containing only the tweets from the top 100 influencers
influencers_df


Unnamed: 0,user_name,user_created,user_followers,date,clean_text,sentiment
0,Crypto_Ed_NL,2017-08-04 18:26:44,94463,2021-02-10 07:43:55,BTC Interesting.... The 1st break out was Elon...,Neutral
1,CryptoKea,2016-01-31 20:28:26,5822,2021-02-15 21:51:40,The derivative market is currently another Bit...,Neutral


In [18]:
# Smallest and largest follower counts found in the 'user_followers' column;
# they serve as the bounds when follower counts are rescaled
follower_counts = df_sentiment['user_followers']
min_followers, max_followers = follower_counts.min(), follower_counts.max()

# Tiny constant added to the normalization denominator so it can never be exactly zero
epsilon = 1e-6

def normalize_followers(followers):
    """Linearly rescale a follower count onto (approximately) the range [1, 10].

    Uses the module-level ``min_followers``, ``max_followers`` and ``epsilon``:

        1 + 9 * (followers - min_followers) / (max_followers - min_followers + epsilon)

    ``min_followers`` maps to exactly 1. Because ``epsilon`` is added to the
    denominator, ``max_followers`` maps to a value marginally below 10, and the
    division stays defined even when ``max_followers == min_followers``.

    Args:
        followers: Follower count to rescale.

    Returns:
        The rescaled weight.
    """
    follower_span = max_followers - min_followers + epsilon
    relative_position = (followers - min_followers) / follower_span
    return 1 + 9 * relative_position


**Weight each tweet by its author's follower count (normalized to the range 1–10); tweets from top-100 influencers always receive the maximum weight of 10**

In [19]:
# Create a dictionary to map sentiment labels ('Positive', 'Neutral', 'Negative') to numerical values
sentiment_score = {'Positive': 1, 'Neutral': 0, 'Negative': -1}

# Map the sentiment labels in the 'sentiment' column to numerical values and store in a new 'sentiment_score' column
# (labels that are not keys of the dictionary become NaN)
df_sentiment['sentiment_score'] = df_sentiment['sentiment'].map(sentiment_score)

# Fixed weight given to every tweet written by a top-100 influencer
INFLUENCER_WEIGHT = 10

# Names of the influencers found in the dataset. Guard against 'influencers_df'
# being an empty, column-less frame (no influencer matched), which would
# otherwise raise KeyError on the 'user_name' lookup.
influencer_names = (
    influencers_df['user_name'].unique()
    if 'user_name' in influencers_df.columns
    else []
)
is_influencer = df_sentiment['user_name'].isin(influencer_names)

# Assign weights for the whole column at once instead of row by row:
# start from the normalized follower count, then overwrite influencer rows
# with the fixed influencer weight
df_sentiment['weight'] = normalize_followers(df_sentiment['user_followers']).mask(
    is_influencer, INFLUENCER_WEIGHT
)

# Ensure the 'date' column is in datetime format, so it's properly handled in grouping and plotting
df_sentiment['date'] = pd.to_datetime(df_sentiment['date'])

# Group by calendar day and calculate the weighted average sentiment for each day
# np.average() computes the mean of 'sentiment_score' weighted by the 'weight' column
daily_sentiment = df_sentiment.groupby(df_sentiment['date'].dt.date).apply(
    lambda day_tweets: np.average(day_tweets['sentiment_score'], weights=day_tweets['weight'])
).reset_index(name='daily_sentiment')

# Convert the 'date' column back to datetime format for proper handling and merging with other datasets
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])


**Download the BTC price dataset and display the columns we need**

In [20]:
# Fetch the BTC-USD price history from Yahoo Finance for the study window,
# keep only the 'Open' and 'Close' prices, turn the index into a regular
# column and give the three resulting columns short, descriptive names
btc = (
    yf.download("BTC-USD", start="2021-02-05", end="2021-03-13")[['Open', 'Close']]
    .reset_index()
    .set_axis(['date', 'btc_open', 'btc_close'], axis=1)
)

# Make sure 'date' is a datetime column so it can be merged with the sentiment data
btc['date'] = pd.to_datetime(btc['date'])

# Show the resulting BTC price table
btc

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,date,btc_open,btc_close
0,2021-02-05,36931.546875,38144.308594
1,2021-02-06,38138.386719,39266.011719
2,2021-02-07,39250.191406,38903.441406
3,2021-02-08,38886.828125,46196.464844
4,2021-02-09,46184.992188,46481.105469
5,2021-02-10,46469.761719,44918.183594
6,2021-02-11,44898.710938,47909.332031
7,2021-02-12,47877.035156,47504.851562
8,2021-02-13,47491.203125,47105.515625
9,2021-02-14,47114.507812,48717.289062


**Merging btc and daily_sentiment dfs on the common column and calculating price change**

In [21]:
# Give daily_sentiment a fresh 0..n-1 index before merging
daily_sentiment = daily_sentiment.reset_index(drop=True)

# Inner join on 'date': only the dates present in both btc and daily_sentiment are kept
merged_df = btc.merge(daily_sentiment, how='inner', on='date')
merged_df

Unnamed: 0,date,btc_open,btc_close,daily_sentiment
0,2021-02-05,36931.546875,38144.308594,0.08847
1,2021-02-06,38138.386719,39266.011719,0.144457
2,2021-02-07,39250.191406,38903.441406,0.103381
3,2021-02-08,38886.828125,46196.464844,0.129575
4,2021-02-09,46184.992188,46481.105469,0.12386
5,2021-02-10,46469.761719,44918.183594,0.069633
6,2021-02-13,47491.203125,47105.515625,0.023385
7,2021-02-14,47114.507812,48717.289062,0.099805
8,2021-02-15,48696.535156,47945.058594,0.107509
9,2021-02-18,52140.972656,51679.796875,0.113906


In [22]:
# Intraday BTC price move for each row: closing price minus opening price
# (positive when BTC closed above its open, negative when it closed below)
merged_df['price_change'] = merged_df['btc_close'].sub(merged_df['btc_open'])


**Calculating the correlation between daily sentiment and the BTC price change. This is only a correlation: it does not imply causation and does not predict price changes. We are trying to find out whether people's mood about BTC is connected to the short-term BTC price change.**

In [23]:
# Pearson correlation coefficient between the daily weighted sentiment
# and the same-day BTC price change
correlation = merged_df['daily_sentiment'].corr(
    other=merged_df['price_change'],
    method='pearson',
)

# Report the coefficient, which measures how strongly the two series move together
print(f"Correlation between daily sentiment and BTC price change: {correlation}")


Correlation between daily sentiment and BTC price change: 0.5807165360402248
