# Introduction

Welcome to my Python Notebook where I am exploring the fascinating world of cryptocurrency sentiment analysis, focusing particularly on Twitter.

Cryptocurrencies, in their volatile and dynamic nature, often exhibit price fluctuations influenced by various factors. One key influencer I am exploring is Twitter, a social media platform where real-time discussions about cryptoassets are abundant.

In this project, I will be pulling tweets about different cryptocurrencies using the Twitter API and applying sentiment analysis to these tweets. The objective here is straightforward - to analyze the sentiment embedded in tweets related to specific cryptocurrencies.

Join me as we dive into this intriguing mix of Python, cryptocurrencies, Twitter, and sentiment analysis. Let's see what the tweets say!

# Import the credentials

A fellow student has to present this project; therefore, I implemented a cell that uploads the credentials as a JSON file and stores them in variables for later use. She or he can request the JSON file by e-mail from pfeifsas@

This process ensures the secure and organized handling of project credentials.

In [None]:
from google.colab import files
import io
import json

# Use files.upload to produce the "Choose Files" button below, then select your file.
uploaded = files.upload()

# The upload result is keyed by file name, so a file that arrives under a
# different name (e.g. "credentials (1).json") would not be found under the
# expected key. Fall back to the single uploaded file in that case.
if 'credentials.json' in uploaded:
    credentials_bytes = uploaded['credentials.json']
elif len(uploaded) == 1:
    credentials_bytes = next(iter(uploaded.values()))
else:
    raise KeyError(
        "Expected one uploaded file named 'credentials.json', got: "
        + repr(sorted(uploaded))
    )

# Wrap the raw bytes in a file-like object and parse them as JSON.
file = io.BytesIO(credentials_bytes)
credentials = json.load(file)

# Save each credential to its own variable; a missing key raises KeyError here
# rather than later, when the credential is first used.
TWITTER_CONSUMER_KEY = credentials['TWITTER_CONSUMER_KEY']
TWITTER_CONSUMER_SECRET = credentials['TWITTER_CONSUMER_SECRET']
TWITTER_ACCESS_TOKEN = credentials['TWITTER_ACCESS_TOKEN']
TWITTER_ACCESS_TOKEN_SECRET = credentials['TWITTER_ACCESS_TOKEN_SECRET']
BEARER_TOKEN = credentials['BEARER_TOKEN']
GPT_SECRET_KEY = credentials['GPT_SECRET_KEY']
MONGO_CONNECTION_STRING = credentials['MONGO_CONNECTION_STRING']


# Install and import all the needed libraries and dependencies

The following cells install and import all required libraries and dependencies.

Install the libraries

In [None]:
!pip install datasets
!pip install transformers
!pip install openai
!pip install 'pymongo[srv]'

Import the dependencies

In [None]:
# nltk
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# utilities
import re
import pickle
import numpy as np
import pandas as pd
import time
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn import functional as F
import openai
from pymongo.mongo_client import MongoClient
import requests
from datetime import datetime
import matplotlib.pyplot as plt
from collections import defaultdict


Instead of downloading and importing a CSV file, I use the `datasets` library to load the initial data set. After loading it, I convert it into a pandas DataFrame.

In [None]:
# Fetch the "sentiment140" dataset and turn its 'train' split into a DataFrame.
dataset = load_dataset("sentiment140")
df = dataset['train'].to_pandas()

# Only the label and the tweet text are needed from here on.
df = df[['sentiment', 'text']]

# Recode label 4 as 1 (shown as 'Positive' in the chart below).
df['sentiment'] = df['sentiment'].replace(4, 1)

# Bar chart with the number of tweets per sentiment label.
class_counts = df.groupby('sentiment').count()
ax = class_counts.plot(kind='bar', title='Distribution of data', legend=False)
ax.set_xticklabels(['Negative', 'Positive'], rotation=0)

# Keep texts and labels as plain Python lists for the following steps.
text = list(df['text'])
sentiment = list(df['sentiment'])

In [None]:
# Show how many tweets carry each sentiment label.
df['sentiment'].value_counts()

# Defining a List of Stop-Words
--- 

In [None]:
# English stop words (a list) that are dropped from every tweet during
# preprocessing. Contractions are stored without apostrophes (e.g. "youre").
stopwordlist = """
    a about above after again ain all am an
    and any are as at be because been before
    being below between both by can d did do
    does doing down during each few for from
    further had has have having he her here
    hers herself him himself his how i if in
    into is it its itself just ll m ma
    me more most my myself now o of on once
    only or other our ours ourselves out own re
    s same she shes should shouldve so some such
    t than that thatll the their theirs them
    themselves then there these they this those
    through to too under until up ve very was
    we were what when where which while who whom
    why will with won y you youd youll youre
    youve your yours yourself yourselves
""".split()

# Defining a preprocessing function
---
The following function is used to preprocess all tweets in preparation for sentiment analysis:

In [None]:
def preprocess(textdata):
    """Clean raw tweets and return them as strings of lemmatised words.

    Each tweet is lower-cased; URLs and @mentions are replaced by the
    placeholders ``URL`` and ``USER``; emoticons and every non-alphanumeric
    character are removed; runs of three or more identical characters are
    shortened to two. Stop words and single-character tokens are dropped and
    the remaining words are lemmatised.

    Args:
        textdata: Iterable of tweet strings. ``None`` entries are skipped, so
            the result can be shorter than the input when it contains ``None``.

    Returns:
        A list with one string per non-``None`` tweet; every kept word is
        followed by a single space.
    """
    wordLemm = WordNetLemmatizer()

    # A set gives O(1) membership tests; the module-level list is scanned
    # linearly, which is slow when applied to every word of every tweet.
    stopwords = set(stopwordlist)

    # Compile the patterns once instead of passing strings on every tweet.
    url_re = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_re = re.compile(r"@[^\s]+")
    # The \w guards keep the pattern from matching inside words or numbers
    # (e.g. the "83" in "1983"), which the unguarded pattern would delete.
    emoticon_re = re.compile(r"(?<!\w)[:;=8][\-o\*\']?[\)\]\(\[dDpP/\\OpP3](?!\w)")
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")

    processedText = []
    for tweet in textdata:
        if tweet is None:   # skip None values
            continue

        tweet = tweet.lower()

        # Replace all URLs with 'URL'.
        tweet = url_re.sub(' URL', tweet)
        # Replace @USERNAME with 'USER'.
        tweet = user_re.sub(' USER', tweet)
        # Remove emoticons. This has to happen before punctuation is stripped;
        # afterwards no emoticon is left to match.
        tweet = emoticon_re.sub('', tweet)
        # Replace every non-alphanumeric character with a space.
        tweet = non_alnum_re.sub(' ', tweet)
        # Shorten 3 or more consecutive identical characters to 2.
        tweet = repeat_re.sub(r"\1\1", tweet)

        words = [
            wordLemm.lemmatize(word)
            for word in tweet.split()
            if word not in stopwords and len(word) > 1
        ]
        # Keep the trailing space after every word, as before.
        processedText.append(''.join(word + ' ' for word in words))

    return processedText


In [None]:
# Preprocess the whole corpus and report how long it took.
t = time.time()
processedtext = preprocess(text)
print('Text Preprocessing complete.')
elapsed_seconds = round(time.time() - t)
print(f'Time Taken: {elapsed_seconds} seconds')

# Creating word clouds
---

## Word-Cloud for negative tweets

In [None]:
# NOTE(review): takes the first 800000 preprocessed tweets as the negative
# ones - this presumably relies on the row order of the dataset; confirm.
negative_row_count = 800000
data_neg = processedtext[:negative_row_count]

plt.figure(figsize=(20, 20))
negative_corpus = " ".join(data_neg)
wc = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    collocations=False,
).generate(negative_corpus)
plt.imshow(wc)

## Word-Cloud for positive tweets

In [None]:
# NOTE(review): takes everything after the first 800000 preprocessed tweets
# as the positive ones - this presumably relies on the dataset's row order; confirm.
positive_start_row = 800000
data_pos = processedtext[positive_start_row:]

positive_corpus = " ".join(data_pos)
wc = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    collocations=False,
).generate(positive_corpus)
plt.figure(figsize=(20, 20))
plt.imshow(wc)

## Train-Test-Split

In [None]:
# Hold out 5 % of the tweets for evaluation; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(
    processedtext,
    sentiment,
    test_size=0.05,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts
print('Data Split done.')



## TF-IDF-Vectoriser

In [None]:
# Build a TF-IDF vectoriser over unigrams and bigrams, limited to 500000
# features, and fit it on the training tweets only.
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000).fit(X_train)

print('Vectoriser fitted.')
feature_words = vectoriser.get_feature_names_out()
print('No. of feature_words: ', len(feature_words))

Transforming the data set

In [None]:
# Replace both text splits with their TF-IDF matrices (training split first).
X_train, X_test = (
    vectoriser.transform(tweets) for tweets in (X_train, X_test)
)
print('Data Transformed.')

# Function to evaluate different models
---
This function is used to evaluate the trained models.

In [None]:
def model_Evaluate(model):
    """Print test-set metrics for `model` and draw its confusion matrix.

    Predicts on the notebook-level `X_test`, prints a classification report
    against `y_test`, and plots the confusion matrix as a heatmap whose cells
    show their name and their share of all test samples.
    """
    predicted = model.predict(X_test)
    print(classification_report(y_test, predicted))

    matrix = confusion_matrix(y_test, predicted)

    # One annotation per cell: its name plus its share of all samples.
    cell_names = ('True Neg', 'False Pos', 'False Neg', 'True Pos')
    cell_shares = matrix.flatten() / np.sum(matrix)
    annotations = np.asarray(
        [f'{name}\n{share:.2%}' for name, share in zip(cell_names, cell_shares)]
    ).reshape(2, 2)

    class_names = ['Negative', 'Positive']
    sns.heatmap(
        matrix,
        annot=annotations,
        cmap='Blues',
        fmt='',
        xticklabels=class_names,
        yticklabels=class_names,
    )

    plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
    plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
    plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)

# Model Selection Strategy
---
I am adopting a dual approach for sentiment prediction. The first part of this strategy involves training a custom model, while the second part harnesses the capabilities of the GPT-API for sentiment analysis.

Considering the utilization of the GPT-API, I have decided to train a relatively simpler custom model. I'll be training and comparing the performance of three distinct models:

- Bernoulli Naive Bayes Model
- Linear Support Vector Machine Model
- Logistic Regression Model

Among these, the model that delivers the highest performance will be selected for further optimization. This optimized model will then be used to predict the sentiments of tweets, which I'll retrieve directly from the Twitter API.


## BernoulliNB Model

This is a probabilistic classifier that makes use of Bayes' Theorem with strong independence assumptions. It is particularly suitable for data that can be binary, like the presence or absence of a word in text.

In [None]:
# Train a Bernoulli Naive Bayes classifier (alpha = 2) and evaluate it.
BNBmodel = BernoulliNB(alpha=2).fit(X_train, y_train)
model_Evaluate(BNBmodel)

The BernoulliNB-Model has an accuracy of 80%. Let's compare it to a linear support vector machine model.

## Linear SVM Model

This is a maximum-margin classifier which works by constructing a hyperplane or a set of hyperplanes in a high or infinite dimensional space, making it a robust model for text classification tasks.

In [None]:
# Train a linear support vector classifier with default settings and evaluate it.
SVCmodel = LinearSVC().fit(X_train, y_train)
model_Evaluate(SVCmodel)

No improvement with the linear support vector machine model. Let's have a look at the performance of a logistic regression model.

## Logistic Regression Model 

This is a statistical model that uses a logistic function to model a binary dependent variable. In the context of sentiment analysis, it predicts the probability of a particular sentiment (positive or negative) based on input features.

In [None]:
# Train a logistic regression classifier (C = 2, up to 1000 iterations,
# n_jobs = -1) and evaluate it.
LRmodel = LogisticRegression(C=2, max_iter=1000, n_jobs=-1).fit(X_train, y_train)
model_Evaluate(LRmodel)

The logistic regression model has an accuracy of 82%. It's the best performance of all 3 trained models. The performance is OK, but I'll try to improve the accuracy now.

# Load Data from the Twitter-API
---
In this section, I am utilizing the Twitter API to fetch a specific number of tweets. To ensure the quality of the retrieved tweets, I specify a particular currency to be mentioned in the tweets while excluding commonly used spam-bot words. Additionally, I exclude mentions of "NFT" and "NFTs" as they are not relevant to this project focused on crypto-currencies. This approach helps filter out spam and ensure the retrieved tweets are relevant to the desired topic.

In [None]:
# Endpoint that is queried for tweets.
search_url = "https://api.twitter.com/2/tweets/search/recent"

# Hardcoded search term plus words that must not appear in a tweet, to avoid
# too many spam tweets from bots.
query = "Bitcoin"
excluded_words = [
    'airdrop', 'bot', 'retweet', 'retweeted', 'wallet', 'mint', 'ticket',
    'drop', 'opensea', 'blur', 'NFT', 'NFTs', 'giveaway',
]

# Every excluded word is negated with a leading '-'.
excluded_query = ' '.join(['-' + word for word in excluded_words])

# Search term first, exclusions after it.
full_query = ' '.join([query, excluded_query])

# Parameters sent along with the search request.
query_params = {
    'query': full_query,
    'tweet.fields': 'author_id',
    'max_results': 10,
}

def bearer_oauth(r):
    """Set the bearer-token and user-agent headers on request `r` and return it.

    Passed as the `auth` callable to `requests`, which calls it with the
    outgoing request.
    """
    auth_headers = (
        ("Authorization", f"Bearer {BEARER_TOKEN}"),
        ("User-Agent", "v2FilteredStreamPython"),
    )
    for header_name, header_value in auth_headers:
        r.headers[header_name] = header_value
    return r

def connect_to_endpoint(url, params, timeout=30):
    """Send an authenticated GET request and return the decoded JSON body.

    Args:
        url: Endpoint to call.
        params: Query parameters for the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: With ``(status_code, response_text)`` as arguments if the
            server answers with any status other than 200.
    """
    response = requests.request(
        "GET", url, auth=bearer_oauth, params=params, timeout=timeout
    )
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

json_response = connect_to_endpoint(search_url, query_params)

# Convert the response to a DataFrame. A search without any matching tweet
# carries no 'data' key, so fall back to an empty list instead of raising
# KeyError.
data = [
    {'id': tweet['id'], 'text': tweet['text'], 'query': query}
    for tweet in json_response.get('data', [])
]

# Name the columns explicitly so they also exist when no tweet was returned.
loaded_tweets = pd.DataFrame(data, columns=['id', 'text', 'query'])

# Print the DataFrame
print(loaded_tweets)


# Language recognition
---

In previous versions of the Twitter API, developers had the capability to select the language of the tweets directly from the API. However, this functionality is no longer available. To overcome this limitation, I am utilizing a pre-trained language detection model to filter and retain only English tweets. This ensures that the tweets used for training and prediction align with the language-specific focus of my model, which is trained exclusively on English tweets.

In [None]:
# Load the tokenizer and the sequence-classification model of the pre-trained
# language-detection checkpoint.
LANGUAGE_MODEL_NAME = "papluca/xlm-roberta-base-language-detection"

tokenizer = AutoTokenizer.from_pretrained(LANGUAGE_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(LANGUAGE_MODEL_NAME)

In [None]:
#define a function to get a predicted label for the text of a tweets
def predict_language(text):
    """Return the language label that the detection model rates highest for `text`."""
    # Tokenize into PyTorch tensors and feed them to the model.
    encoded = tokenizer(text, return_tensors="pt")
    logits = model(**encoded).logits

    # Turn the logits into probabilities and pick the most likely class.
    probabilities = F.softmax(logits, dim=1).detach().numpy()
    best_class = probabilities.argmax()

    # Translate the class index into the model's label.
    return model.config.id2label[best_class]

# Tag every fetched tweet with its predicted language.
loaded_tweets['language'] = loaded_tweets['text'].apply(predict_language)

# Keep only the rows whose predicted language is English.
is_english = loaded_tweets['language'] == 'en'
english_tweets = loaded_tweets[is_english]


In [None]:
# Inspect the tweets that were classified as English.
print(english_tweets)

# Get a Sentiment from the GPT-API for the english tweets
---
To obtain the sentiment for each English tweet, I retrieve them individually from the GPT-API. The dataset is then enhanced by adding the sentiment obtained from the API for each respective tweet. The sentiment prediction is encoded as 1 for positive sentiment and 0 for negative sentiment.

For the prediction task, I opt to use the "text-ada-001" engine. This decision was primarily driven by financial considerations rather than quality considerations. While the "Davinci" model may be more powerful, its cost is approximately 50 times higher than that of the "text-ada-001" model.

In [None]:
# Set up your OpenAI API credentials
openai.api_key = GPT_SECRET_KEY


def _encode_gpt_sentiment(answer):
    """Map a free-text answer to 1 (positive), 0 (negative) or 'N/A' (neither).

    'positive' is checked first, so an answer containing both words counts as
    positive.
    """
    answer = str(answer).lower()
    if 'positive' in answer:
        return 1
    if 'negative' in answer:
        return 0
    return 'N/A'


# Set up empty lists to store tweet IDs, tweet texts, and predictions
tweet_ids = []
tweet_texts = []
predictions = []

# Request one sentiment answer per English tweet
for _, row in english_tweets.iterrows():
    tweet_text = row['text']

    tweet_ids.append(row['id'])
    tweet_texts.append(tweet_text)

    # Set up your OpenAI API request
    prompt = f"Analyze the sentiment of the following tweet: '{tweet_text}'. Answer with 'positive' or 'negative' depending on the sentiment, just one word in the answer"

    response = openai.Completion.create(
        engine="text-ada-001",
        prompt=prompt,
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.0
    )

    # Use a dedicated name here: the notebook-level `sentiment` holds the
    # training labels and must not be overwritten by a single answer string.
    gpt_sentiment = response.choices[0].text.strip()
    predictions.append(gpt_sentiment)

# Create a new DataFrame with tweet IDs, texts, and predictions
updated_data = pd.DataFrame({'id': tweet_ids, 'text': tweet_texts, 'query': english_tweets['query'], 'gpt_pred': predictions})

# Encode the answers in 'gpt_pred' as 1 (positive), 0 (negative) or 'N/A'
updated_data['gpt_pred'] = updated_data['gpt_pred'].apply(_encode_gpt_sentiment)


In [None]:
# Inspect the tweets together with their encoded GPT predictions.
print(updated_data)

# Get a prediction from the LR-Model 
---

I proceed to predict the sentiment of each tweet using my trained logistic regression model. Subsequently, I enhance the dataset by incorporating the sentiment predictions obtained from the model for each respective tweet. This augmentation adds valuable sentiment information to the dataset for further analysis and evaluation.

In [None]:
# Clean the fetched tweets with the same routine that prepared the training data.
preprocessed_tweets = preprocess(tweet_texts)

# Turn the cleaned tweets into TF-IDF features with the fitted vectoriser.
X = vectoriser.transform(preprocessed_tweets)

# Predict with the logistic regression model and store the result both in the
# DataFrame column 'lr_pred' and in `predictions`.
updated_data['lr_pred'] = predictions = LRmodel.predict(X)


In [None]:
# Inspect the tweets after the 'lr_pred' column has been added.
print(updated_data)

# Save the Data-Frame to a Mongo-DB for later use
---
All the fetched tweets from the Twitter API, along with the corresponding sentiments obtained from both the GPT model and the logistic regression (LR) model, are stored in a MongoDB database. This allows for easy retrieval and utilization of the tweet data and sentiment predictions from both models for subsequent analyses and comparisons.

In [None]:
# Create a new client and connect to the server
client = MongoClient(MONGO_CONNECTION_STRING)

db = client.ML2Project

# The tweets are stored in the "cryptotweetssentiment" collection
collection = db.cryptotweetssentiment

# Add a column with the current date, formatted as YYYY-MM-DD
updated_data['upload_date'] = datetime.now().date().strftime('%Y-%m-%d')

# Convert the DataFrame to a list of dictionaries (one per tweet)
data_dict = updated_data.to_dict("records")

# insert_many raises a TypeError for an empty list, which happens when no
# English tweet was fetched, so only insert when there is something to store.
if data_dict:
    collection.insert_many(data_dict)
else:
    print('No tweets to store; skipping the MongoDB insert.')


# Load and compare all saved tweets
---
Now I load all saved tweets from MongoDB to analyse whether the predictions of both models agree or differ. The result is visualised with matplotlib.

In [None]:
def _count_sentiments(frame, pred_column):
    """Count Positive/Negative/N/A predictions per upload date for one column.

    A prediction of 1 counts as Positive, 0 as Negative, anything else as N/A.
    Returns a defaultdict mapping each date to its three counters.
    """
    counts = defaultdict(lambda: {'Positive': 0, 'Negative': 0, 'N/A': 0})
    for (date, pred), group in frame.groupby(['upload_date', pred_column]):
        if pred == 1:
            label = 'Positive'
        elif pred == 0:
            label = 'Negative'
        else:
            label = 'N/A'
        # Accumulate, so that several different non-0/1 values on the same
        # date all end up in the N/A counter instead of overwriting it.
        counts[date][label] += group.shape[0]
    return counts


def _plot_stacked_counts(ax, positions, counts, dates, colors, model_label, width):
    """Draw one stacked Positive/Negative/N/A bar per date at `positions`."""
    bottom = np.zeros(len(dates))
    for label, color in zip(('Positive', 'Negative', 'N/A'), colors):
        heights = np.array([counts[date][label] for date in dates])
        ax.bar(positions, heights, color=color, width=width, bottom=bottom,
               label=f'{label} ({model_label})')
        bottom = bottom + heights


client = MongoClient(MONGO_CONNECTION_STRING)
db = client.ML2Project
collection = db.cryptotweetssentiment
queries = collection.distinct("query")

bar_width = 0.35

# Dedicated loop names keep the notebook-level `query`, `data` and `df`
# (search term, fetched tweets, training data) from being overwritten.
for query_name in queries:
    tweets_df = pd.DataFrame(list(collection.find({"query": query_name})))
    tweets_df['upload_date'] = pd.to_datetime(tweets_df['upload_date']).dt.date

    # Take the x-axis dates from all rows, not only from the GPT groups, so
    # both models are always plotted over the same dates.
    dates = sorted(set(tweets_df['upload_date']))

    sentiment_counts_gpt = _count_sentiments(tweets_df, 'gpt_pred')
    sentiment_counts_lr = _count_sentiments(tweets_df, 'lr_pred')

    fig, ax = plt.subplots()
    r1 = np.arange(len(dates))

    # GPT bars on the left, LR bars directly next to them.
    _plot_stacked_counts(ax, r1, sentiment_counts_gpt, dates,
                         ('green', 'red', 'gray'), 'GPT', bar_width)
    _plot_stacked_counts(ax, r1 + bar_width, sentiment_counts_lr, dates,
                         ('lightgreen', 'salmon', 'lightgray'), 'LR', bar_width)

    plt.xticks(r1 + bar_width / 2, [str(date) for date in dates], rotation=45)
    ax.set_ylabel('Count')
    ax.set_title(f'Sentiment Distribution for {query_name}')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, ncol=3)

    plt.show()
