# Trade with feelings
## Twitter sentiment analysis

In [8]:
# Import required libraries

import requests
import os
import json
from dotenv import load_dotenv
import time
import pandas as pd
import csv

#import tweepy
#from twitter_authentication import bearer_token

import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob

# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
    
# Global Parameters
stop_words = set(stopwords.words('english'))


In [9]:
# Load the environment variables from the .env file by calling the load_dotenv function
load_dotenv("twitter.env")

True

In [10]:
# Get the API key from the environment variable and store as Python variable
consumer_key = os.getenv("CONSUMER_KEY")
consumer_secret = os.getenv("CONSUMER_SECRET")
access_token = os.getenv("ACCESS_TOKEN")
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")
bearer_token = os.getenv("BEARER_TOKEN")

type(bearer_token)

NoneType

### Define funtions to reuse in the code

In [11]:
#search_url = "https://api.twitter.com/2/tweets/search/all"
search_url = "https://api.twitter.com/2/tweets/search/recent"

ticker_to_query = 'BTC'

# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
query_params = {'query': f'(#{ticker_to_query} -is:retweet)',
                'tweet.fields': 'created_at,public_metrics,text,author_id', 
                'max_results': '100',
               }

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2FullArchiveSearchPython"
    return r

def connect_to_endpoint(url, params):
    response = requests.request("GET", url, auth=bearer_oauth, params=params)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

def preprocess_tweet_text(tweet):
    tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    #ps = PorterStemmer()
    #stemmed_words = [ps.stem(w) for w in filtered_words]
    #lemmatizer = WordNetLemmatizer()
    #lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(filtered_words)

def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

def polarity_to_string(sentiment):
    if sentiment < 0:
        return "Negative"
    elif sentiment == 0:
        return "Neutral"
    else:
        return "Positive"

def get_sentiment(df, txt_col):
    return df[txt_col].map(lambda txt: TextBlob(txt).sentiment.polarity)


In [13]:
# Request data from Twitter
twitter_json_response = connect_to_endpoint(search_url, query_params)
#print(json.dumps(twitter_json_response, indent=4, sort_keys=True))

401


Exception: (401, '{\n  "title": "Unauthorized",\n  "type": "about:blank",\n  "status": 401,\n  "detail": "Unauthorized"\n}')

In [None]:
#type(twitter_json_response)
#type(twitter_json_response['data'])

#print(json.dumps(twitter_json_response['data'][0], indent=4, sort_keys=True))

In [7]:
#myFile = open('twitter_data.csv', 'w', errors='ignore')
#writer = csv.DictWriter(myFile, fieldnames=['text', 'id', 'author_id','created_at','public_metrics'])
#writer.writeheader()
#writer.writerows(twitter_json_response['data'])
#myFile.close()

#myFile = open('twitter_data.csv', 'r')
#print("The content of the csv file is:")
#print(myFile.read())
#myFile.close()




### Process the data

In [22]:
# Convert Json to DataFrame
#tweets_df = pd.read_json(json.dumps(twitter_json_response['data']))
tweets_df = pd.read_csv("twitter_data.csv", encoding='cp1252')
tweets_df.head()

Unnamed: 0,text,id,author_id,created_at,public_metrics
0,@superconnecter IKOLF #ikolf #bsc #bscgems #10...,1554949914882641920,1552242131632136192,2022-08-03T21:58:43.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
1,You ask me how #blockchain #btc #eth#eth #Sola...,1554949909241004032,1472449896405823488,2022-08-03T21:58:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
2,#Ethereum price update: \r\n\r\n#ETH $1644.74 ...,1554949908284710913,1551067188882116608,2022-08-03T21:58:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
3,@TheMoonCarl I wish to invest in #Crypto and I...,1554949901347332097,1338149444605386752,2022-08-03T21:58:40.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
4,@Janc_102 @Bitboy_Crypto The revolutionary tok...,1554949895328792576,1550565274801537024,2022-08-03T21:58:39.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."


In [23]:
# Preprocess data
tweets_df.text = tweets_df['text'].apply(preprocess_tweet_text)
tweets_df.head()

Unnamed: 0,text,id,author_id,created_at,public_metrics
0,IKOLF ikolf bsc bscgems 100x floki memecoin nf...,1554949914882641920,1552242131632136192,2022-08-03T21:58:43.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
1,You ask blockchain btc etheth Solana Web30 Gam...,1554949909241004032,1472449896405823488,2022-08-03T21:58:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
2,Ethereum price update ETH 164474 USD Bitcoin 0...,1554949908284710913,1551067188882116608,2022-08-03T21:58:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
3,I wish invest Crypto I believe life turn aroun...,1554949901347332097,1338149444605386752,2022-08-03T21:58:40.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
4,The revolutionary token provides holders PASSI...,1554949895328792576,1550565274801537024,2022-08-03T21:58:39.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."


In [24]:
# Get the sentiment of the full_text and label it
tweets_df['textblob_sentiment'] = get_sentiment(tweets_df, 'text')
tweets_df['sentiment_label'] = tweets_df['textblob_sentiment'].apply(polarity_to_string)

tweets_df.head()

Unnamed: 0,text,id,author_id,created_at,public_metrics,textblob_sentiment,sentiment_label
0,IKOLF ikolf bsc bscgems 100x floki memecoin nf...,1554949914882641920,1552242131632136192,2022-08-03T21:58:43.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",0.0,Neutral
1,You ask blockchain btc etheth Solana Web30 Gam...,1554949909241004032,1472449896405823488,2022-08-03T21:58:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",0.5,Positive
2,Ethereum price update ETH 164474 USD Bitcoin 0...,1554949908284710913,1551067188882116608,2022-08-03T21:58:42.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",0.0,Neutral
3,I wish invest Crypto I believe life turn aroun...,1554949901347332097,1338149444605386752,2022-08-03T21:58:40.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",0.5,Positive
4,The revolutionary token provides holders PASSI...,1554949895328792576,1550565274801537024,2022-08-03T21:58:39.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",0.0,Neutral


In [25]:
# Remove unwanted columns
tweets_df = remove_unwanted_cols(tweets_df, ['public_metrics','author_id','id'])
tweets_df = tweets_df[['text', 'sentiment_label', 'textblob_sentiment', 'created_at']]
tweets_df.head()

Unnamed: 0,text,sentiment_label,textblob_sentiment,created_at
0,IKOLF ikolf bsc bscgems 100x floki memecoin nf...,Neutral,0.0,2022-08-03T21:58:43.000Z
1,You ask blockchain btc etheth Solana Web30 Gam...,Positive,0.5,2022-08-03T21:58:42.000Z
2,Ethereum price update ETH 164474 USD Bitcoin 0...,Neutral,0.0,2022-08-03T21:58:42.000Z
3,I wish invest Crypto I believe life turn aroun...,Positive,0.5,2022-08-03T21:58:40.000Z
4,The revolutionary token provides holders PASSI...,Neutral,0.0,2022-08-03T21:58:39.000Z


In [None]:
# Create CSV from DF
myFile = open('twitter_data.csv', 'w', errors='ignore')
writer = csv.DictWriter(myFile, fieldnames=['text', 'id', 'author_id','created_at','public_metrics'])
writer.writeheader()
writer.writerows(twitter_json_response['data'])
myFile.close()

### Split dataset into Train, Test

In [32]:
# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(tweets_df.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(tweets_df.iloc[:, 1]).ravel())
y = np.array(tweets_df.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)


In [33]:
# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

0.05


In [34]:
# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.05


# Play ground