# Trade with feelings
## Twitter sentiment analysis

In [17]:
# Import required libraries

import requests
import os
import json
from dotenv import load_dotenv
import time
import pandas as pd
import csv

#import tweepy
#from twitter_authentication import bearer_token

import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob

# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
    
# Global Parameters
stop_words = set(stopwords.words('english'))


In [2]:
# Load the environment variables from the .env file by calling the load_dotenv function
load_dotenv("twitter.env")

True

In [3]:
# Get the API key from the environment variable and store as Python variable
consumer_key = os.getenv("CONSUMER_KEY")
consumer_secret = os.getenv("CONSUMER_SECRET")
access_token = os.getenv("ACCESS_TOKEN")
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")
bearer_token = os.getenv("BEARER_TOKEN")

type(bearer_token)

str

### Define funtions to reuse in the code

In [36]:
#search_url = "https://api.twitter.com/2/tweets/search/all"
search_url = "https://api.twitter.com/2/tweets/search/recent"

ticker_to_query = 'BTC'

# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
query_params = {'query': f'(#{ticker_to_query} -is:retweet)',
                'tweet.fields': 'created_at,public_metrics,text,author_id', 
                'max_results': '100',
               }

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2FullArchiveSearchPython"
    return r

def connect_to_endpoint(url, params):
    response = requests.request("GET", url, auth=bearer_oauth, params=params)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

def preprocess_tweet_text(tweet):
    tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    #ps = PorterStemmer()
    #stemmed_words = [ps.stem(w) for w in filtered_words]
    #lemmatizer = WordNetLemmatizer()
    #lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(filtered_words)

def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

def polarity_to_string(sentiment):
    if sentiment < 0:
        return "Negative"
    elif sentiment == 0:
        return "Neutral"
    else:
        return "Positive"

def get_sentiment(df, txt_col):
    return df[txt_col].map(lambda txt: TextBlob(txt).sentiment.polarity)


In [37]:
# Request data from Twitter
twitter_json_response = connect_to_endpoint(search_url, query_params)
#print(json.dumps(twitter_json_response, indent=4, sort_keys=True))

200


In [38]:
#type(twitter_json_response)
#type(twitter_json_response['data'])

print(json.dumps(twitter_json_response['data'][0], indent=4, sort_keys=True))

{
    "author_id": "1552242131632136192",
    "created_at": "2022-08-03T21:58:43.000Z",
    "id": "1554949914882641920",
    "public_metrics": {
        "like_count": 0,
        "quote_count": 0,
        "reply_count": 0,
        "retweet_count": 0
    },
    "text": "@superconnecter IKOLF #ikolf #bsc #bscgems #100x #floki #memecoin  #nfts #nft #1000x #bnb\u202f\u202f\u202f\u202f #btc\u202f\u202f\u202f #SHIB  #shiba #doge #bep20 @elonmusk  @1goonrich #cro\u202f\u202f\u202f\u202f#cryptocurrency #avax\u202f\u202f\u202f\u202f\u202f#eth"
}


In [40]:
myFile = open('twitter_data.csv', 'w', errors='ignore')
writer = csv.DictWriter(myFile, fieldnames=['text', 'id', 'author_id','created_at','public_metrics'])
writer.writeheader()
writer.writerows(twitter_json_response['data'])
myFile.close()

#myFile = open('twitter_data.csv', 'r')
#print("The content of the csv file is:")
#print(myFile.read())
#myFile.close()

### Process the data

In [9]:
# Convert Json to DataFrame
tweets_df = pd.read_json(json.dumps(twitter_json_response['data']))
tweets_df.head()

Unnamed: 0,text,id,author_id,created_at,public_metrics
0,@monkeygunz1 #AmazingLove to the #spookyshiba ...,1554936877873958912,4309139774,2022-08-03 21:06:55+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
1,@monkeygunz1 #AmazingLove to the #spookyshiba ...,1554936864611610624,4309139774,2022-08-03 21:06:52+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
2,Current #Bitcoin price is $23359.6 USD.\n\nFol...,1554936861491175424,1543816638692151296,2022-08-03 21:06:51+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
3,Uyumayan 100 kişi varmıdır ? 🖐️\n#btc #binance,1554936857431080960,1397362012456378368,2022-08-03 21:06:50+00:00,"{'retweet_count': 3, 'reply_count': 2, 'like_c..."
4,@MMF_Canada @MMFcrypto #AmazingLove to the #sp...,1554936855870820352,1552698277245624320,2022-08-03 21:06:50+00:00,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."


In [27]:
# Preprocess data
tweets_df.text = tweets_df['text'].apply(preprocess_tweet_text)
tweets_df.head()

Unnamed: 0,author_id,id,public_metrics,text,created_at,geo
0,2195266826,1553943651155431424,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Saitama SaitamaWolfPack SaitaPro Saita Crypto ...,2022-08-01 03:20:11+00:00,
1,230145424,1553943648819093504,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Breaking news BTC Crypto Binance Altcoins ADA ...,2022-08-01 03:20:11+00:00,
2,1125284714976141312,1553943647175036928,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",BTCJPY 3102007円 ETHJPY 223743円 XRPJPY 50463円 L...,2022-08-01 03:20:11+00:00,
3,1539096372560818176,1553943635582140416,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",The Theory Cosmopolitanism HyperNation Watch f...,2022-08-01 03:20:08+00:00,
4,2195266826,1553943623711981568,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",SaitamaWolfPack SaitaPro cryptocurrencies BTC ...,2022-08-01 03:20:05+00:00,


In [28]:
# Get the sentiment of the full_text and label it
tweets_df['textblob_sentiment'] = get_sentiment(tweets_df, 'text')
tweets_df['sentiment_label'] = tweets_df['textblob_sentiment'].apply(polarity_to_string)

tweets_df.head()

Unnamed: 0,author_id,id,public_metrics,text,created_at,geo,textblob_sentiment,sentiment_label
0,2195266826,1553943651155431424,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Saitama SaitamaWolfPack SaitaPro Saita Crypto ...,2022-08-01 03:20:11+00:00,,0.0,Neutral
1,230145424,1553943648819093504,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Breaking news BTC Crypto Binance Altcoins ADA ...,2022-08-01 03:20:11+00:00,,0.0,Neutral
2,1125284714976141312,1553943647175036928,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",BTCJPY 3102007円 ETHJPY 223743円 XRPJPY 50463円 L...,2022-08-01 03:20:11+00:00,,0.0,Neutral
3,1539096372560818176,1553943635582140416,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",The Theory Cosmopolitanism HyperNation Watch f...,2022-08-01 03:20:08+00:00,,0.35,Positive
4,2195266826,1553943623711981568,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",SaitamaWolfPack SaitaPro cryptocurrencies BTC ...,2022-08-01 03:20:05+00:00,,0.0,Neutral


In [29]:
# Remove unwanted columns
tweets_df = remove_unwanted_cols(tweets_df, ['public_metrics','geo','author_id','id'])
tweets_df = tweets_df[['text', 'sentiment_label', 'textblob_sentiment', 'created_at']]
tweets_df.head()

Unnamed: 0,text,sentiment_label,textblob_sentiment,created_at
0,Saitama SaitamaWolfPack SaitaPro Saita Crypto ...,Neutral,0.0,2022-08-01 03:20:11+00:00
1,Breaking news BTC Crypto Binance Altcoins ADA ...,Neutral,0.0,2022-08-01 03:20:11+00:00
2,BTCJPY 3102007円 ETHJPY 223743円 XRPJPY 50463円 L...,Neutral,0.0,2022-08-01 03:20:11+00:00
3,The Theory Cosmopolitanism HyperNation Watch f...,Positive,0.35,2022-08-01 03:20:08+00:00
4,SaitamaWolfPack SaitaPro cryptocurrencies BTC ...,Neutral,0.0,2022-08-01 03:20:05+00:00


### Split dataset into Train, Test

In [32]:
# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(tweets_df.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(tweets_df.iloc[:, 1]).ravel())
y = np.array(tweets_df.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)


In [33]:
# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

0.05


In [34]:
# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.05


# Play ground