In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from tqdm import tqdm
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler 
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers

# Load the data

In [None]:
# reading tweets data
df=pd.read_csv('../input/bitcoin-tweets/train_data_inference.csv')
df_test=pd.read_csv('../input/bitcoin-tweets/test_data_inference.csv')

In [None]:
df=df.drop(['Unnamed: 0','username', 'date', 'country', 'replyCount', 'retweetCount', 'likeCount', 'url', 'textblob_class', 'vader_class'],axis=1)
df_new = df
df= df.drop(['content', 'inference'], axis =1)
df_test=df_test.drop(['Unnamed: 0','username', 'date', 'country', 'inference', 'content','replyCount', 'retweetCount', 'likeCount', 'url', 'textblob_class', 'vader_class'],axis=1)

In [None]:
# preview the train frame after 'content' and 'inference' were dropped
df.head()

In [None]:
# preview the train frame that still has 'content' and 'inference'
df_new.head()

In [None]:
# preview the test frame
df_test.head()

In [None]:
# getting value counts for each sentiment in train df
# (the printed line acts as a header for the Series shown as the cell output)
print("Label   Count")
df['sentiment'].value_counts()

# SVM

In [None]:
# create feature vectors using tfidf
# The vectorizer is fitted on the train text only and then applied to the test text.
tfidf_params = dict(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)
vectorizer = TfidfVectorizer(**tfidf_params)

train_vectors = vectorizer.fit_transform(df['processed_content'])
test_vectors = vectorizer.transform(df_test['processed_content'])

In [None]:
# fit the svm models (linear and RBF kernel) on the same TF-IDF features and labels
train_labels = df['sentiment']

linear = svm.SVC(kernel='linear').fit(train_vectors, train_labels)

rbf_params = dict(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo')
rbf = svm.SVC(**rbf_params).fit(train_vectors, train_labels)

In [None]:
# make predictions with both fitted models
# (only `linear` and `rbf` are fitted above; `rbf_pred` is what the RBF metrics cell reads)
linear_pred = linear.predict(test_vectors)
rbf_pred = rbf.predict(test_vectors)

In [None]:
# Evaluate the linear-kernel SVM on the test set.
# The scores are stored as `accuracy` / `f1` instead of `accuracy_score` / `f1_score`,
# so the imported scikit-learn metric functions are not overwritten and stay callable
# in later cells (and when this cell is re-run).
y_test = df_test['sentiment']
accuracy = round(metrics.accuracy_score(y_test, linear_pred), 3)
precision = round(metrics.precision_score(y_test, linear_pred, average="weighted"), 3)
recall = round(metrics.recall_score(y_test, linear_pred, average="weighted"), 3)
f1 = round(metrics.f1_score(y_test, linear_pred, average="weighted"), 3)

print('Linear Kernel:')
print('Accuracy:   ', accuracy)
print('Precision:  ', precision)
print('Recall:     ', recall)
print('F1 Score:   ', f1)

In [None]:
# Evaluate the RBF-kernel SVM on the test set.
# The metric functions are called through the `metrics` module and the scores are
# stored as `accuracy` / `f1`, so this cell works even if the bare names
# accuracy_score / f1_score were rebound to numbers earlier in the session.
y_test = df_test['sentiment']
accuracy = round(metrics.accuracy_score(y_test, rbf_pred), 3)
precision = round(metrics.precision_score(y_test, rbf_pred, average="weighted"), 3)
recall = round(metrics.recall_score(y_test, rbf_pred, average="weighted"), 3)
f1 = round(metrics.f1_score(y_test, rbf_pred, average="weighted"), 3)

print('RBF Kernel:')
print('Accuracy:   ', accuracy)
print('Precision:  ', precision)
print('Recall:     ', recall)
print('F1 Score:   ', f1)

In [None]:
# plot confusion matrix of the linear-kernel model on the test set
# NOTE(review): plot_confusion_matrix may not exist in the installed scikit-learn
# (newer releases provide metrics.ConfusionMatrixDisplay.from_estimator instead) — confirm the pinned version.
plot_confusion_matrix(linear, test_vectors, df_test['sentiment']) 
plt.show()

In [None]:
# same plot for the RBF-kernel model
plot_confusion_matrix(rbf, test_vectors, df_test['sentiment']) 
plt.show()

# Enhancement models and analysis

## 1. PCA with basic classifiers
##### code reference: https://www.kaggle.com/code/tomras/sentiment-analysis-of-tweets-using-pca-and-ml/notebook

In [None]:
# create feature vectors using tfidf
# (default TfidfVectorizer settings here; this rebinds `vectorizer` from the SVM section)
vectorizer = TfidfVectorizer()
text_features_train = vectorizer.fit_transform(df['processed_content'])
# display (n_documents, n_terms)
text_features_train.shape

In [None]:
# declaring PCA with 3 components
# random_state is fixed so the projection (and the plots built from it) is reproducible
# across runs even when scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=3, random_state=42)
# the sparse TF-IDF matrix is converted to a dense array before fitting
features_train = pca.fit_transform(text_features_train.toarray())
features_train.shape

In [None]:
# adding pca components to df
# The three PCA columns and the sentiment label are placed side by side (aligned on the row index).
pca_frame = pd.DataFrame(features_train)
labels_frame = df[['sentiment']]
df_features_train = pd.concat([pca_frame, labels_frame], axis=1, ignore_index=True)
df_features_train.columns = ['pca_1', 'pca_2', 'pca_3', 'target']
df_features_train.describe(include='all')

In [None]:
# plotting the scatter plot to see how well separated the 3 sentiments are in the extracted PCA components
cmap = {0: 'red', 1: 'blue', -1: 'green'}
# one colour per row; labels missing from cmap are drawn in black
point_colors = [cmap.get(label, 'black') for label in df_features_train['target']]
df_features_train.plot(kind='scatter', x='pca_1', y='pca_2', c=point_colors)

In [None]:
# for reproducibility of the results: seed NumPy, then draw a random ordering of the row positions of df
np.random.seed(42)
n_rows = len(df)
rndperm = np.random.permutation(n_rows)

In [None]:
# first two PCA components coloured by sentiment, rows taken in the shuffled order
shuffled = df_features_train.loc[rndperm, :]
plt.figure(figsize=(16, 10))
sns.scatterplot(data=shuffled, x="pca_1", y="pca_2", hue="target", legend="full", alpha=0.3)

In [None]:
# 3d plot of the three PCA components, coloured by sentiment
# Figure.gca() no longer accepts keyword arguments in current Matplotlib, so the
# 3D axes are created with add_subplot(projection='3d') instead.
fig = plt.figure(figsize=(16, 10))
ax = fig.add_subplot(projection='3d')

shuffled = df_features_train.loc[rndperm, :]
ax.scatter(
    xs=shuffled["pca_1"],
    ys=shuffled["pca_2"],
    zs=shuffled["pca_3"],
    c=df.loc[rndperm, :]["sentiment"],
    cmap='tab10'
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

## 2. Analysing test data misclassifications

In [None]:
count = 0
for i, j, index in zip(df_new.sentiment, df_new.inference, df_new.index):
    if(int(i) == int(j)):
        df_new = df_new.drop(labels=[index], axis=0)
        count = count +1
print(count)

In [None]:
len(df_new)

In [None]:
for content, ind in zip(df_new.processed_content, df_new.index):
    if 'bearish' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

In [None]:
for content, ind in zip(df_new.content, df_new.index):
    if 'bearish' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

In [None]:
for content, ind in zip(df_new.processed_content, df_new.index):
    if 'fall' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

In [None]:
for content, ind in zip(df_new.content, df_new.index):
    if 'fall' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

In [None]:
for content, ind in zip(df_new.processed_content, df_new.index):
    if 'down' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

In [None]:
for content, ind in zip(df_new.content, df_new.index):
    if 'down' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

In [None]:
for content, ind in zip(df_new.processed_content, df_new.index):
    if 'volatility' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

In [None]:
for content, ind in zip(df_new.content, df_new.index):
    if 'volatility' in content:
        print(df_new.sentiment[ind], df_new.inference[ind])

## 3. Data augmentation
##### code reference: https://github.com/kothiyayogesh/medium-article-code/blob/master/How%20I%20dealt%20with%20Imbalanced%20text%20dataset/data_augmentation_using_word_embedding.ipynb

In [None]:
# read file using pandas: the train table is reloaded with all of its original columns
file_name = '../input/bitcoin-tweets/train_data_inference.csv'
df = pd.read_csv(file_name)

# fixed NumPy seed for the augmentation section
np.random.seed(100)
# register tqdm's progress-bar helpers on pandas objects
tqdm.pandas()

In [None]:
def loadEmbeddingMatrix(typeToLoad, vocab_dict):
    """Build an embedding matrix for ``vocab_dict`` from a pretrained embedding.

    Parameters
    ----------
    typeToLoad : str
        One of "gloveTwitter50d", "word2vec", "fasttext", "glove840B300D",
        "glove6B300D", "paragram" or "wikiNews". Selects the file that is read
        and the vector size.
    vocab_dict : dict
        Maps each word to the row it should occupy in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab_dict), embed_size)``. Rows of words found in
        the pretrained embedding hold the pretrained vector; every other row is
        drawn from a normal distribution with the mean and standard deviation of
        all pretrained weights.

    Raises
    ------
    ValueError
        If ``typeToLoad`` is not one of the supported names.
    """
    import gc

    # (path, vector size) for every text-format embedding. Raw strings keep the
    # exact same path values as before without relying on invalid escape sequences.
    text_embeddings = {
        "gloveTwitter50d": (r'embeddings\glove-twitter-27b-50d/glove.twitter.27B.50d.txt', 50),
        "fasttext": (r'embeddings\fasttext/wiki.simple.vec', 300),
        "glove840B300D": ('../input/embeddings/glove.840B.300d.txt', 300),
        "glove6B300D": (r'embeddings\glove.6B\glove.6B.300d.txt', 300),
        "paragram": (r'embeddings\paragram_300_sl999\paragram_300_sl999.txt', 300),
        "wikiNews": (r"embeddings\wiki-news-300d-1M\wiki-news-300d-1M.vec", 300),
    }
    word2vec_path = r"embeddings\GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    if typeToLoad == "word2vec":
        # imported lazily so that the text-format embeddings do not require gensim
        import gensim.models.keyedvectors as word2vec
        embed_size = 300
        word2vecDict = word2vec.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
        # gensim 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`
        vocab = getattr(word2vecDict, "key_to_index", None)
        if vocab is None:
            vocab = word2vecDict.vocab
        embeddings_index = {word: word2vecDict[word] for word in vocab}
    elif typeToLoad in text_embeddings:
        EMBEDDING_FILE, embed_size = text_embeddings[typeToLoad]
        if typeToLoad in ("gloveTwitter50d", "fasttext"):
            embeddings_index = dict()
            # Transfer the embedding weights into a dictionary, one line of the file at a time.
            with open(EMBEDDING_FILE, encoding='utf-8') as f:
                for line in f:
                    # split on single spaces only: first field is the word, the rest the vector
                    values = line.rstrip().rsplit(' ')
                    if len(values) != embed_size + 1:
                        # header line (e.g. "<count> <dim>" in .vec files) or malformed row
                        continue
                    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
        elif typeToLoad == "wikiNews":
            with open(EMBEDDING_FILE, encoding='utf-8') as f:
                # short lines (such as the header) are skipped
                embeddings_index = dict(get_coefs(*o.split(" ")) for o in f if len(o) > 100)
        else:
            with open(EMBEDDING_FILE, encoding='latin') as f:
                embeddings_index = dict(get_coefs(*o.split(" ")) for o in f)
    else:
        raise ValueError(
            "Unknown embedding type %r; expected 'word2vec' or one of %s"
            % (typeToLoad, sorted(text_embeddings))
        )
    print('Loaded %s word vectors.' % len(embeddings_index))

    gc.collect()
    # Mean and standard deviation of the pretrained weights, so that the randomly
    # initialised rows follow the same statistics.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    del all_embs

    nb_words = len(vocab_dict)
    # One row per vocabulary entry, one column per embedding dimension.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    gc.collect()

    # Overwrite the random rows with the pretrained vector wherever the word is known.
    embeddedCount = 0
    for word, i in vocab_dict.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            try:
                embedding_matrix[i] = embedding_vector
                embeddedCount += 1
            except IndexError:
                # the word's index lies outside the matrix; that word keeps no row
                pass
    print('total embedded:', embeddedCount, 'common words')

    del embeddings_index
    gc.collect()

    # finally return the embedding matrix
    return embedding_matrix

In [None]:
# tokenizing sentence for finding synonym
def make_tokenizer(texts):
    """Return a Keras ``Tokenizer`` that has been fitted on ``texts``."""
    from keras.preprocessing.text import Tokenizer
    fitted = Tokenizer()
    fitted.fit_on_texts(texts)
    return fitted

# fitted on the raw tweet text
tokenizer = make_tokenizer(df['content'])

In [None]:
# word -> index mapping taken from the fitted tokenizer
vocab_dict = tokenizer.word_index

# dictionary of word index: the inverse mapping, index -> word
index_word = {idx: word for word, idx in vocab_dict.items()}

In [None]:
# loading word embedding
# NOTE(review): "glove840B300D" makes loadEmbeddingMatrix read
# '../input/embeddings/glove.840B.300d.txt' — confirm that file is available.
embed_mat = loadEmbeddingMatrix("glove840B300D", vocab_dict)
print("Embedding loaded")

In [None]:
from sklearn.neighbors import NearestNeighbors

synonyms_number = 5      # neighbours kept per word
word_number = 20000      # rows 1 .. word_number-1 of the embedding matrix are queried

# one extra neighbour is requested because the first hit of each query is used as the key below
nn = NearestNeighbors(n_neighbors=synonyms_number + 1)
nn.fit(embed_mat)

# kneighbors returns (distances, indices); only the indices are needed
_, neighbours_mat = nn.kneighbors(embed_mat[1:word_number])

# first index of each row -> the remaining neighbour indices of that row
synonyms = {row[0]: row[1:] for row in neighbours_mat}

In [None]:
# finding nearby synonym - these are not true synonyms, just the words nearest to
# the target word in the embedding space.
import nltk
from nltk.corpus import wordnet

# word -> its first (synonyms_number - 1) neighbouring words, for indices 0..99
synonym = {}
for x in range(0, 100):
    try:
        synonym[index_word[x]] = [index_word[synonyms[x][i]] for i in range(synonyms_number - 1)]
    except (KeyError, IndexError):
        # an index without a word or without a neighbour row is skipped;
        # any other error is no longer silently swallowed
        continue

In [None]:
# can only change words for selected part of speech to preserve semantic meaning.

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def get_pos_tag(word, tagged):
    """Return the tag of the first ``(token, tag)`` pair in ``tagged`` whose token equals ``word``.

    Raises ``IndexError`` when ``word`` does not occur in ``tagged``.
    """
    matching_tags = [tag for token, tag in tagged if token == word]
    return matching_tags[0]

# Load NLTK's pretrained Punkt tokenizer for English
# NOTE: this rebinds `tokenizer`, which previously held the fitted Keras tokenizer;
# `vocab_dict` and `index_word` were already derived from it before this point.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# For every positive tweet, collect the words that can be swapped for a near-synonym.
# dict_syn maps a row label of df to {word: set of replacement lemma names}.
dict_syn = {}

# built once instead of once per token
stop_words = set(nltk.corpus.stopwords.words('english'))
# only these parts of speech are replaced, to preserve the meaning of the tweet
replaceable_pos = ("ADJ", "ADV", "NOUN", "VERB")

for message, ind, senti in zip(df["content"], df.index, df['sentiment']):
    if senti != 1:  # choose only positive tweets
        continue

    # Get the list of words from the entire text and their universal POS tags
    words = word_tokenize(message)
    tagged = nltk.pos_tag(words, tagset="universal")

    temp_dict = {}
    for word in words:
        word_index = vocab_dict.get(word, None)
        pos_tag = get_pos_tag(word, tagged)
        if not word_index or pos_tag not in replaceable_pos or word in stop_words:
            continue

        neighbour_ids = synonyms.get(word_index)
        if neighbour_ids is None:
            # no nearest neighbours were computed for this word index
            continue
        # the first (synonyms_number - 1) embedding neighbours, as words
        nearby_words = {index_word[n] for n in neighbour_ids[:synonyms_number - 1] if n in index_word}

        # keep WordNet lemmas of the same part of speech that are also embedding neighbours;
        # getattr reads wordnet.ADJ / ADV / NOUN / VERB without eval
        found = {
            lemma.name()
            for syn in wordnet.synsets(word, getattr(wordnet, pos_tag))
            for lemma in syn.lemmas()
            if lemma.name() in nearby_words
        }
        if found:
            temp_dict[word] = found

    if temp_dict:
        dict_syn[ind] = temp_dict

              

In [None]:
# number of augmented positive tweets to generate
# NOTE(review): 1539 and 760 are hard-coded — presumably the size of the largest class
# and of the positive class; confirm against df['sentiment'].value_counts().
tweets_to_make = 1539-760
tweets_to_make

In [None]:
# the generation loop in the next cell appends the augmented tweets to this list
list_new_tweets = []

In [None]:
# generating augmented tweets from the synonym dict created earlier
#
# Each pass over dict_syn emits one new tweet per remaining entry. A word with several
# candidate synonyms consumes one per pass; an entry is removed once every word in it is
# down to its last synonym and that one has been used. The loop stops when enough tweets
# were produced or dict_syn is exhausted — including when dict_syn is empty from the
# start, which previously looped forever.
while True:
    for k in list(dict_syn):
        if tweets_to_make <= 0:
            break

        options_by_word = dict_syn[k]
        # True while at least one word of this tweet still has more than one unused synonym
        has_spare = any(len(options) > 1 for options in options_by_word.values())

        tweet = df['content'][k]
        for word_to_replace, options in options_by_word.items():
            if has_spare and len(options) == 1:
                # single remaining synonym: reuse it without consuming it
                replacement = next(iter(options))
            else:
                replacement = options.pop()
            # replace whole words only, so that e.g. "up" is not rewritten inside "update"
            pattern = r'(?<!\w)' + re.escape(word_to_replace) + r'(?!\w)'
            tweet = re.sub(pattern, lambda _match, repl=replacement: repl, tweet)

        if not has_spare:
            # every synonym of this tweet has now been used
            del dict_syn[k]

        tweets_to_make -= 1
        list_new_tweets.append(tweet)

    print("Remaining tweets: ", tweets_to_make)
    if tweets_to_make <= 0 or not dict_syn:
        break

In [None]:
# wrap the generated tweets in a one-column frame
df_aug_pos = pd.DataFrame(list_new_tweets, columns=['content'])

In [None]:
# label every tweet of this batch with sentiment 1
df_aug_pos['sentiment'] = 1

In [None]:
df_aug_pos.head()

In [None]:
# For every negative tweet, collect the words that can be swapped for a near-synonym.
# dict_syn maps a row label of df to {word: set of replacement lemma names}.
dict_syn = {}

# built once instead of once per token
stop_words = set(nltk.corpus.stopwords.words('english'))
# only these parts of speech are replaced, to preserve the meaning of the tweet
replaceable_pos = ("ADJ", "ADV", "NOUN", "VERB")

for message, ind, senti in zip(df["content"], df.index, df['sentiment']):
    if senti != -1:  # choose only negative tweets
        continue

    # Get the list of words from the entire text and their universal POS tags
    words = word_tokenize(message)
    tagged = nltk.pos_tag(words, tagset="universal")

    temp_dict = {}
    for word in words:
        word_index = vocab_dict.get(word, None)
        pos_tag = get_pos_tag(word, tagged)
        if not word_index or pos_tag not in replaceable_pos or word in stop_words:
            continue

        neighbour_ids = synonyms.get(word_index)
        if neighbour_ids is None:
            # no nearest neighbours were computed for this word index
            continue
        # the first (synonyms_number - 1) embedding neighbours, as words
        nearby_words = {index_word[n] for n in neighbour_ids[:synonyms_number - 1] if n in index_word}

        # keep WordNet lemmas of the same part of speech that are also embedding neighbours;
        # getattr reads wordnet.ADJ / ADV / NOUN / VERB without eval
        found = {
            lemma.name()
            for syn in wordnet.synsets(word, getattr(wordnet, pos_tag))
            for lemma in syn.lemmas()
            if lemma.name() in nearby_words
        }
        if found:
            temp_dict[word] = found

    if temp_dict:
        dict_syn[ind] = temp_dict
              

In [None]:
# class sizes of the reloaded train frame
df['sentiment'].value_counts()

In [None]:
# number of augmented negative tweets to generate
# NOTE(review): 1539 and 535 are hard-coded — presumably taken from the counts shown above; confirm.
tweets_to_make = 1539-535
tweets_to_make

In [None]:
# reset the output list for the negative batch
list_new_tweets = []

In [None]:
# generating augmented negative tweets from the synonym dict created above
#
# Each pass over dict_syn emits one new tweet per remaining entry. A word with several
# candidate synonyms consumes one per pass; an entry is removed once every word in it is
# down to its last synonym and that one has been used. The loop stops when enough tweets
# were produced or dict_syn is exhausted — including when dict_syn is empty from the
# start, which previously looped forever.
while True:
    for k in list(dict_syn):
        if tweets_to_make <= 0:
            break

        options_by_word = dict_syn[k]
        # True while at least one word of this tweet still has more than one unused synonym
        has_spare = any(len(options) > 1 for options in options_by_word.values())

        tweet = df['content'][k]
        for word_to_replace, options in options_by_word.items():
            if has_spare and len(options) == 1:
                # single remaining synonym: reuse it without consuming it
                replacement = next(iter(options))
            else:
                replacement = options.pop()
            # replace whole words only, so that e.g. "up" is not rewritten inside "update"
            pattern = r'(?<!\w)' + re.escape(word_to_replace) + r'(?!\w)'
            tweet = re.sub(pattern, lambda _match, repl=replacement: repl, tweet)

        if not has_spare:
            # every synonym of this tweet has now been used
            del dict_syn[k]

        tweets_to_make -= 1
        list_new_tweets.append(tweet)

    print("Remaining tweets: ", tweets_to_make)
    if tweets_to_make <= 0 or not dict_syn:
        break

In [None]:
# wrap the generated tweets in a one-column frame
df_aug_neg = pd.DataFrame(list_new_tweets, columns=['content'])

In [None]:
# label every tweet of this batch with sentiment -1
df_aug_neg['sentiment'] = -1

In [None]:
df_aug_neg.head()

In [None]:
# concatenating the positive and negative augmented tweets (fresh 0..n-1 index)
df_augmented = pd.concat([df_aug_pos, df_aug_neg], ignore_index=True)

In [None]:
df_augmented.head()

In [None]:
# number of augmented tweets per label
df_augmented['sentiment'].value_counts()

In [None]:
# save the augmented tweets only; no `index` argument is passed, so the row index is written as well
df_augmented.to_csv("./train_augmented_only.csv")

In [None]:
df.head()

In [None]:
# start a frame that holds only the original tweet text ...
df_relavent = pd.DataFrame(df['content'])

In [None]:
# ... and add the original labels
df_relavent['sentiment'] = df['sentiment']

In [None]:
df_relavent.head()

In [None]:
# original tweets followed by the augmented ones (fresh 0..n-1 index)
df_augmented_all = pd.concat([df_relavent, df_augmented], ignore_index=True)

In [None]:
df_augmented_all.head()

In [None]:
# total number of tweets after augmentation
len(df_augmented_all)

In [None]:
# class sizes after augmentation
df_augmented_all['sentiment'].value_counts()

In [None]:
# save original + augmented tweets; no `index` argument is passed, so the row index is written as well
df_augmented_all.to_csv("./train_augmented_all.csv")