# Creating a sentiment analysis model.
I am using a dataset of 1.6 million tweets from Kaggle as the training dataset. It is not strictly a dataset made for stock sentiment analysis; there are some datasets that are restricted to stocks, and I hypothesise that the same analysis will be applicable to them. I will test this by using these restricted datasets for validation. The dataset will not be in the repository as it is too large, but I have included a link to it below. I have also drawn on a Medium tutorial on sentiment analysis in PyTorch.

The dataset: https://www.kaggle.com/datasets/kazanova/sentiment140?resource=download

Medium Tutorial: https://bhadreshpsavani.medium.com/tutorial-on-sentimental-analysis-using-pytorch-b1431306a2d7

In [None]:
### import libraries
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
import json
import csv
from string import punctuation
from collections import Counter
 
# for printing out status reports
import sys

# for data visualization
import matplotlib.pyplot as plt


In [None]:
# Pick the compute device, prioritising Apple Silicon (MPS), then NVIDIA CUDA,
# and lastly the CPU.
has_gpu = torch.cuda.is_available()

# torch.backends.mps may be missing on older PyTorch builds, so look it up
# defensively. is_available() is the supported runtime check; the old
# torch.has_mps attribute is deprecated.
_mps_backend = getattr(torch.backends, "mps", None)
has_mps = _mps_backend is not None and bool(_mps_backend.is_available())

# NOTE: the CUDA device string is "cuda"; "gpu" is not a valid torch device
# and would fail as soon as a tensor or model is moved to it.
device = "mps" if has_mps else "cuda" if has_gpu else "cpu"

print(device)

In [None]:
# Read the raw tweet CSV (no header row, latin-1 encoded) and immediately drop
# columns 1-4, which are not needed for this analysis (per the original note
# they hold things like the tweet author and the date/time).
df = pd.read_csv(
    '../Sentiment_Analysis_Strategy/training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin-1',
).drop(columns=[1, 2, 3, 4])

# Give the two remaining columns readable names.
df.columns = ['Sentiment', 'Tweet_text']

display(df.head())

In [None]:
# This function is used in the next cell to strip the punctuation characters
# out of the tweets.

# Translation table that deletes every character in string.punctuation.
# Built once so it is not recreated for each tweet.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)


def removespecial(tweet):
    """Return ``tweet`` with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are removed. Everything else (letters, digits, whitespace, non-ASCII
    characters) is kept unchanged and in the original order.

    Args:
        tweet: The tweet text as a ``str``.

    Returns:
        A new string without punctuation characters.
    """
    # A single translate pass replaces the per-character loop that rebuilt
    # the result string with repeated concatenation.
    return tweet.translate(_PUNCTUATION_TABLE)

In [None]:
# Normalise the tweet text: lower-case each tweet, then strip its punctuation.
# Series.apply is used rather than an explicit Python loop over the rows,
# which would take far too long on this many tweets.
df['Tweet_text'] = df['Tweet_text'].apply(
    lambda text: removespecial(text.lower())
)

In [None]:
# Sanity check: look at the first five rows to confirm the tweets are now in
# the format we are looking for.

display(df.head(n=5))

In [None]:
# The model will later need integers as its inputs, so the words have to be
# enumerated, and for that we need to know how often each word occurs.
# Step 1: split every tweet on whitespace into a list of its words.
df['Tweet_words'] = df['Tweet_text'].apply(str.split)

# Step 2: count the words of each tweet separately. Joining all tweets into
# one huge string and counting that can take a very long time, so the
# per-tweet counters are built here and added together in later cells instead.
df['Tweets_counted'] = df['Tweet_words'].apply(Counter)

display(df)


In [None]:
# Combine the per-tweet counters in `divisions` contiguous chunks of rows
# (1000 tweets per chunk when the frame holds 1.6 million rows). Each chunk
# gets its own Counter, stored under the chunk number as a string key; the
# chunk counters are merged into a single Counter in the next cell.
collections = {}
divisions = 1600
n_rows = len(df.index)
for i in range(divisions):
    # Integer arithmetic keeps the slice bounds exact; float division followed
    # by int() can be off by one when n_rows is not a multiple of `divisions`.
    start = n_rows * i // divisions
    stop = n_rows * (i + 1) // divisions

    # Counter.update adds counts in place. Summing with `+` builds a brand-new
    # Counter for every tweet, and summing an empty slice yields the int 0
    # rather than a Counter, which breaks the merge in the next cell.
    chunk_counter = Counter()
    for tweet_counter in df['Tweets_counted'].iloc[start:stop]:
        chunk_counter.update(tweet_counter)
    collections[f"{i}"] = chunk_counter


In [None]:
# Combine the per-chunk counters into one large Counter of every word and how
# often it appears.
# Counter.update accumulates in place; sum(..., Counter()) would instead copy
# the ever-growing total once per chunk.
word_count = Counter()
for chunk_counter in collections.values():
    word_count.update(chunk_counter)

In [None]:
# List of (word, count) pairs ordered from most to least common.
# With no argument, most_common() returns every element of the counter.

sorted_words = word_count.most_common()

In [None]:
# Map every word to its 1-based rank in the frequency ordering (the most
# common word gets 1). Index 0 is left unused here.

Enumerated_words = {
    word: rank for rank, (word, _count) in enumerate(sorted_words, start=1)
}

In [None]:
from itertools import islice

# Preview only the first 20 entries (the dict keeps insertion order) instead
# of dumping the whole vocabulary into the cell output, which can be enormous
# for a corpus of this size.
display(dict(islice(Enumerated_words.items(), 20)))

In [None]:
# Now to encode the tweets themselves: each word is replaced by its integer
# index, and 0 is used for any word that is not in the vocabulary (which
# should not happen for the tweets the vocabulary was built from).

def Encode(tweet, vocab=None):
    """Convert a list of words into a list of integer indices.

    Args:
        tweet: Iterable of words (for example one entry of ``Tweet_words``).
        vocab: Optional mapping from word to integer index. When omitted,
            the module-level ``Enumerated_words`` mapping is used.

    Returns:
        A list with one integer per word, in order: the word's index from
        the vocabulary, or 0 when the word is not in the vocabulary.
    """
    if vocab is None:
        # Resolved at call time so the function can be defined before the
        # vocabulary has been built.
        vocab = Enumerated_words

    # dict.get does the lookup and the 0 fallback in a single step.
    return [vocab.get(word, 0) for word in tweet]


In [None]:
# Encode every tweet's word list and store the result in a new column.

df['Encoded_Tweet'] = df['Tweet_words'].apply(Encode)

display(df.head())

In [None]:
# Save the frame to disk so this progress is not lost if the session ends.

df.to_csv(path_or_buf='../Sentiment_Analysis_Strategy/encoded_tweets.csv')