In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Fetch every NLTK resource this notebook relies on.
# 'punkt_tab' is requested in addition to 'punkt': newer NLTK releases (3.9 and
# later) load the tokenizer models used by word_tokenize from 'punkt_tab' and
# raise LookupError when only the legacy 'punkt' package is installed.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\osi0pr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\osi0pr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\osi0pr\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\osi0pr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [43]:
# Load the finance-stocks training CSV into `data`; the path is relative to the
# notebook's working directory. This frame is the input to the sentiment-analysis
# preparation steps in the cells below.
data = pd.read_csv('data/finance_stocks/finance_stocks-train.csv')

In [39]:
# Report the frame's dimensions and column labels as it currently stands
print('Dataset size:', data.shape)
print('Dataset columns:', data.columns)

# Discard every row holding at least one missing value
# (axis=0 / how='any' spell out the dropna defaults)
data = data.dropna(axis=0, how='any')

# Summarise dtypes and non-null counts, then preview the first rows as the cell output
data.info()
data.head(n=5)

Dataset size: (14363, 8)
Dataset columns: Index(['timestamp', 'tweet_text', 'tweet_url', 'tweet_type', 'price_of_ticker',
       'change_of_ticker', 'tickers_mentioned', 'category'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 14363 entries, 0 to 22382
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   timestamp          14363 non-null  object
 1   tweet_text         14363 non-null  object
 2   tweet_url          14363 non-null  object
 3   tweet_type         14363 non-null  object
 4   price_of_ticker    14363 non-null  object
 5   change_of_ticker   14363 non-null  object
 6   tickers_mentioned  14363 non-null  object
 7   category           14363 non-null  object
dtypes: object(8)
memory usage: 1009.9+ KB


Unnamed: 0,timestamp,tweet_text,tweet_url,tweet_type,price_of_ticker,change_of_ticker,tickers_mentioned,category
0,2023-11-15T09:36:42.028000+00:00,$GOLD's Massive Range.\n\nIn the past ~year we...,https://twitter.com/user/status/17247221551437...,tweet,['15.71'],['+3.69%'],['$GOLD'],stock_images
1,2023-11-15T06:01:59.788000+00:00,RT @SmartReversals: $SPX - Daily Chart:\n\n✅Ta...,https://twitter.com/user/status/17246687922221...,retweet,['4495.71'],['+1.87%'],['$SPX'],stock_images
2,2023-11-15T06:01:55.590000+00:00,RT @SmartReversals: $NDX - Daily Chart:\n\n✅Ta...,https://twitter.com/user/status/17246687824535...,retweet,['15812.473'],['+2.08%'],['$NDX'],stock_images
3,2023-11-15T06:01:51.329000+00:00,RT @SmartReversals: $IWM - Daily Chart:\n\n✅Ta...,https://twitter.com/user/status/17246687591081...,retweet,['178.46'],['+5.21%'],['$IWM'],stock_images
4,2023-11-15T04:01:49.009000+00:00,RT @coiledspringcap: Everyone has been concern...,https://twitter.com/user/status/17246376898767...,retweet,['4495.71'],['+1.87%'],['$SPX'],stock_images


In [52]:
def clean_tweets_column(df, column_name, cleaned_column, stop_words=None):
    """Add a normalised, stop-word-free copy of a tweet column to ``df``.

    Each value is processed as follows: hyperlinks, ``@user`` mentions and ``#``
    signs are removed, typographic apostrophes are mapped to ``'``, remaining
    non-ASCII characters (emojis etc.) are dropped, the text is lower-cased,
    punctuation and digits are deleted, and stop words are filtered out. The
    surviving tokens are joined with single spaces. Missing values produce an
    empty string rather than the literal text ``"nan"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw tweets. It is modified in place.
    column_name : str
        Name of the column containing the raw tweet text.
    cleaned_column : str
        Name of the column that receives the cleaned text (created or
        overwritten).
    stop_words : iterable of str, optional
        Words to discard (matched case-insensitively). Defaults to NLTK's
        English stop-word list.

    Returns
    -------
    None
        The result is written to ``df[cleaned_column]``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call instead of once per tweet.
    stop_set = frozenset(word.lower() for word in stop_words) - {''}
    punctuation_table = str.maketrans('', '', string.punctuation)

    def remove_stopwords(tokens):
        """Return the tokens that are not stop words."""
        return [token for token in tokens if token not in stop_set]

    def clean_tweet(tweet):
        """Normalise a single tweet string."""
        # Remove hyperlinks
        tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet)
        # Remove user @ references and '#' from hashtags
        tweet = re.sub(r'\@\w+|\#', '', tweet)
        # Map the typographic apostrophe to ASCII so contractions such as
        # "don’t" are still recognisable after the non-ASCII strip below
        tweet = tweet.replace('\u2019', "'")
        # Remove emojis and any other non-ASCII characters
        tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
        tweet = tweet.lower()
        # First stop-word pass, done while apostrophes are still present:
        # stop words like "don't" would otherwise become "dont" once the
        # punctuation is deleted and slip through the filter.
        tokens = [
            token for token in tweet.split()
            if token.strip(string.punctuation) not in stop_set
        ]
        # Remove punctuation and numbers
        tweet = ' '.join(tokens).translate(punctuation_table)
        tweet = re.sub('[0-9]+', '', tweet)
        # Second pass catches tokens that only became stop words after the
        # punctuation/digit removal; split/join also collapses extra spaces.
        return ' '.join(remove_stopwords(tweet.split()))

    raw_text = df[column_name]
    # astype(str) alone would turn missing values into the word "nan",
    # so blank them out explicitly before cleaning.
    text = raw_text.astype(str).where(raw_text.notna(), '')
    df[cleaned_column] = text.apply(clean_tweet)

# Clean the raw 'tweet_text' column of `data`, storing the result in 'clean_tweet_text'
clean_tweets_column(
    df=data,
    column_name='tweet_text',
    cleaned_column='clean_tweet_text',
)

In [61]:
# Create a single module-level PorterStemmer; stem_text (defined below) calls
# stemmer.stem on every token, so the same instance is reused for all tweets.
stemmer = PorterStemmer()

# Function to apply stemming to a text
def stem_text(text):
    """Return ``text`` with every token replaced by its Porter stem.

    The text is split with NLTK's ``word_tokenize``, each token is passed
    through the module-level ``stemmer``, and the resulting stems are joined
    back together with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in word_tokenize(text))

# Stem the cleaned tweets ('clean_tweet_text') into a new 'stemmed_text' column
data['stemmed_text'] = data['clean_tweet_text'].map(stem_text)

# Print the text before and after stemming so the two can be compared
print("Original Text:\n", data.loc[:, 'clean_tweet_text'])
print("\nStemmed Text:\n", data.loc[:, 'stemmed_text'])

Original Text:
 0        golds massive range past year weve swept lows ...
1        rt spx daily chart target bull move mentioned ...
2        rt ndx daily chart target bull move mentioned ...
3        rt iwm daily chart target bull move mentioned ...
4        rt everyone concerned concentration stockmarke...
                               ...                        
22378                                  trade dxy trade dxy
22379    get apology im glad agree dxy looking bad bear...
22380    ecb hawkish fed coming end hiking cycle bearis...
22381                         dxy monster intraday reclaim
22382                                dxy starting daily eq
Name: clean_tweet_text, Length: 22383, dtype: object

Stemmed Text:
 0        gold massiv rang past year weve swept low high...
1        rt spx daili chart target bull move mention be...
2        rt ndx daili chart target bull move mention be...
3        rt iwm daili chart target bull move mention la...
4        rt everyon concern co