<a href="https://colab.research.google.com/github/rithikkulkarni/ALDA-Course-Project/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Importing labeled data
labeled_df = pd.read_csv('https://raw.githubusercontent.com/rithikkulkarni/ALDA-Course-Project/refs/heads/main/datasets/labeled_stock_tweets.csv')
labeled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80793 entries, 0 to 80792
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          80793 non-null  object
 1   Tweet         80793 non-null  object
 2   Stock Name    80793 non-null  object
 3   Company Name  80793 non-null  object
 4   Label         80793 non-null  object
dtypes: object(5)
memory usage: 3.1+ MB


In [2]:
import pandas as pd

# Find the minimum count
min_count = labeled_df['Label'].value_counts().min()

# Create balanced DataFrame by sampling minimum instances from each group
balanced_df = labeled_df.groupby('Label').sample(n=min_count, random_state=42).reset_index(drop=True)

# Print count
print(balanced_df['Label'].value_counts())


Label
negative    8546
neutral     8546
positive    8546
Name: count, dtype: int64


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define stopwords from nltk
stop_words = set(stopwords.words('english'))

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_tweet(tweet, remove_stopwords=True, apply_stemming=True, apply_lemmatization=True):
    """
    Clean a tweet text by normalizing case, removing URLs, mentions, hashtags, numbers,
    special characters, and optionally applying stemming and lemmatization.

    Parameters:
        tweet (str): Tweet text to be cleaned
        remove_stopwords (bool): Whether to remove stopwords
        apply_stemming (bool): Whether to apply stemming to the words
        apply_lemmatization (bool): Whether to apply lemmatization to the words

    Returns:
        str: The cleaned tweet text
    """
    # Lowercase the tweet
    tweet = tweet.lower()

    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)

    # Replace @ mentions with a placeholder
    tweet = re.sub(r'@\w+', 'USER', tweet)

    # Remove hashtags but keep the words
    tweet = re.sub(r'#', '', tweet)

    # Remove numbers
    tweet = re.sub(r'\d+', '', tweet)

    # Remove punctuation and special characters
    tweet = re.sub(r'[^\w\s]', '', tweet)

    # Shrink words with overused repeated letters (e.g., "soooo" -> "so")
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    # Tokenize the tweet into words
    words = tweet.split()

    # Remove stopwords
    if remove_stopwords:
        words = [word for word in words if word not in stop_words]

    # Apply stemming
    if apply_stemming:
        words = [ps.stem(word) for word in words]

    # Apply lemmatization
    if apply_lemmatization:
        words = [lemmatizer.lemmatize(word) for word in words]

    # Rejoin the words into a single string
    return ' '.join(words)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
labeled_df['Cleaned_Tweet'] = labeled_df['Tweet'].apply(lambda x: clean_tweet(str(x)))

In [9]:
labeled_df.head(50)

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Label,Cleaned_Tweet
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",neutral,mainstream medium done amaz job brainwash peop...
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",neutral,tesla deliveri estim around k analyst tsla
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",neutral,even includ unvest rsu addit equiti need rsu x...
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",neutral,user user user hahaha still tri stop tesla fsd...
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",negative,user user stop tri kill kid sad derang old man
5,2022-09-29 22:25:53+00:00,@RealDanODowd @Tesla This is you https://t.co/...,TSLA,"Tesla, Inc.",neutral,user user
6,2022-09-29 22:24:22+00:00,For years @WholeMarsBlog viciously silenced @T...,TSLA,"Tesla, Inc.",neutral,year user vicious silenc user critic fail sile...
7,2022-09-29 22:23:54+00:00,$NIO just because I'm down money doesn't mean ...,TSLA,"Tesla, Inc.",neutral,nio im money doesnt mean bad invest whole mark...
8,2022-09-29 22:23:28+00:00,50 likes for some $SPY $TSLA charts to study!\...,TSLA,"Tesla, Inc.",neutral,like spi tsla chart studi
9,2022-09-29 22:15:01+00:00,@MrJames__321 @KellyRoofing @TeslaSolar @elonm...,TSLA,"Tesla, Inc.",negative,user user user user user powerwal waterproof c...


In [11]:
# Find the minimum count across the classes
min_count = labeled_df['Label'].value_counts().min()

# Create a balanced DataFrame by sampling min_count instances from each group
balanced_df = labeled_df.groupby('Label').sample(n=min_count, random_state=42).reset_index(drop=True)

# Display the count of each class to verify balance
print(balanced_df['Label'].value_counts())

Label
negative    8546
neutral     8546
positive    8546
Name: count, dtype: int64


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Divide dataframe into explanatory and response variable, randomly sampled balanced subset of dataframe
X = balanced_df['Cleaned_Tweet']
y = balanced_df['Label']

# Vectorize tweets using TF-IDF
vectorizer = TfidfVectorizer()
X_features = vectorizer.fit_transform(X)

# 20% train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42
)

# Train logistic regression model on data
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Step 5: Evaluate the model on the test set
y_pred = lr_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7320592823712948
Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.74      0.76      1738
     neutral       0.69      0.69      0.69      1693
    positive       0.73      0.76      0.75      1697

    accuracy                           0.73      5128
   macro avg       0.73      0.73      0.73      5128
weighted avg       0.73      0.73      0.73      5128



In [None]:
### To be used for stock price

import yfinance as yf

ticker = yf.Ticker("AAPL")
hist = ticker.history(start="2024-02-05", end="2024-02-12")
print(hist)