<a href="https://colab.research.google.com/github/rithikkulkarni/ALDA-Course-Project/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Pull the labeled stock-tweet dataset directly from the project repository
LABELED_TWEETS_URL = 'https://raw.githubusercontent.com/rithikkulkarni/ALDA-Course-Project/refs/heads/main/datasets/labeled_stock_tweets.csv'
labeled_df = pd.read_csv(LABELED_TWEETS_URL)

# Print a structural summary of the loaded frame
labeled_df.info()

In [None]:
import pandas as pd

# Size of the rarest class: every class is downsampled to this many rows
label_counts = labeled_df['Label'].value_counts()
min_count = label_counts.min()

# Draw `min_count` rows per label (seeded for reproducibility) and renumber the index
balanced_df = (
    labeled_df
    .groupby('Label')
    .sample(n=min_count, random_state=42)
    .reset_index(drop=True)
)

# Confirm that each label now has the same number of rows
print(balanced_df['Label'].value_counts())


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources used below (same three, same order)
for corpus_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# English stopwords held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Shared stemmer / lemmatizer instances reused by clean_tweet
ps, lemmatizer = PorterStemmer(), WordNetLemmatizer()

def clean_tweet(tweet, remove_stopwords=True, apply_stemming=True, apply_lemmatization=True):
    """
    Normalize a tweet into a single space-separated string of cleaned tokens.

    Steps, in order: lowercase the text; strip URLs; replace each @mention with
    the placeholder ``USER``; drop the ``#`` sign (the hashtag word is kept);
    drop digits; turn punctuation/special characters into spaces; collapse any
    run of three or more identical characters to one; split on whitespace; then
    optionally remove stopwords, stem, and lemmatize (in that order).

    Parameters:
        tweet (str): Tweet text to be cleaned. ``None`` and float NaN are
            treated as missing; any other non-string value is passed through ``str()``.
        remove_stopwords (bool): Whether to drop tokens found in the module-level
            ``stop_words`` set
        apply_stemming (bool): Whether to stem each token with the module-level
            ``ps`` stemmer
        apply_lemmatization (bool): Whether to lemmatize each token with the
            module-level ``lemmatizer`` (runs after stemming when both are on)

    Returns:
        str: The cleaned tokens joined by single spaces; an empty string for
        missing input or when nothing survives cleaning
    """
    # Missing values (None, or the float NaN pandas uses for empty cells) clean
    # to an empty string instead of raising on `.lower()`
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ''

    # Lowercase the tweet
    tweet = str(tweet).lower()

    # Remove URLs (`http\S+` already covers https links, so no separate branch)
    tweet = re.sub(r'http\S+|www\S+', '', tweet)

    # Replace @ mentions with a placeholder
    tweet = re.sub(r'@\w+', 'USER', tweet)

    # Remove hashtags but keep the words
    tweet = re.sub(r'#', '', tweet)

    # Remove numbers
    tweet = re.sub(r'\d+', '', tweet)

    # Replace punctuation and special characters with a space rather than
    # deleting them, so words separated only by punctuation are not fused
    # ("buy,sell" -> "buy sell") and contractions split into pieces
    # ("don't" -> "don t") instead of forming a new token ("dont") that the
    # stopword filter cannot recognize
    tweet = re.sub(r'[^\w\s]', ' ', tweet)

    # Shrink words with overused repeated letters (e.g., "soooo" -> "so");
    # only runs of 3+ identical characters are collapsed
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    # Tokenize the tweet into words (split() also discards extra whitespace)
    words = tweet.split()

    # Remove stopwords
    if remove_stopwords:
        words = [word for word in words if word not in stop_words]

    # Apply stemming
    if apply_stemming:
        words = [ps.stem(word) for word in words]

    # Apply lemmatization
    if apply_lemmatization:
        words = [lemmatizer.lemmatize(word) for word in words]

    # Rejoin the words into a single string
    return ' '.join(words)


In [None]:
# Clean the tweet text. Missing tweets are mapped to '' first; wrapping them in
# str() would turn NaN into the literal token "nan".
labeled_df['Cleaned_Tweet'] = labeled_df['Tweet'].fillna('').astype(str).apply(clean_tweet)

# balanced_df was sampled from labeled_df before 'Cleaned_Tweet' existed, so it
# does not inherit the column; add it here because the modeling cell reads
# balanced_df['Cleaned_Tweet'].
balanced_df['Cleaned_Tweet'] = balanced_df['Tweet'].fillna('').astype(str).apply(clean_tweet)

In [None]:
# Preview the first 50 rows to spot-check the raw tweets against their cleaned text
labeled_df.head(50)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Explanatory (cleaned text) and response (label) variables, taken from the
# class-balanced subset of the data
X = balanced_df['Cleaned_Tweet']
y = balanced_df['Label']

# Hold out 20% of the rows for testing BEFORE vectorizing. Fitting TF-IDF on
# all rows would let test-set vocabulary and document frequencies leak into
# the training features and inflate the reported scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary / IDF weights from the training tweets only,
# then apply that fitted transform to the held-out tweets
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-corpus matrix, kept under its original name for any later cell that uses it
X_features = vectorizer.transform(X)

# Train logistic regression model on the training features
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model on the held-out test set
y_pred = lr_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
### To be used for stock price

import yfinance as yf

# Date window passed to the price-history request
history_window = {"start": "2024-02-05", "end": "2024-02-12"}

# Fetch AAPL price history for that window and print it
ticker = yf.Ticker("AAPL")
hist = ticker.history(**history_window)
print(hist)