# Assignment 5 


Zach Novak, Marco Bogani, Sulaiman Karmali, Daman Sawhney, Ivan Lima

In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk import download

In [27]:
# Download necessary NLTK resources

In [None]:
# Fetch the NLTK data used by word_tokenize, WordNetLemmatizer and stopwords.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer tables from 'punkt_tab' instead of the pickled 'punkt' model;
# without it word_tokenize raises LookupError on those versions.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    download(resource)

1. Load the provided dataset containing financial news headlines and sentiment labels. Perform exploratory data analysis to understand the structure of the dataset, distribution of sentiment labels, and any other relevant insights. ( 5 points )

In [28]:
# Load the Loughran-McDonald Master Dictionary

In [None]:
# Sentiment lexicon: read the Loughran-McDonald master dictionary into a DataFrame.
lm_dictionary_path = 'Loughran-McDonald_MasterDictionary_1993-2023.csv'
data = pd.read_csv(lm_dictionary_path)

In [None]:
# Load the CNBC headlines dataset

In [None]:
# News corpus: read the CNBC headlines file into a DataFrame.
cnbc_headlines_path = 'cnbc_headlines2.csv'
cnbc_data = pd.read_csv(cnbc_headlines_path)

2. Clean the text data by removing punctuation, special characters, and irrelevant symbols. Tokenize the headlines and convert them to lowercase for uniformity. Implement techniques like stemming or lemmatization to normalize the text data. ( 5 points )

In [None]:
# Drop rows with NaN values in the 'Description' column

In [None]:
# Discard rows that have no 'Description' text, modifying cnbc_data in place.
cnbc_data.dropna(subset=['Description'], inplace=True)

In [None]:
# Filter positive and negative words

In [None]:
# Lower-cased sentiment lexicons, stored as frozensets. analyze_sentiment tests
# every token for membership, which is a hash lookup on a set but a full scan
# on the ndarray that Series.unique() returns. dropna() keeps any missing
# 'Word' cells out of the lexicons.
positive_words = frozenset(
    data.loc[data['Positive'] > 0, 'Word'].dropna().str.lower()
)
negative_words = frozenset(
    data.loc[data['Negative'] > 0, 'Word'].dropna().str.lower()
)

In [None]:
# Define a function to analyze sentiment

In [None]:
def analyze_sentiment(description):
    """Classify *description* by counting lexicon hits.

    The input is converted to ``str``, lower-cased and tokenized; each token
    found in ``positive_words`` or ``negative_words`` adds one to the
    corresponding tally.

    Returns:
        'Positive' when positive hits outnumber negative ones, 'Negative'
        in the opposite case, and 'Neutral' when the tallies are equal.
    """
    positive_hits = 0
    negative_hits = 0
    for word in word_tokenize(str(description).lower()):
        if word in positive_words:
            positive_hits += 1
        if word in negative_words:
            negative_hits += 1

    # Guard clauses: only a strict majority yields a polar label.
    if positive_hits > negative_hits:
        return 'Positive'
    if negative_hits > positive_hits:
        return 'Negative'
    return 'Neutral'

In [None]:
# Clean and preprocess text data

In [None]:
# Shared preprocessing resources used by clean_text:
# the English stopword list as a set, and a single WordNet lemmatizer instance.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
    """Normalize *text* for the bag-of-words model.

    Steps: strip every character that is not an ASCII letter or whitespace,
    tokenize, lower-case, drop English stopwords, lemmatize, and re-join the
    surviving tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        A space-separated string of lower-cased, lemmatized tokens.
    """
    # Remove punctuation, digits and other symbols.
    # The original call passed `re.I|re.A` as the fourth positional argument
    # of re.sub, which is `count`, not `flags`: no flags were applied and only
    # the first 258 matches were removed. The flags are dropped rather than
    # moved to `flags=`: re.I is redundant (the class lists both cases), and
    # the default Unicode-aware \s keeps non-ASCII whitespace acting as a word
    # separator, matching how the original behaved on short inputs.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lower-case once per token so the stopword test and the lemmatizer see
    # the same form.
    lowered_tokens = (token.lower() for token in word_tokenize(letters_only))
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in stop_words
    ]
    return ' '.join(cleaned_tokens)

In [None]:
# Apply cleaning and sentiment analysis

In [None]:
# Clean every description, store the result, then label the cleaned text
# with the lexicon-based sentiment.
cleaned_descriptions = cnbc_data['Description'].apply(clean_text)
cnbc_data['Cleaned_Description'] = cleaned_descriptions
cnbc_data['Sentiment'] = cleaned_descriptions.apply(analyze_sentiment)

3. Convert the text data into numerical features suitable for machine learning models. You can use techniques like bag-of-words, TF-IDF, or word embeddings. Split the dataset into training and testing sets. ( 5 points )

In [None]:
# Convert text data into numerical features using the 'bag-of-words' approach

In [None]:
# Bag-of-words: keep at most 1000 vocabulary terms, then densify the
# document-term matrix into a plain array.
vectorizer = CountVectorizer(max_features=1000)
document_term_matrix = vectorizer.fit_transform(cnbc_data['Cleaned_Description'])
features = document_term_matrix.toarray()

In [None]:
# Convert sentiment labels to numerical format

In [None]:
# Encode the three sentiment classes as integers.
sentiment_mapping = {
    'Positive': 1,
    'Negative': -1,
    'Neutral': 0,
}
encoded_sentiment = cnbc_data['Sentiment'].map(sentiment_mapping)
labels = encoded_sentiment.values

In [None]:
# Split the dataset

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

4. Choose appropriate machine learning algorithms (e.g., Naive Bayes, Support Vector Machines, or Neural Networks) for sentiment analysis. Train the model using the training data and evaluate its performance using appropriate evaluation metrics (accuracy, precision, recall, F1-score). ( 5 points )

In [None]:
# Using 'Naive Bayes' to train the machine learning model

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split
# (fit returns the estimator itself, so construction and training chain).
model = MultinomialNB().fit(X_train, y_train)

In [None]:
# Evaluate the model

In [None]:
# Predict on the held-out split and score the predictions.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted averaging returns (precision, recall, f-score, support); the
# support entry is not used.
weighted_scores = precision_recall_fscore_support(
    y_test,
    y_pred,
    average='weighted',
)
precision, recall, f1_score = weighted_scores[:3]

In [None]:
# Print the vocabulary constructed by the bag-of-words model

In [None]:
# Heading followed by the vectorizer's term -> column-index mapping.
print(
    "\n**Vocabulary of the bag-of-words model**",
    vectorizer.vocabulary_,
    sep="\n",
)


In [None]:
# Print the bag-of-words feature matrix (one row of token counts per description)

In [None]:
# Heading followed by the dense feature array produced by the vectorizer.
print(
    "\n**Index positions of vocabulary**",
    features,
    sep="\n",
)


In [None]:
# Perform exploratory data analysis on the sentiment distribution

In [None]:
# Tally how many descriptions received each sentiment label, then emit the
# section heading (the counts themselves are printed in the next cell).
sentiment_counts = cnbc_data['Sentiment'].value_counts()
print("\n**Sentiment Count**")

In [None]:
# Show the label distribution, then the evaluation metrics one per line.
print(sentiment_counts)
print("\n**Performance**")
metric_report = (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1_score),
)
for metric_name, metric_value in metric_report:
    print(f'{metric_name}: {metric_value}')