In [23]:
import pandas as pd
from nltk.stem import PorterStemmer
import re
import os
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

nltk.download("stopwords")
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
folder_path = "/Users/reroreo1/Desktop/p00_tweets/"
files = ['processedPositive.csv', 'processedNeutral.csv', 'processedNegative.csv']



stop_words = set(stopwords.words("english"))


def clean_text(text):
    # Convert to lowercase, remove numbers, quotes, and strip leading/trailing spaces
    text = re.sub(r"\d+", "", text.lower()).replace("'", "").replace('"', "")
    # Add newlines where a comma is not followed by a space
    text = re.sub(r",(?=\S)", "\n", text)
    # Split by lines and filter out short lines (less than 2 words)
    return [line.strip() for line in text.split("\n") if len(line.split()) >= 2]
    
def normalize_repeated_letters(text):
    # Replace 3+ repeated letters with a single occurrence
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # e.g., "happyyyy" -> "happi"
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # Remove non-alphabetic characters
    return text


negations = {"not", "no", "nor", "never"}

def remove_stop_words(words):
    cleaned_list = [word for word in words if word not in stop_words or word in negations]
    return cleaned_list

def stemmatization(words):
    stem_list = [stemmer.stem(word) for word in words]
    return ('').join(stem_list)

def sentiment_to_numbers(text):
    if text == "positive":
        return 4
    elif text == "negative":
        return 0
    else:
        return 2

def get_wordnet_pos(tag):
    if tag.startswith('J'):  # Adjective
        return wordnet.ADJ
    elif tag.startswith('V'):  # Verb
        return wordnet.VERB
    elif tag.startswith('N'):  # Noun
        return wordnet.NOUN
    elif tag.startswith('R'):  # Adverb
        return wordnet.ADV
    else:
        return wordnet.NOUN



def lemmatize_pos(words):
    tags = nltk.pos_tag(words)
    lem_list = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tags]
    return (' ').join(lem_list)

# Process all files
tweets = {}
for file in files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r') as f:
        raw_text = f.read()
    # Clean the text and remove duplicates
    text = clean_text(raw_text)
    processed_text = list(set(text))
    # Save processed tweets in the dictionary
    key = file.replace("processed", "").replace(".csv", "").lower()
    tweets[key] = processed_text
    
# tweets

# Output: A dictionary with cleaned and filtered tweets
data = []
for sentiment, tweet_list in tweets.items():
    for tweet in tweet_list:
        data.append((tweet, sentiment))

# Create a DataFrame
df = pd.DataFrame(data, columns=["text", "sentiment"])
df["text"] = df["text"].apply(normalize_repeated_letters)
# use nltk word tokenizer to split the cleaned text to tokens 
df["text"] = df["text"].apply(word_tokenize)
# #remove stop words to avoid noise
df["text"] = df["text"].apply(remove_stop_words)
#apply stemmatization on the list of words

df['lemmatized_text'] = df["text"].apply(lemmatize_pos)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features

# Transform the text into TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(df['lemmatized_text'])

X = tfidf_features  # TF-IDF features
y = df['sentiment']  # Sentiment labels
print(df.shape)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)

logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Compare the true labels (y_test) with the predicted labels (y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}") 
# df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


(2645, 3)
Training Data Shape: (2116, 5000)
Testing Data Shape: (529, 5000)
Classification Report:
              precision    recall  f1-score   support

    negative       0.99      0.89      0.94       180
     neutral       0.84      0.98      0.90       188
    positive       0.95      0.86      0.90       161

    accuracy                           0.91       529
   macro avg       0.93      0.91      0.92       529
weighted avg       0.92      0.91      0.92       529

Confusion Matrix:
[[161  13   6]
 [  1 185   2]
 [  0  23 138]]
Accuracy: 0.91
