In [48]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.metrics.distance  import edit_distance 
from collections import Counter
import re
import os
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

In [49]:
nltk.download("stopwords")
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
folder_path = "/Users/reroreo1/Desktop/p00_tweets/"
files = ['processedPositive.csv', 'processedNeutral.csv', 'processedNegative.csv']
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/reroreo1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [50]:
def clean_text(text):
    # Convert to lowercase, remove numbers, quotes, and strip leading/trailing spaces
    text = re.sub(r"\d+", "", text.lower()).replace("'", "").replace('"', "")
    # Add newlines where a comma is not followed by a space
    text = re.sub(r",(?=\S)", "\n", text)
    # Split by lines and filter out short lines (less than 2 words)
    return [line.strip() for line in text.split("\n") if len(line.split()) >= 2]
    
def normalize_repeated_letters(text):
    # Replace 3+ repeated letters with a single occurrence
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # e.g., "happyyyy" -> "happi"
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # Remove non-alphabetic characters
    return text
    
negations = {"not", "no", "nor", "never"}

def remove_stop_words(words):
    cleaned_list = [word for word in words if word not in stop_words or word in negations]
    return cleaned_list
    
def count_words(tweet):
    return Counter(tweet)

def stemmatization(words):
    stem_list = [stemmer.stem(word) for word in words]
    return ('').join(stem_list)

def sentiment_to_numbers(text):
    if text == "positive":
        return 4
    elif text == "negative":
        return 0
    else:
        return 2

def get_wordnet_pos(tag):
    if tag.startswith('J'):  # Adjective
        return wordnet.ADJ
    elif tag.startswith('V'):  # Verb
        return wordnet.VERB
    elif tag.startswith('N'):  # Noun
        return wordnet.NOUN
    elif tag.startswith('R'):  # Adverb
        return wordnet.ADV
    else:
        return wordnet.NOUN
from collections import Counter

# Function to count words for each sentiment category
def get_most_frequent_words(df, sentiment, n=10):
    """
    Get the most frequent words in tweets of a specific sentiment.

    Args:
        df (DataFrame): The DataFrame containing 'text' and 'sentiment'.
        sentiment (str): The sentiment category (e.g., 'positive').
        n (int): The number of top words to return.

    Returns:
        List of tuples: Top `n` words and their frequencies.
    """
    word_counter = Counter()
    # Filter tweets by sentiment and update word frequencies
    df[df['sentiment'] == sentiment]['lem_word_count'].apply(lambda x: word_counter.update(x))
    return word_counter.most_common(n)

# Find the top 10 most frequent words for each sentiment

def lemmatize_pos(words):
    tags = nltk.pos_tag(words)
    lem_list = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tags]
    return (' ').join(lem_list)

def lemmatize_pos_for_word_count(words):
    tags = nltk.pos_tag(words)
    lem_list = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tags]
    return lem_list

In [51]:
# Process all files
tweets = {}
for file in files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r') as f:
        raw_text = f.read()
    # Clean the text and remove duplicates
    text = clean_text(raw_text)
    processed_text = list(set(text))
    # Save processed tweets in the dictionary
    key = file.replace("processed", "").replace(".csv", "").lower()
    tweets[key] = processed_text

In [52]:
    

# Output: A dictionary with cleaned and filtered tweets
data = []
for sentiment, tweet_list in tweets.items():
    for tweet in tweet_list:
        data.append((tweet, sentiment))

# Create a DataFrame
df = pd.DataFrame(data, columns=["text", "sentiment"])
df
df["text"] = df["text"].apply(normalize_repeated_letters)
# use nltk word tokenizer to split the cleaned text to tokens 
df["text"] = df["text"].apply(word_tokenize)
# #remove stop words to avoid noise
df["text"] = df["text"].apply(remove_stop_words)
#apply stemmatization on the list of words
global_word_count = Counter()


df["text"].apply(lambda x: global_word_count.update(x))

# print(global_word_count)

df['lemmatized_text'] = df["text"].apply(lemmatize_pos)

In [53]:
df['lem_word_count'] = df["text"].apply(lemmatize_pos_for_word_count)

positive_words = get_most_frequent_words(df, 'positive', 10)
negative_words = get_most_frequent_words(df, 'negative', 10)
neutral_words = get_most_frequent_words(df, 'neutral', 10)

# Display the results
print("Top Positive Words:", positive_words)
print("Top Negative Words:", negative_words)
print("Top Neutral Words:", neutral_words)

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['lemmatized_text'])  # Join tokens if tokenized

# Get the target variable (e.g., sentiment)
y = df['sentiment'] 

Top Positive Words: [('happy', 571), ('thanks', 94), ('want', 65), ('smile', 56), ('get', 56), ('great', 55), ('much', 54), ('follow', 53), ('love', 52), ('good', 45)]
Top Negative Words: [('unhappy', 682), ('get', 71), ('sad', 70), ('miss', 66), ('not', 62), ('want', 56), ('im', 55), ('dont', 53), ('like', 53), ('go', 51)]
Top Neutral Words: [('also', 78), ('epaper', 75), ('court', 65), ('say', 64), ('india', 54), ('govt', 46), ('minister', 45), ('not', 40), ('supreme', 39), ('take', 37)]


In [54]:
# df["word_count"] = df["text"].apply(count_words)

# print(df["word_count"])

tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features

# Transform the text into TF-IDF features
# tfidf_features = tfidf_vectorizer.fit_transform(df['lemmatized_text'])
tfidf_features = tfidf_vectorizer.fit_transform(df['lemmatized_text'])

# X = tfidf_features  # TF-IDF features
# y = df['sentiment']  # Sentiment labels
# print(df.shape)
# print(df)
# # Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print("Training Data Shape:", X_train.shape)
# print("Testing Data Shape:", X_test.shape)

logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Compare the true labels (y_test) with the predicted labels (y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100} %") 
# df

Classification Report:
              precision    recall  f1-score   support

    negative       0.99      0.89      0.94       180
     neutral       0.88      0.98      0.92       188
    positive       0.93      0.90      0.91       161

    accuracy                           0.93       529
   macro avg       0.93      0.92      0.93       529
weighted avg       0.93      0.93      0.93       529

Confusion Matrix:
[[161  11   8]
 [  1 184   3]
 [  1  15 145]]
Accuracy: 92.62759924385632 %


In [None]:

# Convert lemmatized_text back to a string for vectorizers
df['processed_text'] = df['lemmatized_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
    

In [None]:

# Generate TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text'])
y = df['sentiment'].apply(sentiment_to_numbers)
    

In [None]:

# Generate CountVectorizer features
count_vectorizer = CountVectorizer(max_features=5000)  # Limit to top 5000 features for efficiency
X_count = count_vectorizer.fit_transform(df['processed_text'])
    

In [None]:

# Split data into training and testing sets
from sklearn.model_selection import train_test_split

X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train_count, X_test_count, _, _ = train_test_split(X_count, y, test_size=0.2, random_state=42)
    

In [None]:

# Train Logistic Regression with TF-IDF features
tfidf_model = LogisticRegression(max_iter=1000, random_state=42)
tfidf_model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)
print("TF-IDF Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Classification Report:\n", classification_report(y_test, y_pred_tfidf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tfidf))
    

In [None]:

# Train Logistic Regression with CountVectorizer features
count_model = LogisticRegression(max_iter=1000, random_state=42)
count_model.fit(X_train_count, y_train)

# Evaluate the model
y_pred_count = count_model.predict(X_test_count)
print("CountVectorizer Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_count))
print("Classification Report:\n", classification_report(y_test, y_pred_count))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_count))
    