In [12]:
# Cell 1: Install and Import Libraries
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.metrics.distance import edit_distance
from collections import Counter
from autocorrect import Speller
import re
import os
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Cell 2: Download NLTK Datasets (Only if Not Already Downloaded)
import nltk

# Check if NLTK datasets are already downloaded
nltk_resources = ["stopwords", "punkt", "wordnet", "averaged_perceptron_tagger"]

for resource in nltk_resources:
    try:
        # Check if the resource is already downloaded
        nltk.data.find(f"corpora/{resource}")
    except LookupError:
        # If not, download it quietly
        nltk.download(resource, quiet=True)

lemmatizer = WordNetLemmatizer()
Spell = Speller()
stemmer = PorterStemmer()
folder_path = "/Users/reroreo1/Desktop/p00_tweets/"
files = ['processedPositive.csv', 'processedNeutral.csv', 'processedNegative.csv']
stop_words = set(stopwords.words("english"))

# Cell 3: Define Preprocessing Functions
def clean_text(text):
    text = re.sub(r"\d+", "", text.lower()).replace("'", "").replace('"', "")
    text = re.sub(r",(?=\S)", "\n", text)
    return [line.strip() for line in text.split("\n") if len(line.split()) >= 2]

def normalize_repeated_letters(text):
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return Spell(text)

negations = {"not", "no", "nor", "never"}

def get_most_frequent_words(df, sentiment, n=10):
    """
    Get the most frequent words in tweets of a specific sentiment.

    Args:
        df (DataFrame): The DataFrame containing 'text' and 'sentiment'.
        sentiment (str): The sentiment category (e.g., 'positive').
        n (int): The number of top words to return.

    Returns:
        List of tuples: Top `n` words and their frequencies.
    """
    word_counter = Counter()
    # Filter tweets by sentiment and update word frequencies
    df[df['sentiment'] == sentiment]['lem_word_count'].apply(lambda x: word_counter.update(x))
    return word_counter.most_common(n)

def remove_stop_words(words):
    return [word for word in words if word not in stop_words or word in negations]

def stemmatization(words):
    return [stemmer.stem(word) for word in words]

def sentiment_to_numbers(text):
    if text == "positive":
        return 4
    elif text == "negative":
        return 0
    else:
        return 2

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_pos(words):
    tags = nltk.pos_tag(words)
    return [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tags]

def lemmatize_pos_for_word_count(words):
    tags = nltk.pos_tag(words)
    lem_list = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tags]
    return lem_list

# Cell 4: Process All Files
tweets = {}
for file in files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r') as f:
        raw_text = f.read()
    text = clean_text(raw_text)
    processed_text = list(set(text))
    key = file.replace("processed", "").replace(".csv", "").lower()
    tweets[key] = processed_text

# Cell 5: Prepare DataFrame
data = []
for sentiment, tweet_list in tweets.items():
    for tweet in tweet_list:
        data.append((tweet, sentiment))

df = pd.DataFrame(data, columns=["text", "sentiment"])
df["text"] = df["text"].apply(normalize_repeated_letters)
df["text"] = df["text"].apply(word_tokenize)
df["text"] = df["text"].apply(remove_stop_words)
df['lemmatized_text'] = df["text"].apply(lemmatize_pos)
print("df['lemmatized_text'] ====== " , df['lemmatized_text'])

# Cell 6: Analyze Most Frequent Words
df['lem_word_count'] = df["text"].apply(lemmatize_pos_for_word_count)

positive_words = get_most_frequent_words(df, 'positive', 10)
negative_words = get_most_frequent_words(df, 'negative', 10)
neutral_words = get_most_frequent_words(df, 'neutral', 10)

print("Top Positive Words:", positive_words)
print("Top Negative Words:", negative_words)
print("Top Neutral Words:", neutral_words)

# Cell 7: Train and Evaluate Logistic Regression Model
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
df['lem_text_str'] = df['lemmatized_text'].apply(lambda x: ' '.join(x))
tfidf_features = tfidf_vectorizer.fit_transform(df['lem_text_str'])
y = df['sentiment'].apply(sentiment_to_numbers)

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, y, test_size=0.2, random_state=42)
logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100} %")

# Cell 8: Test All Preprocessing and Feature Preparation Combinations
def preprocess_text(text, technique):
    text = text.apply(normalize_repeated_letters)
    text = text.apply(word_tokenize)
    text = text.apply(remove_stop_words)
    
    if technique == "tokenization":
        return text.apply(lambda x: ' '.join(x))
    elif technique == "stemming":
        return text.apply(lambda x: ' '.join(stemmatization(x)))
    elif technique == "lemmatization":
        return text.apply(lambda x: ' '.join(lemmatize_pos(x)))
    elif technique == "stemming+lemmatization":
        stemmed = text.apply(lambda x: stemmatization(x))
        return stemmed.apply(lambda x: ' '.join(lemmatize_pos(x)))
    elif technique == "misspellings":
        return text.apply(lambda x: ' '.join([Spell(word) for word in x]))
    elif technique == "lemmatization+misspellings":
        lemmatized = text.apply(lambda x: lemmatize_pos(x))
        return lemmatized.apply(lambda x: ' '.join([Spell(word) for word in x]))
    else:
        return text.apply(lambda x: ' '.join(x))

def prepare_features(text, method):
    if method == "binary":
        vectorizer = CountVectorizer(binary=True, max_features=5000)
    elif method == "word_counts":
        vectorizer = CountVectorizer(max_features=5000)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(max_features=5000)
    return vectorizer.fit_transform(text)

def evaluate_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def test_combinations(df):
    results = []
    techniques = ["tokenization", "stemming", "lemmatization", "stemming+lemmatization", "misspellings", "lemmatization+misspellings"]
    methods = ["binary", "word_counts", "tfidf"]
    
    for technique in techniques:
        for method in methods:
            print(f"Testing: {technique} + {method}")
            processed_text = preprocess_text(df["text"], technique)
            X = prepare_features(processed_text, method)
            y = df['sentiment'].apply(sentiment_to_numbers)
            accuracy = evaluate_model(X, y)
            results.append((technique, method, accuracy))
            print(f"Accuracy: {accuracy:.4f}")
    
    return results

# Run the Test
results = test_combinations(df)

# Display Results
results_df = pd.DataFrame(results, columns=["Preprocessing Technique", "Feature Preparation", "Accuracy"])
print(results_df)

1         [lunchtime, flinthook, game, much, personality]
2                                     [hype, real, happy]
3       [knight, day, jewellery, end, sundaywhy, not, ...
4                             [thats, law, nature, happy]
                              ...                        
2640                                       [whaddup, cry]
2641    [want, give, everything, make, happy, see, smi...
2642    [dont, want, tell, fellow, comm, student, dont...
2643                                [year, haha, unhappy]
2644    [happy, birthday, katrina, miss, though, unhap...
Name: lemmatized_text, Length: 2645, dtype: object
Top Positive Words: [('happy', 573), ('thanks', 95), ('want', 65), ('get', 57), ('great', 56), ('follow', 55), ('much', 54), ('smile', 54), ('love', 52), ('good', 46)]
Top Negative Words: [('unhappy', 688), ('get', 71), ('sad', 69), ('miss', 66), ('not', 63), ('want', 60), ('im', 55), ('dont', 53), ('like', 53), ('go', 53)]
Top Neutral Words: [('also', 78), ('epaper'

TypeError: expected string or bytes-like object, got 'list'