In [104]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
from nltk.stem import PorterStemmer
from collections import Counter
import numpy as np
from autocorrect import Speller
import re
import os
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.preprocessing import StandardScaler
from scipy.sparse import issparse

nltk.download("stopwords")
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
pd
lemmatizer = WordNetLemmatizer()
Spell = Speller()
stemmer = PorterStemmer()
folder_path = "./p00_tweets/"
files = ['processedPositive.csv', 'processedNeutral.csv', 'processedNegative.csv']
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rezzahra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rezzahra/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rezzahra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/rezzahra/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [105]:
def clean_text(text):
    # Convert to lowercase, remove numbers, quotes, and strip leading/trailing spaces
    text = re.sub(r"\d+", "", text.lower()).replace("'", "").replace('"', "")
    # Add newlines where a comma is not followed by a space
    text = re.sub(r",(?=\S)", "\n", text)
    # Split by lines and filter out short lines (less than 2 words)
    return [line.strip() for line in text.split("\n") if len(line.split()) >= 2]
    
def normalize_repeated_letters_and_remove_non_alpha_char(text):
    # Replace 3+ repeated letters with a single occurrence
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # e.g., "happyyyy" -> "happy"
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # Remove non-alphabetic characters
    return text
    
negations = {"not", "no", "nor", "never"}

def remove_stop_words(words):
    cleaned_list = [word for word in words if word not in stop_words or word in negations]
    return cleaned_list
    
def count_words(tweet):
    return Counter(tweet)

def stemmatization(words):
    stem_list = [stemmer.stem(word) for word in words]
    return stem_list

def sentiment_to_numbers(text):
    if text == "positive":
        return 4
    elif text == "negative":
        return 0
    else:
        return 2

def get_wordnet_pos(tag):
    if tag.startswith('J'):  # Adjective
        return wordnet.ADJ
    elif tag.startswith('V'):  # Verb
        return wordnet.VERB
    elif tag.startswith('N'):  # Noun
        return wordnet.NOUN
    elif tag.startswith('R'):  # Adverb
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Function to count words for each sentiment category
# def get_most_frequent_words(df, sentiment, n=10):
#     """
#     Get the most frequent words in tweets of a specific sentiment.

#     Args:
#         df (DataFrame): The DataFrame containing 'text' and 'sentiment'.
#         sentiment (str): The sentiment category (e.g., 'positive').
#         n (int): The number of top words to return.

#     Returns:
#         List of tuples: Top `n` words and their frequencies.
#     """
#     word_counter = Counter()
#     # Filter tweets by sentiment and update word frequencies
#     df[df['sentiment'] == sentiment]['lem_word_count'].apply(lambda x: word_counter.update(x))
#     return word_counter.most_common(n)

# Find the top 10 most frequent words for each sentiment

def lemmatize_pos(words):
    # words = words.apply(word_tokenize)
    tags = nltk.pos_tag(words)
    lem_list = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tags]
    return lem_list

In [106]:
tweets = {}
for file in files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r') as f:
        raw_text = f.read()
    # Clean the text and remove duplicates
    text = clean_text(raw_text)
    processed_text = list(set(text))
    # Save processed tweets in the dictionary
    key = file.replace("processed", "").replace(".csv", "").lower()
    tweets[key] = processed_text

    

# Output: A dictionary with cleaned and filtered tweets
data = []
for sentiment, tweet_list in tweets.items():
    for tweet in tweet_list:
        data.append((tweet, sentiment))

# Create a DataFrame
df = pd.DataFrame(data, columns=["text", "sentiment"])

# df['lemmatized_text'] = df["text"].apply(lemmatize_pos)

# df['lem_word_count'] = df["text"].apply(lemmatize_pos_for_word_count)

# positive_words = get_most_frequent_words(df, 'positive', 10)
# negative_words = get_most_frequent_words(df, 'negative', 10)
# neutral_words = get_most_frequent_words(df, 'neutral', 10)

# # Display the results
# print("Top Positive Words:", positive_words)
# print("Top Negative Words:", negative_words)
# print("Top Neutral Words:", neutral_words)

In [107]:
def preprocess_text(text, technique):
    if "custom preprocessing" in technique:
        text = text.apply(normalize_repeated_letters)
    text = text.apply(word_tokenize)
    if "custom preprocessing" in technique:
        text = text.apply(remove_stop_words)
    # print(f"preprocessText: {text}")
    if technique == "tokenization":
        return text.apply(lambda x: ' '.join(x))
    elif technique == "stemming":
        return text.apply(lambda x: ' '.join(stemmatization(x)))
    elif technique == "lemmatization":
        return text.apply(lambda x: ' '.join(lemmatize_pos(x)))
    elif technique == "stemming + misspellings" or technique == "stemming + misspellings + custom preprocessing":
        stemmed = text.apply(lambda x: stemmatization(x))
        return text.apply(lambda x: ' '.join([Spell(word) for word in x]))
    elif technique == "lemmatization + misspellings" or technique == "lemmatization + misspellings + custom preprocessing":
        lemmatized = text.apply(lambda x: lemmatize_pos(x))
        return lemmatized.apply(lambda x: ' '.join([Spell(word) for word in x]))
    else:
        return text.apply(lambda x: ' '.join(x))

def prepare_features(text, method):
    if method == "binary":
        vectorizer = CountVectorizer(binary=True, max_features=5000)
    elif method == "word_counts":
        vectorizer = CountVectorizer(max_features=5000)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(max_features=5000)
    return vectorizer.fit_transform(text)

# def evaluate_model(X, y):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#     model = LogisticRegression(max_iter=5000, random_state=42)
#     model.fit(X_train, y_train)
#     param_grid = [
#     {'penalty':['l1','l2','elasticnet','none'],
#     'C' : np.logspace(-4,4,20),
#     'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
#     'max_iter'  : [100,1000,2500,5000]
#     }
#     ]
#     clf = GridSearchCV(model,param_grid = param_grid, cv = 10, verbose=True,n_jobs=-1)
#     clf
#     best_clf = clf.fit(X,y)
#     best_clf.best_estimator_
#     print(f'Accuracy - : {best_clf.score(x,y):.3f}')
#     y_pred = model.predict(X_test)
    
#     return accuracy_score(y_test, y_pred)
# #     param_grid = {
# #     'C': [0.1, 1, 10],
# #     'kernel': ['linear', 'rbf'],
# #     'gamma': [0.001, 0.01, 0.1]
# #     }

# # # Initialize GridSearchCV
# #     grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# # # Train the model
# # #     grid_search.fit(X_train, y_train)

# # # # Print the best parameters
# # #     print("Best Hyperparameters:", grid_search.best_params_)

# # # # Evaluate the model with the best parameters
# # #     best_model = grid_search.best_estimator_
# # #     accuracy = best_model.score(X_test, y_test)
# # #     return accuracy    

def evaluate_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=1000,random_state=42)  # saga is better for sparse data
    grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
    logreg=LogisticRegression()
    logreg_cv=GridSearchCV(logreg,grid,cv=10)
    logreg_cv.fit(X_train,y_train)
    print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
    print("accuracy :",logreg_cv.best_score_)
    return accuracy_score(y_test, y_pred)

    
def test_combinations(df):
    results = []
    techniques = ["tokenization", "stemming", "lemmatization", "stemming + misspellings", "lemmatization + misspellings","lemmatization + misspellings + custom preprocessing","stemming + misspellings + custom preprocessing"]
                  
    methods = ["binary", "word_counts", "tfidf"]
    
    for technique in techniques:
        for method in methods:
            print(f"Testing: {technique} + {method}")
            processed_text = preprocess_text(df["text"], technique)
            X = prepare_features(processed_text, method)
            y = df['sentiment'].apply(sentiment_to_numbers)
            accuracy = evaluate_model(X, y)
            results.append((technique, method, accuracy))
            print(f"Accuracy: {accuracy:.4f}")
    
    return results



In [108]:
# Run the Test

results = test_combinations(df)

# Display Results
results_df = pd.DataFrame(results, columns=["Preprocessing Technique", "Feature Preparation", "Accuracy"])
print(results_df)





Testing: tokenization + binary


NameError: name 'x_train' is not defined