In [20]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score, f1_score
import re
import emoji

# Build the English stop-word set used by remove_stopwords().
# The corpus is looked up locally first and downloaded only when that lookup
# fails, so repeated runs need no network round-trip and print no download log.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rewrite rules are applied one after another with ``re.sub``, in the
    order listed below. The two irregular forms come first so that the
    generic ``n't`` rule does not turn "won't" into "wo not". All patterns
    are lowercase and matching is case-sensitive.

    Args:
        phrase: Text that may contain apostrophe contractions.

    Returns:
        The text with every matched contraction replaced by its expansion.
    """
    rewrite_rules = (
        # Irregular forms that the generic suffix rules would mangle.
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # Generic suffix rules; every "'s" is treated as "is".
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rewrite_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def remove_html(text):
    """Delete tag-like spans from ``text``.

    A span is the shortest run from a ``<`` to the next ``>``. Because ``.``
    does not match a newline here, a span never crosses a line break.

    Args:
        text: Input string.

    Returns:
        ``text`` with every such span removed (nothing is inserted in its place).
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub("", text)

def remove_punctuation(text):
    """Replace each punctuation character in ``text`` with a single space.

    The apostrophe is not part of the punctuation set, so it is left in place.

    Args:
        text: Input string.

    Returns:
        A string of the same length with every listed character turned into
        a space.
    """
    # Raw literal: the backslash before "]" is a real character in the set,
    # not an (invalid) escape sequence.
    punctuation = r"""!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    return text.translate(to_space)

def remove_numbers(text):
    """Delete every run of digits from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with all digit runs removed; surrounding characters are kept
        as they are.
    """
    # Raw string so that "\d" reaches the regex engine as written instead of
    # being an invalid string escape.
    return re.sub(r"\d+", "", text)

def remove_emoji(text):
    """Remove emoji from ``text``.

    The text is first passed through ``emoji.demojize``; every ``:name:``
    token made only of ASCII letters and underscores is then deleted.

    Args:
        text: Input string.

    Returns:
        The demojized text with all matching ``:name:`` tokens removed.
    """
    # NOTE(review): a token containing digits or hyphens would not match this
    # pattern and would stay in the output -- confirm against the names that
    # emoji.demojize can produce.
    shortcode_pattern = re.compile(r'(:[a-zA-Z_]+:)')
    demojized = emoji.demojize(text)
    return shortcode_pattern.sub('', demojized)

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    Args:
        text: Input string; tokens are compared to ``stop_words`` exactly as
            they appear (no case folding is done here).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def preprocess_text(text):
    """Run the full cleaning pipeline on ``text``.

    The text is lowercased and then passed through each cleaning step in the
    order listed below. Lowercasing comes first because the contraction
    patterns are lowercase.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string produced by the last step.
    """
    pipeline = (
        decontracted,
        remove_html,
        remove_punctuation,
        remove_numbers,
        remove_emoji,
        remove_stopwords,
    )
    cleaned = text.lower()
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

def calculate_accuracy(preprocessed_text, ground_truth):
    """Score preprocessed text against a reference string at word level.

    Both strings are split on whitespace.

    * ``accuracy`` is the percentage of reference words (counting repeats)
      that also occur in the preprocessed text. A repeated word is credited
      at most as many times as it occurs in the reference, so the value
      stays within 0-100.
    * ``precision``, ``recall`` and ``f1`` compare the two sets of distinct
      words: precision is the share of preprocessed words present in the
      reference, recall is the share of reference words present in the
      preprocessed text, and f1 is their harmonic mean.

    Any metric whose denominator would be zero (for example an empty
    reference) is reported as ``0.0`` instead of raising.

    Args:
        preprocessed_text: Output of the cleaning pipeline.
        ground_truth: Reference text to compare against.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` of floats; ``accuracy``
        is a percentage, the other three are in the range 0-1.
    """
    from collections import Counter  # local import keeps this block self-contained

    preprocessed_words = preprocessed_text.split()
    ground_truth_words = ground_truth.split()

    # Multiset intersection: each reference occurrence can be matched once.
    overlap = Counter(preprocessed_words) & Counter(ground_truth_words)
    matches = sum(overlap.values())
    if ground_truth_words:
        accuracy = matches / len(ground_truth_words) * 100
    else:
        accuracy = 0.0

    # Distinct-word comparison; set lookups avoid repeated list scans.
    predicted_vocab = set(preprocessed_words)
    expected_vocab = set(ground_truth_words)
    true_positives = len(predicted_vocab & expected_vocab)

    precision = true_positives / len(predicted_vocab) if predicted_vocab else 0.0
    recall = true_positives / len(expected_vocab) if expected_vocab else 0.0

    # 2 * TP / (|predicted| + |expected|) equals the harmonic mean of
    # precision and recall and stays defined when both are zero.
    vocab_total = len(predicted_vocab) + len(expected_vocab)
    f1 = 2 * true_positives / vocab_total if vocab_total else 0.0

    return accuracy, precision, recall, f1

# Example usage
# `text` is a raw sample containing a tag-like span, punctuation, a
# contraction and stop words. `ground_truth` is a hand-written reference
# that still contains the stop words and the words from inside the angle
# brackets, both of which preprocess_text() removes -- so recall against it
# is expected to be well below 1.
text = "This text has <html tags>, punctuation?! and emojis. I've removed all of them."
ground_truth = "this text has html tags punctuation and emojis i have removed all of them"

# Run the full cleaning pipeline on the sample and show the result.
cleaned_text = preprocess_text(text)
print(f"Cleaned Text: {cleaned_text}")

# Compare the cleaned text with the reference; accuracy is a percentage,
# the other three metrics are fractions.
accuracy, precision, recall, f1 = calculate_accuracy(cleaned_text, ground_truth)
print(f"Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Assuming df is your DataFrame and 'comment' is the column you want to clean
# df['cleaned_text'] = df['comment'].apply(preprocess_text)
# df.head()


Cleaned Text: text punctuation emojis removed
Accuracy: 28.57%
Precision: 1.00
Recall: 0.29
F1 Score: 0.44


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hetvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
