# Capstone Project: Supervised Modeling

## Data ingestion

In [None]:
# Importing necessary libraries
# Standard library
import os
import re
import warnings
from collections import Counter
from collections import defaultdict
from html import unescape

# Third-party: data handling, plotting, NLP and modeling
import community as community_louvain
import emoji
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from emoji import demojize
from gensim.models import Word2Vec, FastText
from IPython.display import display, Markdown
from joblib import dump
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from pyvis.network import Network
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Alias inferred from the `precision, recall, f1, _ = score(...)` calls below — confirm
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from wordcloud import WordCloud

# NOTE(review): SMOTE, make_pipeline_imblearn, XGBClassifier, SentenceTransformer
# and hub are used further down but are not imported in this notebook.
# Presumably they come from imblearn, xgboost, sentence_transformers and
# tensorflow_hub — add those imports once confirmed.

# NLTK resources used by the tokenizer, lemmatizer and stopword list
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


# Silence only UndefinedMetricWarning; every other warning category is still shown
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [None]:
# Read the raw sample and give every row an 'Update' flag initialised to 0
df_original = pd.read_csv('sample_duo_data.csv').assign(Update=0)

In [None]:
# Work on an independent copy: the cells below add columns to `df` and sort it
# in place, which would otherwise silently modify `df_original` as well.
df = df_original.copy()

## Data Exploration

In [None]:
# (number of rows, number of columns) of the working DataFrame
df.shape

In [None]:
# Print, column by column, how many distinct values occur (NaN counts as a value)
column_names = df.columns
for column in column_names:
    distinct_count = df[column].nunique(dropna=False)
    print('Count of Unique Values for ' + str(column))
    print(distinct_count)
    print('')

In [None]:
# Function to display one text cell as Markdown formatted text
def display_as_paragraph(index, df=df, column_name='Body'):
    """
    Render the text stored at ``df.loc[index, column_name]`` as Markdown.

    Parameters:
    - index: int
        The index label of the row to display (looked up with ``df.loc``).
    - df: pandas.DataFrame, optional
        The DataFrame containing the text data. The default is the global `df`
        object as it was bound when this function was defined.
    - column_name: str, optional
        The name of the column containing the text data. Default is 'Body'.

    Returns:
    None
    """
    body_text = df.loc[index, column_name]
    if not isinstance(body_text, str):
        # A missing cell is read back as NaN, which has no .replace();
        # show it as empty text and stringify any other non-string value.
        body_text = '' if pd.isna(body_text) else str(body_text)
    # Turn HTML <br> tags into blank lines so Markdown renders paragraph breaks
    formatted_text = body_text.replace('<br>', '\n\n')
    display(Markdown(formatted_text))

## Data Partitioning 
In preparation for labelling

In [None]:
# Store the current index as a regular column so each row's original position
# is still available after the sorting and de-duplication below
df['Original Index'] = df.index

In [None]:
# Independent snapshot of the data (including 'Original Index') taken before
# sorting and de-duplication
df_reference = df.copy()

In [None]:
# Sort by 'Body' in place so identical texts are adjacent; the order of the
# distinct 'Body' values (and therefore split_index below) follows this sort
df.sort_values(by='Body', inplace=True)

# Keep the first row of every group of identical 'Body' values and renumber rows from 0
df_clean = df.drop_duplicates(subset=['Body'], keep='first').reset_index(drop=True)
# split_index = row position in the de-duplicated frame; used below as the
# join key against the label file
df_clean['split_index'] = df_clean.index

In [None]:
# Read the label data
df_label = pd.read_csv('Final_Label_Data.csv')
# Inner join: only rows whose split_index equals a 'data_row.global_key' value
# in the label file are kept
# NOTE(review): assumes 'data_row.global_key' has the same (integer) dtype as split_index — confirm
df_deduplicated = pd.merge(df_clean, df_label, left_on='split_index', right_on='data_row.global_key', how='inner')

In [None]:
# Labeling-export bookkeeping columns that are not needed for modeling.
# The project-specific columns all share one prefix, so they are built from it.
_project_prefix = 'projects.clsqaik7009uu07wselyi44w5'
_project_detail_fields = (
    'ontology_id',
    'task_name',
    'batch_id',
    'batch_name',
    'workflow_status',
    'priority',
    'consensus_expected_label_count',
    'workflow_history',
    'selected_label_id',
)

columns_to_remove = [
    'attachments',
    'metadata_fields',
    'data_row.id',
    'data_row.details.dataset_id',
    'data_row.details.dataset_name',
    'data_row.details.created_at',
    'data_row.details.updated_at',
    'data_row.details.last_activity_at',
    'data_row.details.created_by',
    'media_attributes.mime_type',
    f'{_project_prefix}.name',
    f'{_project_prefix}.labels',
    *(f'{_project_prefix}.project_details.{field}' for field in _project_detail_fields),
]

# Drop the columns from df_deduplicated itself (in place)
df_deduplicated.drop(columns=columns_to_remove, inplace=True)

## Data Cleaning

In [None]:
# Pattern matching runs of one or more emoji / pictographic symbols.
# Every range is listed explicitly. The earlier catch-all range
# U+24C2..U+1F251 also covered CJK, Hangul, full-width letters and many other
# ordinary scripts, so plain text was treated as emoji. The list below is a
# subset of what that range matched and is not an exhaustive emoji definition.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002700-\U000027BF"  # Dingbats
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00002934\U00002935"  # curved up / down arrows
    "\U000024C2"  # circled letter M
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U0001F000-\U0001F0FF"  # Mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F251"  # Enclosed alphanumeric / ideographic supplements
    "\U0000FE0F"  # variation selector-16 that may follow an emoji
    "\U000023ED"  # next-track button, seen in this dataset
    "\U000021AA"  # rightwards arrow with hook, seen in this dataset
    "]+", flags=re.UNICODE)

# Function to collect the emojis of a string using the emoji regex
def extract_emojis(s):
    """
    Collect every match of `emoji_pattern` found in a string.

    Args:
        s (str): The input string to scan.

    Returns:
        str: The matches joined by single spaces, in order of appearance.
             An empty string when `s` is not a string.
    """
    if not isinstance(s, str):
        return ''  # non-string input (e.g. a missing value) has no emojis
    found_runs = emoji_pattern.findall(s)
    return ' '.join(found_runs)

# Function to replace emojis by their names, surrounded by spaces
def translate_emojis(text):
    """
    Replace every `emoji_pattern` match in the text by its name in parentheses.

    Args:
        text (str): The text to be translated.

    Returns:
        str: The text where each match is replaced by " (name) ", with
        underscores in the name turned into spaces. Non-string input is
        returned unchanged.
    """
    def _describe(match):
        # demojize turns the matched emoji(s) into their names, space-delimited
        described = demojize(match.group(0), delimiters=(" ", " "))
        return " (" + described.replace("_", " ").strip() + ") "

    if isinstance(text, str):
        return emoji_pattern.sub(_describe, text)
    return text  # leave non-string values (e.g. NaN) untouched

# Decoding emojis into their names
def decode_emojis(text):
    """
    Replace the emojis in the given text by their names, without delimiters.

    Args:
        text (str): The text to decode emojis from.

    Returns:
        str: The result of ``emoji.demojize`` with empty delimiters.
        Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        return emoji.demojize(text, delimiters=("", ""))
    return text  # not a string: hand it back as is

In [None]:
# Function to clean text
def clean_text(text):
    """
    Basic cleaning: strip HTML tags, decode HTML entities, translate emojis
    into words (via `translate_emojis`) and convert to lowercase.

    Parameters:
    text (str): The text to be cleaned.

    Returns:
    str: The cleaned text. Missing values (None or NaN) give an empty string;
    any other non-string value is returned unchanged.
    """
    # pandas represents missing text as a float NaN (NaN != NaN). Returning it
    # unchanged would later become the literal string 'nan' once the column is
    # cast with astype(str), so treat it like None.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        return text

    # Remove HTML tags
    # NOTE(review): tags are deleted, not replaced by whitespace, so words
    # separated only by a tag ('a<br>b') end up joined — confirm this is intended
    text_clean = re.sub('<.*?>', '', text)
    # Decode HTML entities
    text_clean = unescape(text_clean)
    # Translate emojis
    text_clean = translate_emojis(text_clean)
    # Convert to lowercase
    return text_clean.lower()

In [None]:
# Stripping HTML tags and decoding entities
def clean_html_tags(text):
    """
    Remove HTML tags from the text and decode its HTML entities.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The text without tags and with entities such as ``&amp;`` decoded.
        Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        # Non-greedy match so each <...> tag is removed on its own
        without_tags = re.sub('<.*?>', '', text)
        return unescape(without_tags)
    return text

In [None]:
# Language Detection Function
def detect_language(text):
    """
    Detects the language of the given text.

    Parameters:
    text (str): The text to detect the language of.

    Returns:
    str: The language returned by ``detect``, or 'unknown' if detection raises
    an error.
    """
    try:
        return detect(text)
    except Exception:
        # Catch ordinary errors only; a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make a long apply() uninterruptible.
        return 'unknown'

In [None]:
from functools import lru_cache

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, read once and then cached."""
    return frozenset(stopwords.words('english'))


def clean_text_advanced(text, method="lemmatization", remove_special_chars=True, remove_numbers=True, use_custom_stopwords=False, custom_stopwords=frozenset()):
    """
    Clean the given text: optionally strip special characters and numbers,
    lowercase, tokenize, drop stopwords, then stem or lemmatize each token.

    Args:
        text (str): The input text to be cleaned.
        method (str, optional): 'stemming' selects the Porter stemmer; any other
            value selects lemmatization. Defaults to 'lemmatization'.
        remove_special_chars (bool, optional): Whether to remove every character that
            is not an ASCII letter or whitespace. Defaults to True.
        remove_numbers (bool, optional): Whether to remove digit runs. Defaults to True.
        use_custom_stopwords (bool, optional): Whether to add `custom_stopwords` to the
            English stopword list. Defaults to False.
        custom_stopwords (set, optional): Extra stopwords. Defaults to an empty set.

    Returns:
        str: The cleaned tokens joined by single spaces. Non-string input is
        returned unchanged.
    """
    if not isinstance(text, str):
        return text  # Return the original value if it's not a string
    # Optionally remove special characters and numbers
    if remove_special_chars:
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    # Lowercasing
    text = text.lower()

    # Tokenization
    words = word_tokenize(text)

    # Stopwords: the English list is cached so the corpus file is not re-read
    # for every row this function is applied to
    all_stopwords = _english_stopwords()
    if use_custom_stopwords:
        all_stopwords = all_stopwords.union(custom_stopwords)

    # Stemming or (by default) lemmatization
    normalize = stemmer.stem if method == "stemming" else lemmatizer.lemmatize
    return ' '.join(normalize(word) for word in words if word not in all_stopwords)

In [None]:
# Applying functions to DataFrame: collect the emojis of each text column
for source_col in ('Body', 'Title'):
    df_deduplicated[f'{source_col}_Emojis'] = df_deduplicated[source_col].apply(extract_emojis)

In [None]:
# Text columns with emojis replaced by their names
for source_col in ('Body', 'Title'):
    df_deduplicated[f'{source_col}_Noemojis'] = df_deduplicated[source_col].apply(translate_emojis)

In [None]:
# Basic cleaning (tags, entities, emojis, lowercase) of each text column
for source_col in ('Body', 'Title'):
    df_deduplicated[f'{source_col}_Clean_Basics'] = df_deduplicated[source_col].apply(clean_text)

In [None]:
# Text columns with HTML tags removed and entities decoded (case and emojis kept)
for source_col in ('Body', 'Title'):
    df_deduplicated[f'{source_col}_HTML'] = df_deduplicated[source_col].apply(clean_html_tags)

In [None]:
#df_deduplicated['language'] = df_deduplicated['Body_HTML'].apply(detect_language)

In [None]:
# Advanced cleaning of the basic-cleaned columns: lemmatization with the
# default English stopwords, keeping special characters and numbers
for source_col in ('Body', 'Title'):
    df_deduplicated[f'{source_col}_Clean'] = df_deduplicated[f'{source_col}_Clean_Basics'].apply(
        lambda raw_text: clean_text_advanced(
            raw_text,
            method="lemmatization",
            remove_special_chars=False,
            remove_numbers=False,
            use_custom_stopwords=False,
            custom_stopwords=set(),
        )
    )

In [None]:
# Combine each title/body pair into one column, separated by a blank line
for combined_col, title_col, body_col in (
    ('Clean_Text', 'Title_Clean', 'Body_Clean'),
    ('Clean_Text_Basic', 'Title_Clean_Basics', 'Body_Clean_Basics'),
    ('All_Emojis', 'Title_Emojis', 'Body_Emojis'),
):
    df_deduplicated[combined_col] = (
        df_deduplicated[title_col].astype(str) + '\n\n' + df_deduplicated[body_col].astype(str)
    )

## Classifier Modeling (Classic ML)

In [None]:
# Example placeholder for keyword matching (a deliberately conservative list)
keywords = [
    'two girls special',
    'two girl special',
    'duo special',
    '2 girls',
    'double trouble',
    'double the fun',
]

In [None]:
def check_keywords(text):
    """Return 1 if any entry of the global `keywords` list is a substring of `text`, else 0."""
    return int(any(keyword in text for keyword in keywords))

# Keyword-based weak label, computed on the lowercased basic-cleaned text
df_deduplicated['Weak_Label'] = df_deduplicated['Clean_Text_Basic'].apply(check_keywords)
# Show how many rows fall in each weak-label class
df_deduplicated['Weak_Label'].value_counts()

In [None]:
# Positive class (1): labelled 'Yes' for multiple people AND 'No' for massage
# parlor / Asian agency; everything else is 0. Computed as a vectorized boolean
# mask instead of a row-by-row apply().
is_multiple_people = df_deduplicated['Multiple People or Not'] == 'Yes'
is_not_parlor = df_deduplicated['Massage Parlor/Asian Agency or Not'] == 'No'
df_deduplicated['Label'] = (is_multiple_people & is_not_parlor).astype('int64')
# Show the class balance of the final label
df_deduplicated['Label'].value_counts()

In [None]:
# Save the full DataFrame (all derived columns and labels) to a CSV file,
# without writing the index as a column
output_csv_path = 'Fine_Tune_Data.csv'
df_deduplicated.to_csv(output_csv_path, index=False)

In [None]:
# Splitting data into training (70%) and testing (30%) sets with a fixed seed
# NOTE(review): the split is not stratified on the label; with imbalanced classes
# the class ratio may differ between train and test — confirm this is acceptable
X_train, X_test, y_train, y_test = train_test_split(df_deduplicated['Clean_Text'], df_deduplicated['Label'],test_size=0.3, random_state=42)

### Feature Engineering

#### Bag of Words (BoW) Classification

In [None]:
# Vectorizing the text data: the vocabulary is learned on the training texts
# only and then reused to transform the test texts
vectorizer = CountVectorizer() #Bag of Words
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Training a Naive Bayes classifier on the raw term counts
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

# Making predictions and evaluating the model on the held-out test set
y_pred = clf.predict(X_test_counts)
print(classification_report(y_test, y_pred))

#### TF-IDF

In [None]:
# Transforming the count matrices from the Bag-of-Words cell to a normalized
# tf-idf representation (idf weights are fitted on the training counts only)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Re-training the classifier with TF-IDF features
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train)

# Making predictions and evaluating the model
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_tfidf))

#### N-Grams

In [None]:
# Vectorizing the text data with unigrams, bigrams, and trigrams using TfidfVectorizer
# (scikit-learn's built-in English stop-word list is applied here as well)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
X_train_tfidf_gram = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf_gram = tfidf_vectorizer.transform(X_test)

# Training the Naive Bayes classifier with the n-gram TF-IDF features
clf_tfidf_gram = MultinomialNB()
clf_tfidf_gram.fit(X_train_tfidf_gram, y_train)

# Making predictions and evaluating the model
y_pred_tfidf_gram = clf_tfidf_gram.predict(X_test_tfidf_gram)
print(classification_report(y_test, y_pred_tfidf_gram))

#### Saving Models

In [None]:
models_dir = "Models"  # Folder in your repository where you want to save models

# Function to save the model
def save_model(model, model_name, models_directory=models_dir):
    """
    Persist a model with joblib as ``<models_directory>/<model_name>.joblib``.

    Args:
        model: The object to serialize.
        model_name (str): File name without extension.
        models_directory (str, optional): Target folder; created if missing.
            Defaults to the value of `models_dir` when this function was defined.

    Returns:
        None
    """
    # exist_ok=True creates the folder only when needed, without a separate
    # "does it exist?" check that could race with another process
    os.makedirs(models_directory, exist_ok=True)
    dump(model, os.path.join(models_directory, f"{model_name}.joblib"))

In [None]:
# Save models after training
# Only the three classifiers are persisted here; the fitted vectorizer /
# tfidf_transformer objects they depend on are not saved by this cell
save_model(clf, 'naive_bayes_bow')
save_model(clf_tfidf, 'naive_bayes_tfidf')
save_model(clf_tfidf_gram, 'naive_bayes_tfidf_gram')

### Class Imbalance

In [None]:
# Prepare the data
# Vectorization and Model Pipeline using imblearn's pipeline
# NOTE(review): make_pipeline_imblearn and SMOTE are not imported by the notebook's
# import cell — presumably imblearn.pipeline.make_pipeline and
# imblearn.over_sampling.SMOTE; confirm and add the imports
pipeline = make_pipeline_imblearn(
    TfidfVectorizer(ngram_range=(1, 3)),
    SMOTE(random_state=42),
    MultinomialNB()
)

# Fit the model on the raw training texts (vectorization happens inside the pipeline)
pipeline.fit(X_train, y_train)

# Predict on the test set (this overwrites the y_pred of the Bag-of-Words cell)
y_pred = pipeline.predict(X_test)

# If you want to save the pipeline model
save_model(pipeline, 'smote_naive_bayes_tfidf')

# Evaluate the model
print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1']))


### Multiple Models

In [None]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_deduplicated['Clean_Text'], df_deduplicated['Label'], test_size=0.3, random_state=42)

# Define vectorization methods
vectorization_methods = {
    'BoW': CountVectorizer(),
    'TF-IDF': TfidfVectorizer(),
    'N-Grams': TfidfVectorizer(ngram_range=(1, 3))
}

classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "Naive Bayes": MultinomialNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')  # For XGBoost, avoid warnings
}

# One dict per (vectorizer, classifier) run. The DataFrame is built once at the
# end instead of concatenating onto an empty frame inside the loop, which
# copies the whole table every iteration and relies on pandas' deprecated
# handling of empty frames in concat.
balanced_rows = []

# Loop through each vectorization method
for vec_name, vectorizer in vectorization_methods.items():
    # Apply vectorization (fitted on the training texts only)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Balancing the training set using SMOTE; the test set is left untouched
    smote = SMOTE(random_state=42)
    X_train_vec_smote, y_train_smote = smote.fit_resample(X_train_vec, y_train)

    # Loop through classifiers to train and evaluate
    for clf_name, clf in classifiers.items():
        # Train the model using the balanced dataset
        clf.fit(X_train_vec_smote, y_train_smote)

        # Predict on the test set
        y_pred = clf.predict(X_test_vec)

        save_model(clf, f"{clf_name}_{vec_name}")

        # Calculate overall accuracy and metrics for class 1
        overall_accuracy = accuracy_score(y_test, y_pred)

        # Calculate precision, recall, f1-score specifically for class 1
        precision, recall, f1, _ = score(y_test, y_pred, labels=[1], average='binary')

        balanced_rows.append({
            "Feature Engineering": vec_name,
            "Classifier": clf_name,
            "Overall Accuracy": overall_accuracy,
            "Class 1 Precision": precision,
            "Class 1 Recall": recall,
            "Class 1 F1-Score": f1
        })

# Build the results table (same columns and order as before)
results_balanced = pd.DataFrame(balanced_rows, columns=["Feature Engineering", "Classifier", "Overall Accuracy", "Class 1 Precision", "Class 1 Recall", "Class 1 F1-Score"])

# Sort results by Class 1 F1-Score for comparison
results_balanced.sort_values(by="Class 1 F1-Score", ascending=False, inplace=True)


In [None]:
# Show the SMOTE-balanced results, best Class 1 F1-Score first
display(results_balanced)

In [None]:
# One dict per (vectorizer, classifier) run. The DataFrame is built once at the
# end instead of concatenating onto an empty frame inside the loop, which
# copies the whole table every iteration and relies on pandas' deprecated
# handling of empty frames in concat.
unbalanced_rows = []

# Loop through each vectorization method
for vec_name, vectorizer in vectorization_methods.items():
    # Apply vectorization (fitted on the training texts only)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Loop through classifiers to train and evaluate
    for clf_name, clf in classifiers.items():
        # Train the model using the original (unbalanced) dataset
        clf.fit(X_train_vec, y_train)

        # Predict on the test set
        y_pred = clf.predict(X_test_vec)

        # Calculate overall accuracy and metrics for class 1
        overall_accuracy = accuracy_score(y_test, y_pred)

        # Calculate precision, recall, f1-score specifically for class 1
        precision, recall, f1, _ = score(y_test, y_pred, labels=[1], average='binary')

        unbalanced_rows.append({
            "Feature Engineering": vec_name,
            "Classifier": clf_name,
            "Overall Accuracy": overall_accuracy,
            "Class 1 Precision": precision,
            "Class 1 Recall": recall,
            "Class 1 F1-Score": f1
        })

# Build the results table (same columns and order as before)
results = pd.DataFrame(unbalanced_rows, columns=["Feature Engineering", "Classifier", "Overall Accuracy", "Class 1 Precision", "Class 1 Recall", "Class 1 F1-Score"])

# Sort results by Class 1 F1-Score for comparison
results.sort_values(by="Class 1 F1-Score", ascending=False, inplace=True)

In [None]:
# Show the results obtained without SMOTE, best Class 1 F1-Score first
display(results)

#### Embeddings Improvement

In [None]:
# Folder that receives embedding models (created up front if missing)
models_embeddings_dir = 'Models_Embeddings'
os.makedirs(models_embeddings_dir, exist_ok=True)

def save_model_embeddings(model, model_name, method, is_embedding=False):
    """
    Save a model under `models_embeddings_dir`.

    Args:
        model: The object to save.
        model_name (str): File name without extension.
        method (str): Embedding method name; only consulted when `is_embedding` is True.
        is_embedding (bool, optional): If False, the object is written with joblib
            as '<name>.joblib'. If True, 'Word2Vec' / 'FastText' / 'GloVe' models are
            written with their own ``save`` method as '<name>.model', 'BERT' / 'USE'
            only print a message, and any other method does nothing.

    Returns:
        None
    """
    filename = os.path.join(models_embeddings_dir, model_name)

    if not is_embedding:
        dump(model, f'{filename}.joblib')
        return

    if method in ('Word2Vec', 'FastText', 'GloVe'):
        model.save(f'{filename}.model')
    elif method in ('BERT', 'USE'):
        # These models are not written to disk here; only a hint is printed
        print(f"Model type '{method}' cannot be directly saved with joblib. Consider manually saving model components.")

In [None]:
# Embedding methods understood by generate_embeddings
_SUPPORTED_EMBEDDINGS = ('Word2Vec', 'FastText', 'GloVe', 'BERT', 'USE')
# Dimensionality of the word-vector models (also matches the 100d GloVe file)
_WORD_VECTOR_DIM = 100


def _mean_pool(texts, vectors, dim=_WORD_VECTOR_DIM):
    """Average the vectors of the whitespace tokens of each text; a zero vector if none is known."""
    pooled = []
    for text in texts:
        known = [vectors[word] for word in text.split() if word in vectors]
        pooled.append(np.mean(known or [np.zeros(dim)], axis=0))
    return np.array(pooled)


# Define a function to generate embeddings
def generate_embeddings(method, texts):
    """
    Compute one embedding vector per text with the chosen method.

    Word2Vec and FastText train a new model on `texts` itself; GloVe, BERT and
    USE load pretrained models. The model is then passed to
    `save_model_embeddings`.

    Args:
        method (str): One of 'Word2Vec', 'FastText', 'GloVe', 'BERT', 'USE'.
        texts (list of str): The texts to embed.

    Returns:
        numpy.ndarray: Array with one row per text.

    Raises:
        ValueError: If `method` is not a supported method name (previously this
            failed later with an unbound-variable error).
    """
    if method not in _SUPPORTED_EMBEDDINGS:
        raise ValueError(f"Unknown embedding method {method!r}; expected one of {_SUPPORTED_EMBEDDINGS}")

    if method == 'Word2Vec':
        model = Word2Vec(sentences=[text.split() for text in texts], vector_size=_WORD_VECTOR_DIM, window=5, min_count=1, workers=4)
        embeddings = _mean_pool(texts, model.wv)
    elif method == 'FastText':
        model = FastText(sentences=[text.split() for text in texts], vector_size=_WORD_VECTOR_DIM, window=5, min_count=1, workers=4)
        embeddings = _mean_pool(texts, model.wv)
    elif method == 'GloVe':
        # Imported here because the notebook's import cell does not provide them
        from gensim.models import KeyedVectors
        from gensim.scripts.glove2word2vec import glove2word2vec

        glove_input_file = 'glove.6B.100d.txt'  # Specify the correct path to the GloVe file
        word2vec_output_file = 'glove.6B.100d.txt.word2vec'
        glove2word2vec(glove_input_file, word2vec_output_file)
        model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
        embeddings = _mean_pool(texts, model)
    elif method == 'BERT':
        # NOTE(review): SentenceTransformer is not imported in this notebook —
        # presumably from sentence_transformers; confirm
        model = SentenceTransformer('bert-base-nli-mean-tokens')
        embeddings = model.encode(texts)
    else:  # 'USE'
        # NOTE(review): hub is not imported in this notebook — presumably
        # tensorflow_hub; confirm
        model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        embeddings = np.array(model(texts))

    # Save the model or embeddings
    save_model_embeddings(model, f'{method}_model', method, is_embedding=True)

    return embeddings

In [None]:
classifiers_embeddings = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')  # For XGBoost, avoid warnings
}

# Assuming 'Clean_Text' and 'Label' are columns in your DataFrame
texts = df_deduplicated['Clean_Text'].tolist()
labels = df_deduplicated['Label'].tolist()

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# One dict per (embedding, classifier) run; the DataFrame is built once at the
# end instead of concatenating onto an empty frame on every iteration
embedding_rows = []

# Loop over embedding methods
for method in ['Word2Vec', 'FastText', 'GloVe', 'BERT', 'USE']:
    # Embed train and test texts in ONE call. generate_embeddings trains a new
    # Word2Vec / FastText model on whatever texts it receives, so two separate
    # calls would place train and test vectors in unrelated vector spaces and
    # make the test-set metrics meaningless.
    # Trade-off: the unsupervised embedding model now also sees the test texts
    # (never the labels).
    all_embed = np.asarray(generate_embeddings(method, list(X_train) + list(X_test)))
    X_train_embed = all_embed[:len(X_train)]
    X_test_embed = all_embed[len(X_train):]

    # Apply SMOTE to the training embeddings only
    smote = SMOTE(random_state=42)
    X_train_vec_smote, y_train_smote = smote.fit_resample(np.array(X_train_embed), y_train)

    for clf_name, clf in classifiers_embeddings.items():
        clf.fit(X_train_vec_smote, y_train_smote)
        y_pred = clf.predict(np.array(X_test_embed))

        # Calculate metrics
        overall_accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = score(y_test, y_pred, labels=[1], average='binary')

        embedding_rows.append({
            "Embedding": method,
            "Classifier": clf_name,
            "Overall Accuracy": overall_accuracy,
            "Class 1 Precision": precision,
            "Class 1 Recall": recall,
            "Class 1 F1-Score": f1
        })

        # Save the trained classifier
        # NOTE(review): 'Model_Embeddings' differs from the 'Models_Embeddings'
        # folder used for the embedding models — looks like a typo; confirm
        # before changing, in case something loads from this path
        save_model(clf, f'{method}_{clf_name}_classifier', models_directory = 'Model_Embeddings')

# Results DataFrame
results_embeddings = pd.DataFrame(embedding_rows)

In [None]:
# Sort the embedding results in place, best Class 1 F1-Score first, and display them
results_embeddings.sort_values(by="Class 1 F1-Score", ascending=False, inplace=True)
display(results_embeddings)

#### Performance Comparison

In [None]:
def plot_performance(df):
    """
    Show one grouped bar chart per metric (Class 1 Recall, Class 1 F1-Score).

    Bars are grouped by the 'Embedding' column and coloured by 'Classifier'.
    Extend the tuple of metrics below (e.g. with 'Overall Accuracy' or
    'Class 1 Precision') to plot more of them.

    Args:
        df (pandas.DataFrame): Results table with 'Embedding', 'Classifier'
            and the metric columns.

    Returns:
        None
    """
    # Fixed left-to-right order of the embedding groups on the x axis
    embedding_order = {"Embedding": ["Word2Vec", "GloVe", "BERT", "USE", "FastText"]}

    for metric in ('Class 1 Recall', 'Class 1 F1-Score'):
        chart = px.bar(
            df,
            x='Embedding',
            y=metric,
            color='Classifier',
            barmode='group',
            title=f'{metric} by Embedding and Classifier',
            category_orders=embedding_order,
        )
        chart.update_layout(
            xaxis_title="Embedding",
            yaxis_title=metric,
            legend_title="Classifier",
        )
        chart.show()

In [None]:
# Give both classic-feature result tables the same 'Embedding' column name as
# results_embeddings so the three tables can be combined
results.rename(columns={'Feature Engineering': 'Embedding'}, inplace=True)
results_balanced.rename(columns={'Feature Engineering': 'Embedding'}, inplace=True)

In [None]:
# Add a column to each DataFrame to indicate the type of run
results['Run Type'] = 'Without SMOTE'
results_balanced['Run Type'] = 'With SMOTE'
results_embeddings['Run Type'] = 'Advanced Embeddings with SMOTE'

# Concatenate all three DataFrames
combined_df = pd.concat([results, results_balanced, results_embeddings])

# Create a unique identifier for each Embedding and Classifier combination
# (this column is not listed in id_vars below, so it is not carried into melted_df)
combined_df['Combination'] = combined_df['Embedding'] + ' + ' + combined_df['Classifier']

# Long format: one row per (Embedding, Classifier, Run Type, Metric)
melted_df = combined_df.melt(id_vars=['Embedding', 'Classifier', 'Run Type'], 
                             value_vars=['Overall Accuracy', 'Class 1 Precision', 'Class 1 Recall', 'Class 1 F1-Score'],
                             var_name='Metric', value_name='Performance Value')

# Create a unique identifier for each Embedding, Classifier and Run Type combination
melted_df['Combination'] = melted_df['Embedding'] + ' + ' + melted_df['Classifier'] + ' (' + melted_df['Run Type'] + ')'

# Define the desired metric order
metric_order = ['Overall Accuracy', 'Class 1 Precision', 'Class 1 Recall', 'Class 1 F1-Score']

# Ensure 'Metric' column is ordered correctly
melted_df['Metric'] = pd.Categorical(melted_df['Metric'], categories=metric_order, ordered=True)

# Take the 6 rows with the highest 'Class 1 Recall' and keep their combination names
top_recall_combinations = melted_df[melted_df['Metric'] == 'Class 1 Recall']\
    .sort_values(by='Performance Value', ascending=False)\
    .head(6)['Combination'].unique()

# Define a list of unique colors for the top 6 lines
top_colors = ['#E63946', '#F4A261', '#2A9D8F', '#264653', '#E9C46A', '#F4E76E']

# Initialize an empty figure
fig = go.Figure()

# Add traces for the top 6 models with unique colors (one line per combination across the metrics)
for i, combination in enumerate(top_recall_combinations):
    df_filtered = melted_df[melted_df['Combination'] == combination]
    fig.add_trace(go.Scatter(x=df_filtered['Metric'], y=df_filtered['Performance Value'],
                             name=combination, mode='lines',
                             line=dict(color=top_colors[i], width=3)))  # Use unique color and slightly thicker lines

# Then, add traces for the rest of the models in dark gray
for combination in melted_df['Combination'].unique():
    if combination not in top_recall_combinations:
        df_filtered = melted_df[melted_df['Combination'] == combination]
        fig.add_trace(go.Scatter(x=df_filtered['Metric'], y=df_filtered['Performance Value'],
                                 name=combination, mode='lines',
                                 line=dict(color='darkgray', width=2)))  # Standard lines in dark gray

# Customize layout: transparent background, centered title, fixed metric order on the x axis
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    'title': {
        'text': "<b>Performance Across Different Metrics and Runs</b>",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 24, 'family': "Arial", 'color': "black"}
    },
    'xaxis_title': '<b>Metric</b>',
    'yaxis_title': '<b>Performance Value</b>',
    'xaxis': {'categoryorder': 'array', 'categoryarray': metric_order},
    'xaxis_title_font': {'size': 18, 'family': "Arial"},
    'yaxis_title_font': {'size': 18, 'family': "Arial"},
    'legend_title': '<b>Embedding + Classifier + Run Type</b>',
    'height': 600
})

# Show the figure
fig.show()

### Hyperparameter Tuning

#### Naive Bayes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Step 1: Split the data into training and testing sets
# (80/20 here; this rebinds y_train / y_test from the earlier 70/30 splits)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_deduplicated['Clean_Text'], df_deduplicated['Label'], test_size=0.2, random_state=42)

# Step 2: Vectorization - fit TF-IDF on the training texts, then transform the test texts
# (this rebinds tfidf_vectorizer, X_train_tfidf and X_test_tfidf from earlier cells)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw)
X_test_tfidf = tfidf_vectorizer.transform(X_test_raw)

# Step 3: Define the parameter grid for MultinomialNB (smoothing strength alpha)
param_grid_nb = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Step 4: Initialize the GridSearchCV object (5-fold CV, selecting on recall)
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=5, scoring='recall')

# Step 5: Fit GridSearchCV to the training data
grid_search_nb.fit(X_train_tfidf, y_train)

# Best parameters and the mean cross-validated recall they achieved
print("Best parameters for maximizing recall:", grid_search_nb.best_params_)
print("Best recall score:", grid_search_nb.best_score_)

#### Logistic Regression

In [None]:
# Imported here so the cell does not depend on a later cell having been run first.
from sklearn.linear_model import LogisticRegression

# Define the parameter grid for LogisticRegression:
# C is the inverse regularization strength, solver the optimization algorithm.
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'lbfgs']}

# Initialize the GridSearchCV object (5-fold CV, selecting on recall)
grid_search_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=5, scoring='recall')

# Fit it to the TF-IDF training features built in the Naive Bayes cell
grid_search_lr.fit(X_train_tfidf, y_train)

# Best parameters and score
print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best recall score:", grid_search_lr.best_score_)

### Model Ensembling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, make_scorer


def _text_pipeline(step_name, estimator):
    """Return a raw-text pipeline: a fresh TF-IDF vectorizer followed by `estimator`."""
    return Pipeline([('tfidf', TfidfVectorizer()), (step_name, estimator)])


# Re-use the hyperparameters selected by the two grid searches
best_nb_params = grid_search_nb.best_params_
best_lr_params = grid_search_lr.best_params_

pipeline_nb = _text_pipeline('nb', MultinomialNB(alpha=best_nb_params['alpha']))
pipeline_lr = _text_pipeline(
    'lr',
    LogisticRegression(C=best_lr_params['C'], solver=best_lr_params['solver'], max_iter=1000),
)

# Soft voting averages the predicted class probabilities of both pipelines
voting_clf_pipeline = VotingClassifier(
    estimators=[('nb', pipeline_nb), ('lr', pipeline_lr)],
    voting='soft',
)

# Each pipeline vectorizes on its own, so the ensemble is fitted on raw text
voting_clf_pipeline.fit(X_train_raw, y_train)

# Accuracy on the held-out texts
accuracy = voting_clf_pipeline.score(X_test_raw, y_test)
print(f'Voting Classifier Accuracy: {accuracy}')

# Recall of the positive class on the held-out texts
y_pred = voting_clf_pipeline.predict(X_test_raw)
recall = recall_score(y_test, y_pred, average='binary')  # adjust the average parameter as per your use case
print(f'Recall Score: {recall}')


### Neural Networks

In [None]:
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv1D, MaxPooling1D, LSTM, Bidirectional, Input, Reshape
from imblearn.over_sampling import RandomOverSampler
# Assuming df_deduplicated['Clean_Text'] and df_deduplicated['Label'] are your data and labels

# Step 1: Split the data into training and testing sets (stratified on the
# label so both parts keep the same class ratio)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df_deduplicated['Clean_Text'],
    df_deduplicated['Label'],
    test_size=0.2,
    random_state=42,
    stratify=df_deduplicated['Label'],
)

# Initialize the Tokenizer; the vocabulary is learned from the training texts only
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train_raw)

# Convert text sequences into integer sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_test_seq = tokenizer.texts_to_sequences(X_test_raw)

# Pad sequences to ensure uniform length
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Balance the classes by duplicating real minority-class sequences.
# SMOTE is deliberately not used on this representation: it creates new rows by
# interpolating between feature vectors, and an interpolated vector of token
# IDs does not correspond to any real sequence of words.
oversampler = RandomOverSampler(random_state=42)
X_train_pad_resampled, y_train_resampled = oversampler.fit_resample(X_train_pad, y_train)


# Define CNN model: embedding -> two conv/pool stages -> global average -> sigmoid
cnn_model = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define LSTM model: embedding -> two stacked bidirectional LSTMs -> sigmoid
lstm_model = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid'),
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define the original model: embedding -> global average -> sigmoid
original_model = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=100),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid'),
])
original_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the three networks in turn on the resampled training sequences; the test
# split is passed as validation data so per-epoch metrics are reported for it
sequence_models = (cnn_model, lstm_model, original_model)
for net in sequence_models:
    net.fit(X_train_pad_resampled, y_train_resampled, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Final loss/accuracy of each network on the test split
(
    (cnn_test_loss, cnn_test_acc),
    (lstm_test_loss, lstm_test_acc),
    (original_test_loss, original_test_acc),
) = [net.evaluate(X_test_pad, y_test, verbose=0) for net in sequence_models]

print(f'CNN Test Accuracy: {cnn_test_acc}')
print(f'LSTM Test Accuracy: {lstm_test_acc}')
print(f'Original Model Test Accuracy: {original_test_acc}')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report


def generate_word2vec_embeddings(texts, model=None):
    """Return one mean-pooled Word2Vec vector per text.

    Parameters:
    - texts: iterable of str
        Whitespace-tokenized documents to embed.
    - model: gensim Word2Vec, optional
        An already trained model to look the words up in. When omitted, a new
        model is trained on `texts` themselves. Pass the same model for every
        data split that will be compared or fed to one classifier: vectors
        coming from separately trained models do not share a vector space.

    Returns:
    - numpy.ndarray with one row per text. A text with no known word gets a
      zero vector of the model's dimensionality.
    """
    tokenized = [text.split() for text in texts]
    if model is None:
        model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)

    dim = model.vector_size
    rows = []
    for tokens in tokenized:
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        # Fall back to a zero vector when none of the tokens is in the vocabulary
        rows.append(np.mean(vectors, axis=0) if vectors else np.zeros(dim))
    return np.array(rows)


# Perform stratified train-test split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(df_deduplicated['Clean_Text'], df_deduplicated['Label']))
X_train_raw, X_test_raw = df_deduplicated['Clean_Text'].iloc[train_index], df_deduplicated['Clean_Text'].iloc[test_index]
y_train, y_test = df_deduplicated['Label'].iloc[train_index], df_deduplicated['Label'].iloc[test_index]

# Train a single Word2Vec model on the training texts only and use it to embed
# both splits, so train and test vectors live in the same space and nothing is
# learned from the test texts.
w2v_model = Word2Vec(sentences=[text.split() for text in X_train_raw], vector_size=100, window=5, min_count=1, workers=4)
X_train_embed = generate_word2vec_embeddings(X_train_raw, model=w2v_model)
X_test_embed = generate_word2vec_embeddings(X_test_raw, model=w2v_model)

# Apply SMOTE for class imbalance (training embeddings only)
smote = SMOTE()
X_train_embed_resampled, y_train_resampled = smote.fit_resample(X_train_embed, y_train)

In [None]:
# # Define CNN model with embeddings
# cnn_embed_model = Sequential()
# cnn_embed_model.add(Input(shape=(X_train_embed.shape[1],)))  # Input shape based on embeddings
# cnn_embed_model.add(Reshape((X_train_embed.shape[1], 1)))  # Reshape input to 3D
# cnn_embed_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
# cnn_embed_model.add(MaxPooling1D(pool_size=2))
# cnn_embed_model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
# cnn_embed_model.add(MaxPooling1D(pool_size=2))
# cnn_embed_model.add(GlobalAveragePooling1D())
# cnn_embed_model.add(Dense(1, activation='sigmoid'))
# cnn_embed_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Define LSTM model with embeddings
# lstm_embed_model = Sequential()
# lstm_embed_model.add(Input(shape=(X_train_embed.shape[1],)))  # Input shape based on embeddings
# lstm_embed_model.add(Reshape((X_train_embed.shape[1], 1)))  # Reshape input to 3D
# lstm_embed_model.add(Bidirectional(LSTM(64, return_sequences=True)))
# lstm_embed_model.add(Bidirectional(LSTM(32)))
# lstm_embed_model.add(Dense(1, activation='sigmoid'))
# lstm_embed_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Define the original model with embeddings
# original_embed_model = Sequential()
# original_embed_model.add(Input(shape=(X_train_embed.shape[1],)))  # Input shape based on embeddings
# original_embed_model.add(Dense(64, activation='relu'))
# original_embed_model.add(Dense(1, activation='sigmoid'))
# original_embed_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Define autoencoder model with embeddings
# autoencoder_model = Sequential()
# autoencoder_model.add(Input(shape=(X_train_embed.shape[1],)))  # Input shape based on embeddings
# autoencoder_model.add(Dense(64, activation='relu'))  # Encoder layer
# autoencoder_model.add(Dense(X_train_embed.shape[1], activation='sigmoid'))  # Decoder layer
# autoencoder_model.add(Dense(1, activation='sigmoid'))  # Classification layer
# autoencoder_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

embed_dim = X_train_embed.shape[1]

# Early stopping (with best-weight restoration) selects the model on whatever
# data it monitors. Monitoring the test split would make the test metrics
# reported afterwards optimistic, so a validation part is carved out of the
# training split instead and the test split is left untouched until evaluation.
X_fit_embed, X_val_embed, y_fit, y_val = train_test_split(
    X_train_embed, y_train, test_size=0.2, random_state=42, stratify=y_train
)
# Oversample only the part that is actually fitted, so validation rows stay real
X_fit_embed_resampled, y_fit_resampled = SMOTE(random_state=42).fit_resample(X_fit_embed, y_fit)


def _fit_with_early_stopping(net):
    """Fit `net` on the resampled training part, stopping on validation loss.

    Returns the EarlyStopping callback that was used.
    """
    stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    net.fit(X_fit_embed_resampled, y_fit_resampled, epochs=50, batch_size=32,
            validation_data=(X_val_embed, y_val), callbacks=[stopper])
    return stopper


# CNN over the embedding vector, which is reshaped to (embed_dim, 1) so that
# Conv1D can slide along the embedding dimensions
cnn_embed_model = Sequential([
    Input(shape=(embed_dim,)),
    Reshape((embed_dim, 1)),
    Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer='l2'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu', kernel_regularizer='l2'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=32, kernel_size=3, activation='relu', kernel_regularizer='l2'),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu', kernel_regularizer='l2'),
    Dense(1, activation='sigmoid'),
])
cnn_embed_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = _fit_with_early_stopping(cnn_embed_model)

# Stacked bidirectional LSTMs over the same (embed_dim, 1) reshaped input
lstm_embed_model = Sequential([
    Input(shape=(embed_dim,)),
    Reshape((embed_dim, 1)),
    Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer='l2', recurrent_regularizer='l2')),
    Bidirectional(LSTM(32, return_sequences=True, kernel_regularizer='l2', recurrent_regularizer='l2')),
    Bidirectional(LSTM(16, kernel_regularizer='l2', recurrent_regularizer='l2')),
    Dense(64, activation='relu', kernel_regularizer='l2'),
    Dense(1, activation='sigmoid'),
])
lstm_embed_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = _fit_with_early_stopping(lstm_embed_model)

# Plain feed-forward classifier on the embedding vector
original_embed_model = Sequential([
    Input(shape=(embed_dim,)),
    Dense(128, activation='relu', kernel_regularizer='l2'),
    Dense(64, activation='relu', kernel_regularizer='l2'),
    Dense(32, activation='relu', kernel_regularizer='l2'),
    Dense(1, activation='sigmoid'),
])
original_embed_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = _fit_with_early_stopping(original_embed_model)

# Autoencoder-shaped classifier: the layer widths contract to a bottleneck and
# expand again, but the network is trained end to end on the classification
# loss only (there is no reconstruction objective).
autoencoder_model = Sequential([
    Input(shape=(embed_dim,)),
    Dense(128, activation='relu', kernel_regularizer='l2'),  # Encoder layer
    Dense(64, activation='relu', kernel_regularizer='l2'),  # Encoder layer
    Dense(32, activation='relu', kernel_regularizer='l2'),  # Bottleneck layer
    Dense(64, activation='relu', kernel_regularizer='l2'),  # Decoder layer
    Dense(128, activation='relu', kernel_regularizer='l2'),  # Decoder layer
    Dense(embed_dim, activation='sigmoid'),  # Output layer
    Dense(64, activation='relu', kernel_regularizer='l2'),  # Classification layer
    Dense(1, activation='sigmoid'),  # Classification output
])
autoencoder_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = _fit_with_early_stopping(autoencoder_model)

In [None]:
# Loss and accuracy of every embedding-based network on the test split,
# evaluated in the order: CNN, LSTM, autoencoder, original
embed_models = (cnn_embed_model, lstm_embed_model, autoencoder_model, original_embed_model)
(
    (cnn_embed_test_loss, cnn_embed_test_acc),
    (lstm_embed_test_loss, lstm_embed_test_acc),
    (autoencoder_embed_test_loss, autoencoder_embed_test_acc),
    (original_embed_test_loss, original_embed_test_acc),
) = [net.evaluate(X_test_embed, y_test, verbose=0) for net in embed_models]

print(f'CNN with Word2Vec Embeddings Test Accuracy: {cnn_embed_test_acc}')
print(f'LSTM with Word2Vec Embeddings Test Accuracy: {lstm_embed_test_acc}')
print(f'Autoencoder Model with Word2Vec Embeddings Test Accuracy: {autoencoder_embed_test_acc}')
print(f'Original Model with Word2Vec Embeddings Test Accuracy: {original_embed_test_acc}')

In [None]:
# Probability above which a sample is assigned to class 1. A value below 0.5
# labels more samples as positive.
threshold = 0.2


def _predict_at_threshold(net):
    """Return 0/1 predictions of `net` on the test embeddings using `threshold`."""
    return (net.predict(X_test_embed) > threshold).astype(int)


# Per-class precision/recall/F1 of every embedding-based network on the test split
cnn_embed_pred = _predict_at_threshold(cnn_embed_model)
print("CNN with Word2Vec Embeddings Classification Report:")
print(classification_report(y_test, cnn_embed_pred))

lstm_embed_pred = _predict_at_threshold(lstm_embed_model)
print("LSTM with Word2Vec Embeddings Classification Report:")
print(classification_report(y_test, lstm_embed_pred))

autoencoder_pred = _predict_at_threshold(autoencoder_model)
print("Autoencoder with Word2Vec Embeddings Classification Report:")
print(classification_report(y_test, autoencoder_pred))

original_embed_pred = _predict_at_threshold(original_embed_model)
print("Original Model with Word2Vec Embeddings Classification Report:")
print(classification_report(y_test, original_embed_pred))

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Pre-trained sentence encoders to compare
sbert_model = SentenceTransformer('stsb-mpnet-base-v2')
simcse_sup_model = SentenceTransformer('princeton-nlp/sup-simcse-roberta-large')
simcse_unsup_model = SentenceTransformer('princeton-nlp/unsup-simcse-roberta-large')

# The encoders take plain lists of strings
train_texts = X_train_raw.tolist()
test_texts = X_test_raw.tolist()

# Encode both splits with each encoder
X_train_sbert = sbert_model.encode(train_texts, show_progress_bar=True)
X_test_sbert = sbert_model.encode(test_texts, show_progress_bar=True)

X_train_simcse_sup = simcse_sup_model.encode(train_texts, show_progress_bar=True)
X_test_simcse_sup = simcse_sup_model.encode(test_texts, show_progress_bar=True)

X_train_simcse_unsup = simcse_unsup_model.encode(train_texts, show_progress_bar=True)
X_test_simcse_unsup = simcse_unsup_model.encode(test_texts, show_progress_bar=True)

# Oversample the minority class in each training embedding set.
# Only the labels returned by the first call are kept; the other two calls
# resample against the same y_train and their label outputs are discarded.
smote = SMOTE()
X_train_sbert_resampled, y_train_resampled = smote.fit_resample(X_train_sbert, y_train)
X_train_simcse_sup_resampled, _ = smote.fit_resample(X_train_simcse_sup, y_train)
X_train_simcse_unsup_resampled, _ = smote.fit_resample(X_train_simcse_unsup, y_train)

# (resampled training features, untouched test features) per encoder
embeddings = {
    'SBERT': (X_train_sbert_resampled, X_test_sbert),
    'SimCSE Supervised': (X_train_simcse_sup_resampled, X_test_simcse_sup),
    'SimCSE Unsupervised': (X_train_simcse_unsup_resampled, X_test_simcse_unsup)
}

# One fresh logistic-regression classifier per encoder
models = {encoder_name: LogisticRegression() for encoder_name in embeddings}

# Train each classifier on its encoder's features and report on the test split
for model_name, model in models.items():
    X_train_emb, X_test_emb = embeddings[model_name]
    model.fit(X_train_emb, y_train_resampled)
    y_pred = model.predict(X_test_emb)
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
# Data Augmentation Functions
def back_translation(texts, src_lang='en', tgt_lang='fr'):
    """Paraphrase each text by translating it to `tgt_lang` and back to `src_lang`.

    Parameters:
    - texts: iterable of str
    - src_lang: str
        Language code of the input texts.
    - tgt_lang: str
        Language code of the intermediate (pivot) language.

    Returns:
    - list of str: the round-tripped texts, in input order.
    """
    from googletrans import Translator

    translator = Translator()

    def _round_trip(text):
        pivot = translator.translate(text, src=src_lang, dest=tgt_lang)
        restored = translator.translate(pivot.text, src=tgt_lang, dest=src_lang)
        return restored.text

    return [_round_trip(text) for text in texts]

def synonym_replacement(texts, n=1):
    """Replace every word that has WordNet synonyms with randomly drawn synonyms.

    Parameters:
    - texts: iterable of str
        Whitespace-tokenized texts.
    - n: int
        Number of synonyms drawn (with replacement) for each replaced word;
        with the default of 1 each such word is swapped for one synonym.

    Returns:
    - list of str: the augmented texts, in input order. Words without any
      synonym different from themselves are kept unchanged.
    """
    from nltk.corpus import wordnet

    augmented_texts = []
    for text in texts:
        new_words = []
        for word in text.split():
            synonyms = [
                lemma.name().replace('_', ' ')
                for syn in wordnet.synsets(word)
                for lemma in syn.lemmas()
            ]
            synonyms = [synonym for synonym in synonyms if synonym != word]
            if synonyms:
                # np.random.choice returns an array; convert it to plain strings
                # so that ' '.join below receives str items only
                new_words.extend(np.random.choice(synonyms, n).tolist())
            else:
                new_words.append(word)
        augmented_texts.append(' '.join(new_words))
    return augmented_texts

def random_insertion(texts, n=1):
    """Insert random lowercase letters as extra tokens after roughly half of the words.

    Parameters:
    - texts: iterable of str
        Whitespace-tokenized texts.
    - n: int
        Number of single-letter tokens inserted after each selected word.

    Returns:
    - list of str: the augmented texts, in input order.
    """
    results = []
    for sentence in texts:
        tokens = []
        for token in sentence.split():
            tokens.append(token)
            if np.random.random() >= 0.5:
                continue
            # Code points 97..122 are the letters 'a'..'z'
            tokens += [chr(np.random.randint(97, 123)) for _ in range(n)]
        results.append(' '.join(tokens))
    return results

def random_deletion(texts, p=0.2):
    """Drop each word of each text independently with probability `p`.

    Parameters:
    - texts: iterable of str
        Whitespace-tokenized texts.
    - p: float
        Per-word deletion probability.

    Returns:
    - list of str: the thinned texts, in input order (a text may become empty).
    """
    return [
        ' '.join(word for word in text.split() if np.random.random() > p)
        for text in texts
    ]

def random_swap(texts, n=1):
    """Randomly swap words with nearby words, keeping every word of the text.

    Each position is selected with probability 0.5; a selected word is
    exchanged with a word drawn uniformly from the window of `n` positions on
    either side (the window includes the position itself, so a swap can be a
    no-op). Because words are exchanged rather than copied, the output always
    contains exactly the words of the input.

    Parameters:
    - texts: iterable of str
        Whitespace-tokenized texts.
    - n: int
        Window radius, in words, within which a swap partner is drawn.

    Returns:
    - list of str: the shuffled texts, in input order.
    """
    augmented_texts = []
    for text in texts:
        words = text.split()
        for i in range(len(words)):
            if np.random.random() < 0.5:
                continue
            lower = max(0, i - n)
            upper = min(len(words), i + n + 1)
            j = np.random.randint(lower, upper)
            words[i], words[j] = words[j], words[i]
        augmented_texts.append(' '.join(words))
    return augmented_texts

def semantic_augmentation(texts, model_name='distilbert-base-nli-mean-tokens'):
    """Run each text through a Hugging Face text-generation pipeline.

    Parameters:
    - texts: iterable of str
    - model_name: str
        Model identifier handed to `transformers.pipeline`.
        NOTE(review): the default name looks like a sentence-embedding
        checkpoint rather than a generative one - confirm that it loads for
        the 'text-generation' task.

    Returns:
    - list of str: the 'generated_text' of the single returned sequence for
      each input, in input order.
    """
    from transformers import pipeline

    generator = pipeline('text-generation', model=model_name)
    return [
        generator(text, max_length=100, num_return_sequences=1)[0]['generated_text']
        for text in texts
    ]

def mixed_augmentation(texts):
    """Chain all augmentation steps, feeding each step's output into the next.

    The steps run in this order: back translation, synonym replacement,
    random insertion, random deletion, random swap, semantic augmentation.

    Parameters:
    - texts: iterable of str

    Returns:
    - list of str: the texts after the last step.
    """
    stages = (
        back_translation,
        synonym_replacement,
        random_insertion,
        random_deletion,
        random_swap,
        semantic_augmentation,
    )
    augmented_texts = texts
    for stage in stages:
        augmented_texts = stage(augmented_texts)
    return augmented_texts

# Augment the training data.
# The raw text splits are used here because y_train / y_test, which the models
# below are fitted and evaluated against, are row-aligned with X_train_raw /
# X_test_raw.
X_train_augmented = mixed_augmentation(X_train_raw)

# Tokenize and pad sequences (vocabulary learned from the augmented training texts)
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train_augmented)
X_train_seq = tokenizer.texts_to_sequences(X_train_augmented)
X_test_seq = tokenizer.texts_to_sequences(X_test_raw)
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Define CNN model: embedding -> two conv/pool stages -> global average -> sigmoid
cnn_model = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define LSTM model: embedding -> two stacked bidirectional LSTMs -> sigmoid
lstm_model = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid'),
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define the original model: embedding -> global average -> sigmoid
original_model = Sequential([
    Embedding(input_dim=20000, output_dim=100, input_length=100),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid'),
])
original_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train and evaluate models
cnn_model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))
lstm_model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))
original_model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluate models on test set
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(X_test_pad, y_test, verbose=0)
lstm_test_loss, lstm_test_acc = lstm_model.evaluate(X_test_pad, y_test, verbose=0)
original_test_loss, original_test_acc = original_model.evaluate(X_test_pad, y_test, verbose=0)

print(f'CNN Test Accuracy: {cnn_test_acc}')
print(f'LSTM Test Accuracy: {lstm_test_acc}')
print(f'Original Model Test Accuracy: {original_test_acc}')

In [None]:
# Predicted probabilities on the padded test sequences, and their 0/1 rounding.
# NOTE(review): `model` is not assigned in this cell - confirm that it refers
# to a network trained on the padded token sequences before running it.
y_pred_probs = model.predict(X_test_pad)
y_pred = np.round(y_pred_probs).astype(int)  # 0.5 cut-off

# Confusion matrix of the rounded predictions
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# ROC curve and the area under it, computed from the raw probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

roc_axes = plt.figure().gca()
roc_axes.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
roc_axes.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
roc_axes.set_xlim([0.0, 1.0])
roc_axes.set_ylim([0.0, 1.05])
roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_title('Receiver Operating Characteristic')
roc_axes.legend(loc="lower right")
plt.show()

print("AUC:", roc_auc)

In [None]:
# Recall and precision of the predictions in y_pred against the true labels in y_test
recall, precision = recall_score(y_test, y_pred), precision_score(y_test, y_pred)

for metric_label, metric_value in (("Recall:", recall), ("Precision:", precision)):
    print(metric_label, metric_value)


## Classifier Modeling (Advanced ML)

### Embeddings for Context Capture

In [None]:
# Load the pre-trained sentence-embedding model.
# This rebinds the notebook-level name `model`; the following cells call
# `model.encode(...)` on it to embed the train/validation sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Splitting the deduplicated DataFrame into training, testing, and validation
# sets (60% / 20% / 20%). Both splits are stratified on the label so that each
# part keeps the same class ratio.
train, test = train_test_split(
    df_deduplicated, test_size=0.2, random_state=42, stratify=df_deduplicated['Label']
)
train, val = train_test_split(
    train, test_size=0.25, random_state=42, stratify=train['Label']
)  # 0.25 * 0.8 = 0.2

In [None]:
# Training sentences grouped by label (1 = positive, 0 = negative)
positive_sentences = train.loc[train['Label'] == 1, 'Clean_Text'].tolist()
negative_sentences = train.loc[train['Label'] == 0, 'Clean_Text'].tolist()

# Validation sentences together with their labels
val_sentences, val_labels = val['Clean_Text'].tolist(), val['Label'].tolist()

In [None]:
# Encode the positive, negative and validation sentences (in that order) as tensors
positive_embeddings, negative_embeddings, val_embeddings = (
    model.encode(sentences, convert_to_tensor=True)
    for sentences in (positive_sentences, negative_sentences, val_sentences)
)

#### Transformers for Advanced Modeling

In [None]:
from sentence_transformers import util

# Compute cosine similarities between validation sentences and positive (ground truth) sentences.
# Rows follow val_embeddings, columns follow positive_embeddings.
cosine_scores_val = util.cos_sim(val_embeddings, positive_embeddings)

# Convert to numpy for easier processing. The tensor is moved to host memory
# first: calling .numpy() directly on a tensor that lives on a GPU raises.
cosine_scores_val_np = cosine_scores_val.cpu().numpy()

def determine_predictions(scores, threshold):
    """Label each row 1 when its highest score reaches `threshold`, else 0.

    Parameters:
    - scores: iterable of rows of numbers
        One row per sentence to classify; each row holds that sentence's
        similarity scores against all reference sentences.
    - threshold: float
        Minimum best-match score required for a positive prediction.

    Returns:
    - list of int: one 0/1 prediction per row, in row order.
    """
    return [1 if np.max(score_row) >= threshold else 0 for score_row in scores]

# Iterate through thresholds to find the best one based on recall
thresholds = np.arange(0, 1.05, 0.05)

# Start below any attainable recall so that the sweep result is always taken
# from the data (a starting value of 1 could never be beaten by a strict
# comparison, leaving the initial placeholders as the reported "optimum").
best_recall = -1.0
best_threshold = thresholds[0]
for threshold in thresholds:
    preds = determine_predictions(cosine_scores_val_np, threshold)
    recall = recall_score(val_labels, preds)
    # ">=": thresholds are visited in increasing order, so ties are resolved in
    # favour of the highest (most selective) threshold that reaches the best recall
    if recall >= best_recall:
        best_recall = recall
        best_threshold = threshold

print(f"Optimal threshold: {best_threshold} with Recall: {best_recall}")

In [None]:
def find_example_sentences(scores, val_sentences, positive_sentences, criteria='high'):
    """Pick one (validation sentence, positive sentence) pair by its similarity score.

    Parameters:
    - scores: 2-D numpy array
        scores[i, j] is the similarity of val_sentences[i] and positive_sentences[j].
    - val_sentences: sequence of str
    - positive_sentences: sequence of str
    - criteria: str
        'high' selects the pair with the largest score, 'low' the pair with the
        smallest score; any other value selects the pair whose score is closest
        to the median of all scores.

    Returns:
    - tuple (validation sentence, positive sentence, score) of the selected pair.
    """
    if criteria == 'high':
        flat_idx = np.argmax(scores)
    elif criteria == 'low':
        flat_idx = np.argmin(scores)
    else:
        # "middle": the entry with the smallest distance to the median score
        flat_idx = np.argmin(np.abs(scores - np.median(scores)))

    # argmax/argmin index the flattened array; map back to (row, column)
    val_idx, pos_idx = np.unravel_index(flat_idx, scores.shape)
    return val_sentences[val_idx], positive_sentences[pos_idx], scores[val_idx, pos_idx]

# Print one example pair for each selection criterion
criteria_list = ['high', 'low', 'middle']

for criteria in criteria_list:
    example = find_example_sentences(cosine_scores_val_np, val_sentences, positive_sentences, criteria)
    val_sentence, pos_sentence, score = example
    print(f"Criteria: {criteria}")
    print(f"Validation Sentence: {val_sentence}")
    print(f"Positive Sentence: {pos_sentence}")
    print(f"Cosine Similarity Score: {score}\n")

### Semantic Similarity

#### Adding Ground Truth

In [None]:
# Attach the main dataset's columns to every ground-truth row (left join on
# image id + body) and keep the first match per (image id, body) pair
df_truth = pd.read_csv('Ground_Truth.csv')
df_joined = pd.merge(
    df_truth, df, how='left', left_on=['Image Id', 'Body'], right_on=['img_id', 'Body']
).drop_duplicates(subset=['Image Id', 'Body'], keep='first')


def _lemmatized_clean(text):
    """Apply clean_text_advanced with the lemmatization settings used for both columns."""
    return clean_text_advanced(
        text,
        method="lemmatization",
        remove_special_chars=True,
        remove_numbers=False,
        use_custom_stopwords=False,
        custom_stopwords=set(),
    )


# Basic cleaning of body and title
df_joined['Body_Clean_Basics'] = df_joined['Body'].apply(clean_text)
df_joined['Title_Clean_Basics'] = df_joined['Title'].apply(clean_text)

# Advanced (lemmatized) cleaning on top of the basic cleaning
df_joined['Body_Clean'] = df_joined['Body_Clean_Basics'].apply(_lemmatized_clean)
df_joined['Title_Clean'] = df_joined['Title_Clean_Basics'].apply(_lemmatized_clean)

# Title and body joined by a blank line, for both cleaning levels
df_joined['Clean_Text'] = df_joined['Title_Clean'].astype(str) + '\n\n' + df_joined['Body_Clean'].astype(str)
df_joined['Clean_Text_Basic'] = df_joined['Title_Clean_Basics'].astype(str) + '\n\n' + df_joined['Body_Clean_Basics'].astype(str)

In [None]:
def compare_similarity(df_ground_truth, df_comparison, embedding_option, text_column_name):
    """Score every comparison text against every ground-truth text.

    Parameters:
    - df_ground_truth: DataFrame
        Reference rows; one similarity column is produced per row.
    - df_comparison: DataFrame
        Rows to be scored; one output row is produced per row.
    - embedding_option: str
        Name of the SentenceTransformer model used to embed the texts.
    - text_column_name: str
        Column holding the text in both DataFrames.

    Returns:
    - DataFrame with the comparison texts followed by columns GT_0, GT_1, ...,
      where GT_i is the cosine similarity to the i-th ground-truth row.
    """
    from sentence_transformers import SentenceTransformer, util

    # Load the model based on the selected embedding option
    model = SentenceTransformer(embedding_option)

    # Generate embeddings for ground truth and comparison sentences
    embeddings_ground_truth = model.encode(df_ground_truth[text_column_name].tolist(), convert_to_tensor=True)
    embeddings_comparison = model.encode(df_comparison[text_column_name].tolist(), convert_to_tensor=True)

    # Cosine similarities, shaped (ground-truth rows, comparison rows)
    cosine_scores = util.cos_sim(embeddings_ground_truth, embeddings_comparison)

    # Prepare column names for similarity scores
    similarity_columns = [f"GT_{i}" for i in range(len(df_ground_truth))]

    # Transpose so that each comparison sentence becomes one row. The tensor is
    # moved to host memory first because .numpy() raises for a tensor on a GPU.
    cosine_scores_df = pd.DataFrame(cosine_scores.cpu().numpy().T, columns=similarity_columns)

    # Concatenate the original comparison sentences with their similarity scores;
    # the index is reset so both frames line up positionally
    results_df = pd.concat([df_comparison[[text_column_name]].reset_index(drop=True), cosine_scores_df], axis=1)

    return results_df

In [None]:
# Obtain 20 random samples from df_deduplicated (seeded, like the other
# random operations in this notebook, so that re-running gives the same rows)
random_samples = df_deduplicated.sample(n=20, random_state=42)

In [None]:
# Score the sampled rows against the ground-truth rows with the chosen encoder
results_df = compare_similarity(
    df_ground_truth=df_joined,
    df_comparison=random_samples,
    embedding_option='all-MiniLM-L6-v2',
    text_column_name='Clean_Text',
)

# Render the similarity table in the notebook
if results_df is not None:
    display(results_df)