# Import Libraries

In [None]:
import numpy as np
import pandas as pd

# Preprocessing
import regex as re
import demoji
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import gzip

# Modelling
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Preprocessing

## Import Data

In [None]:
# Load the labelled tweets: a ';'-separated CSV read without a header row,
# so the three column names are supplied explicitly.
file = 'Data Twitter LABELED.csv'
column_names = ["Timestamp", "text", "sarcasm"]
df = pd.read_csv(
    file,
    sep=';',
    header=None,
    names=column_names,
    encoding='iso-8859-1',
)
df.head()

## Cleaning

In [None]:
def clean_text(text):
    """Strip tweet-specific noise from a raw text string.

    Removes, in order: hashtags, URLs, mentions, every character that is not
    an ASCII letter, digit or whitespace, standalone numbers, and emoji.
    Whitespace runs are then collapsed to a single space and the ends trimmed.
    """
    removal_patterns = (
        r'#\w+',            # hashtags
        r'http\S+',         # URLs
        r'@\w+',            # mentions
        r'[^A-Za-z0-9\s]',  # special characters
        r'\b\d+\b',         # standalone numbers
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    text = demoji.replace(text, '')  # emoji
    collapsed = re.sub(r'\s+', ' ', text)  # extra whitespace
    return collapsed.strip()

# Coerce every entry to str before cleaning, since clean_text runs regex
# substitutions on its argument.
raw_text = df['text'].astype(str)
df['text'] = raw_text
df['cleaned_text'] = raw_text.apply(clean_text)
print(df['cleaned_text'])

## Case Folding

In [None]:
# Lower-case the cleaned text and keep the result in its own column.
cleaned_column = df['cleaned_text']
df['casefolded_text'] = cleaned_column.str.lower()
print(df['casefolded_text'])

## Tokenizing

In [None]:
# nltk.word_tokenize relies on the Punkt tokenizer data, which this notebook
# never downloads (only 'stopwords' is fetched). Without it the call raises
# LookupError on a fresh environment. Older NLTK releases look for 'punkt',
# newer ones for 'punkt_tab', so request both.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Split each lower-cased tweet into a list of word tokens.
df['tokenized_text'] = df['casefolded_text'].apply(nltk.word_tokenize)
print(df['tokenized_text'])

## Spelling Normalization

In [None]:
import ast

# Load the slang -> standard-word mapping.
# ast.literal_eval only accepts Python literals, whereas eval would execute
# any code placed in the file.
# NOTE(review): assumes the file contains a plain dict literal - confirm.
# The handle gets its own name so it does not rebind the global `file`.
with open('combined_slang_words.txt', 'r') as slang_file:
    slang_dict = ast.literal_eval(slang_file.read())

def normalize_text(tokenized_text):
    """Replace slang tokens with their mapped form.

    Every token that is a key of the module-level ``slang_dict`` is swapped
    for its mapped value; all other tokens pass through unchanged. A new list
    is returned.
    """
    return [slang_dict.get(token, token) for token in tokenized_text]

# Run slang normalisation over every token list.
tokenized_column = df['tokenized_text']
df['normalized_text'] = tokenized_column.apply(normalize_text)
print(df['normalized_text'])

## Stopword Removal

In [None]:
# Build the stopword set once. Creating it inside the function repeated the
# corpus lookup and set construction for every row passed through apply().
INDONESIAN_STOPWORDS = frozenset(stopwords.words("indonesian"))


def remove_stopwords(normalized_text):
    """Return the tokens of ``normalized_text`` that are not Indonesian stopwords.

    Args:
        normalized_text: iterable of word tokens.

    Returns:
        A new list with the stopwords filtered out, original order preserved.
    """
    return [word for word in normalized_text if word not in INDONESIAN_STOPWORDS]

# Drop stopwords from every normalised token list.
df['stpwrdrmv_text'] = df['normalized_text'].apply(remove_stopwords)
# Print the column just created; the previous empty column name raised KeyError.
print(df['stpwrdrmv_text'])

## Stemming

In [None]:
# One stemmer instance, created once and shared by every stem_text call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stem_text(normalized_text):
    """Stem every token in ``normalized_text`` and return the stems as a new list."""
    return list(map(stemmer.stem, normalized_text))

# Stem the stopword-free token lists.
filtered_column = df['stpwrdrmv_text']
df['stemmed_text'] = filtered_column.apply(stem_text)
print(df['stemmed_text'])

## Final Text

In [None]:
# Re-join the stemmed tokens into one space-separated string per tweet and
# persist the fully preprocessed frame.
df['final_text'] = df['stemmed_text'].apply(' '.join)
print(df['final_text'])
df.to_csv('final_text.csv', index=False)

file = 'final_text.csv'
df = pd.read_csv(file)
# Tweets that ended up empty are written as blank CSV fields and read back as
# NaN. Map them to '' first, otherwise astype(str) turns them into the literal
# word "nan", which would then be tokenised as if it were real text.
df['final_text'] = df['final_text'].fillna('').astype(str)
# Renumber rows after de-duplication so index labels match row positions.
df = df.drop_duplicates().reset_index(drop=True)

# Word Embedding

In [None]:
# Create embedding matrix
def create_embedding_matrix(vocab_and_vectors, word_index, embedding_dim):
    """Build the weight matrix for an embedding layer.

    Args:
        vocab_and_vectors: mapping from word to its pretrained vector.
        word_index: mapping from word to its integer row index.
        embedding_dim: number of columns in the matrix.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Row ``i`` holds the
        pretrained vector of the word indexed ``i``. Rows for words missing
        from ``vocab_and_vectors``, and any row no word maps to, stay zero.
    """
    row_count = len(word_index) + 1
    matrix = np.zeros((row_count, embedding_dim))
    for token, row in word_index.items():
        # Indices at or beyond the matrix height are ignored.
        if row < row_count:
            pretrained = vocab_and_vectors.get(token)
            if pretrained is not None:
                matrix[row] = pretrained
    return matrix

# Embedding width and padding/truncation settings
embedding_dim = 300
trunc_type = 'pre'
padding_type = 'pre'
oov_tok = "<OOV>"

# Fit the tokenizer on the preprocessed corpus
tokenizer = Tokenizer(oov_token=oov_tok)
corpus = df['final_text']
tokenizer.fit_on_texts(corpus)

# Turn each text into a sequence of word indices, then pad every sequence
# to the length of the longest one
X_sequences = tokenizer.texts_to_sequences(corpus)
max_length = max(map(len, X_sequences))
X_padded = pad_sequences(
    X_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Encode the sarcasm labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['sarcasm'])

# FastText model for Indonesian language
file = 'cc.id.300.vec.gz'
print('\nWord Embedding in Process . . .')
vocab_and_vectors = {}  # Map words to vectors
# The handle gets its own name so it does not rebind the global `file`.
with gzip.open(file, 'rt', encoding='utf-8') as vec_file:
    for line_number, line in enumerate(vec_file):
        # Split on the single-space delimiter only. A bare split() also breaks
        # on Unicode whitespace that can occur inside a token, which leaves
        # non-numeric pieces in values[1:] and makes the float conversion fail.
        values = line.rstrip().split(' ')
        # The .vec text format starts with a "<vocab size> <dimension>" header
        # line; skip it so it is not stored as a one-dimensional "vector".
        if line_number == 0 and len(values) == 2:
            continue
        word = values[0]
        vocab_and_vectors[word] = np.asarray(values[1:], dtype='float32')

# Create embedding matrix
embedding_matrix = create_embedding_matrix(vocab_and_vectors, tokenizer.word_index, embedding_dim)
print('Word Embedding Done')

# Display an example of word indexing
# .iloc is positional, so it lines up with X_sequences[2] even when the row
# labels have gaps (drop_duplicates does not renumber the index).
print("Original Text:\n", df['final_text'].iloc[2])
print("\nIndexed Sequence:\n", X_sequences[2])

# Display an example of padded sequence
# Use a separate name here: rebinding X_padded would replace the full padded
# matrix with this single row, and the later resampling step would then fail
# because X and y_encoded no longer have the same number of samples.
example_padded = pad_sequences([X_sequences[2]], maxlen=max_length)
print("\nPadded Sequence:\n", example_padded)

# Display an example of word embedding
word_index = tokenizer.word_index
example_word = list(word_index.keys())[1]
example_embedding = vocab_and_vectors.get(example_word)
print("\nExample Word Embedding for '{}':\n".format(example_word), example_embedding)

# Count unique words
print('Found %s unique tokens.' % len(word_index))

# Oversampling Method Selection

In [None]:
# Best score seen so far and the configuration that produced it
best_f1score = 0
best_result = dict.fromkeys(('resampling_technique', 'hyperparameters', 'f1-score'))

# Oversamplers to compare, all with the same fixed random seed
resampling_techniques = dict(
    SMOTE=SMOTE(random_state=42),
    ADASYN=ADASYN(random_state=42),
    RandomOverSampler=RandomOverSampler(random_state=42),
)

# One list per result column; one entry is appended per trained model
result_columns = ('resampling_technique', 'hyperparameters', 'accuracy', 'precision', 'recall', 'f1-score')
results_dict = {column: [] for column in result_columns}

# Hyperparameters for BiLSTM model (a single combination in this cell)
param_grid = dict(
    dropout_rate=[0.1],
    batch_size=[16],
    optimizer=['adam'],
    bilstm_neuron=[32],
    neuron=[32],
    lr=[0.01],
)

def plot_graphs(history):
    """Plot training vs. validation accuracy and loss for one Keras fit history."""
    plt.figure(figsize=(16, 5))
    panels = (('accuracy', "Accuracy"), ('loss', "Loss"))
    for position, (metric, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric])
        plt.plot(history.history[f'val_{metric}'])
        plt.xlabel("Epochs")
        plt.ylabel(label)
        plt.legend([metric, f'val_{metric}'])
        plt.title(label)
    plt.show()


def plot_confusion_matrix(conf_matrix):
    """Show a 2x2 confusion matrix as a heat map.

    NOTE(review): the tick labels assume encoded class 0 is 'Not Sarcasm' and
    class 1 is 'Sarcasm' - confirm against the label encoder.
    """
    plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    classes = ['Not Sarcasm', 'Sarcasm']
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


# Hold out the validation and test data BEFORE oversampling. Resampling the
# whole dataset first lets copies (or interpolations) of the same minority
# samples land on both sides of the split, which leaks training information
# into evaluation and inflates the reported scores.
# Data splitting 80:10:10, stratified on the original labels.
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Loop through resampling techniques
for resampling_name, resampling_technique in resampling_techniques.items():

    # Oversample the training split only; validation and test keep the real
    # class distribution.
    # NOTE(review): SMOTE and ADASYN synthesise samples by interpolating
    # feature values, and the features here are token ids, so synthetic rows
    # may not correspond to real text - confirm this is intended.
    X_train, y_train = resampling_technique.fit_resample(X_train_raw, y_train_raw)

    # Sample Counter
    print('\nSample Counter with', resampling_name, ":")
    counter1 = Counter(y_train_raw)
    print('Before Resampling\n', counter1)
    counter2 = Counter(y_train)
    print('After Resampling\n', counter2)

    # Plot the class distribution before and after resampling
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.bar(counter1.keys(), counter1.values(), color='blue')
    plt.title('Class Distribution Before Resampling')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.subplot(1, 2, 2)
    plt.bar(counter2.keys(), counter2.values(), color='green')
    plt.title('Class Distribution After Resampling')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Loop through hyperparameters
    for params in ParameterGrid(param_grid):
        print(f"\nResampling Technique: {resampling_name}, Hyperparameters: {params}")

        # Release the graph/state of the previous model so memory does not
        # grow with every model built in this loop
        tf.keras.backend.clear_session()

        # Build BiLSTM model
        model = Sequential()
        model.add(Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=False))
        model.add(Bidirectional(LSTM(params['bilstm_neuron'])))
        model.add(Dropout(params['dropout_rate']))
        model.add(Dense(params['neuron'], activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model with the specified optimizer and learning rate
        optimizer = Adam(learning_rate=params['lr']) if params['optimizer'] == 'adam' else params['optimizer']
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True, verbose=1)

        # Train model
        print('\nTraining in Process . . .')
        history = model.fit(X_train, y_train, epochs=100, batch_size=params['batch_size'], validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=1)
        print('Training Done')

        # Evaluate model on test data
        _, accuracy = model.evaluate(X_test, y_test)
        print(f'Accuracy on test data: {accuracy}')

        print('\nAccuracy and Loss Plot')
        plot_graphs(history)

        # Predict on the test set; flatten the (n, 1) sigmoid output to 1-D so
        # it has the same shape as y_test
        y_pred_prob = model.predict(X_test)
        y_pred = (y_pred_prob > 0.5).astype(int).ravel()
        print('\nClassification Report:')
        print(classification_report(y_test, y_pred, digits=3))

        # Metrics of the class encoded as 1
        report = classification_report(y_test, y_pred, output_dict=True)
        precision = report['1']['precision']
        recall = report['1']['recall']
        f1 = report['1']['f1-score']

        # Create and plot the confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        print('\nConfusion Matrix:')
        plot_confusion_matrix(conf_matrix)

        # Update results_dict
        results_dict['resampling_technique'].append(resampling_name)
        results_dict['hyperparameters'].append(params)
        results_dict['accuracy'].append(accuracy)
        results_dict['precision'].append(precision)
        results_dict['recall'].append(recall)
        results_dict['f1-score'].append(f1)

        # Check if current result is the best for F1-score
        # NOTE(review): the winner is picked by test-set F1; selecting on the
        # validation set would keep the test score unbiased - confirm intent.
        if f1 > best_f1score:
            best_f1score = f1
            best_result = {'resampling_technique': resampling_name, 'hyperparameters': params, 'f1-score': best_f1score}

# Report the configuration with the highest F1-score
print("\nBest Result:")
print(
    f"Resampling Technique: {best_result['resampling_technique']}",
    f"Hyperparameters: {best_result['hyperparameters']}",
    f"F1-score: {best_result['f1-score']}",
    sep="\n",
)

# Tabulate every run and save the table to disk
results_df = pd.DataFrame(results_dict)
print("\nCompiled Results Table:")
print(results_df)
results_df.to_csv('results1.csv')


# Hyperparameter Tuning

In [None]:
# Best score seen so far and the configuration that produced it
best_f1score = 0
best_result = dict.fromkeys(('resampling_technique', 'hyperparameters', 'f1-score'))

# Only one oversampler is used in this cell
resampling_techniques = dict(
    RandomOverSampler=RandomOverSampler(random_state=42),
)

# One list per result column; one entry is appended per trained model
result_columns = ('resampling_technique', 'hyperparameters', 'accuracy', 'precision', 'recall', 'f1-score')
results_dict = {column: [] for column in result_columns}

# Hyperparameter search space for the BiLSTM model
param_grid = dict(
    dropout_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    batch_size=[16, 32, 64, 128],
    optimizer=['adam'],
    bilstm_neuron=[32, 64, 128],
    neuron=[32, 64, 128],
    lr=[0.01, 0.001, 0.0001],
)

def plot_graphs(history):
    """Plot training vs. validation accuracy and loss for one Keras fit history."""
    plt.figure(figsize=(16, 5))
    panels = (('accuracy', "Accuracy"), ('loss', "Loss"))
    for position, (metric, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric])
        plt.plot(history.history[f'val_{metric}'])
        plt.xlabel("Epochs")
        plt.ylabel(label)
        plt.legend([metric, f'val_{metric}'])
        plt.title(label)
    plt.show()


def plot_confusion_matrix(conf_matrix):
    """Show a 2x2 confusion matrix as an annotated heat map.

    NOTE(review): the tick labels assume encoded class 0 is 'Not Sarcasm' and
    class 1 is 'Sarcasm' - confirm against the label encoder.
    """
    plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    classes = ['Not Sarcasm', 'Sarcasm']
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    # Add the count to each cell. Choose the text colour from the cell's own
    # value (dark cells get white text) rather than from its position, so the
    # numbers stay readable even when the diagonal is not the darkest part.
    threshold = conf_matrix.max() / 2
    for i in range(len(classes)):
        for j in range(len(classes)):
            text_color = 'white' if conf_matrix[i, j] > threshold else 'black'
            plt.text(j, i, str(conf_matrix[i, j]), horizontalalignment='center', verticalalignment='center', color=text_color)
    plt.show()


# Hold out the validation and test data BEFORE oversampling. Resampling the
# whole dataset first lets copies of the same minority samples land on both
# sides of the split, which leaks training information into evaluation and
# inflates the reported scores.
# Data splitting 80:10:10, stratified on the original labels.
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Loop through resampling techniques
for resampling_name, resampling_technique in resampling_techniques.items():

    # Oversample the training split only; validation and test keep the real
    # class distribution.
    X_train, y_train = resampling_technique.fit_resample(X_train_raw, y_train_raw)

    # Sample Counter
    print('\nSample Counter with', resampling_name, ":")
    counter1 = Counter(y_train_raw)
    print('Before Resampling\n', counter1)
    counter2 = Counter(y_train)
    print('After Resampling\n', counter2)

    # Plot the class distribution before and after resampling
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.bar(counter1.keys(), counter1.values(), color='blue')
    plt.title('Class Distribution Before Resampling')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.subplot(1, 2, 2)
    plt.bar(counter2.keys(), counter2.values(), color='green')
    plt.title('Class Distribution After Resampling')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Loop through hyperparameters
    for params in ParameterGrid(param_grid):
        print(f"\nResampling Technique: {resampling_name}, Hyperparameters: {params}")

        # Release the graph/state of the previous model so memory does not
        # grow with every model built during the grid search
        tf.keras.backend.clear_session()

        # Build BiLSTM model
        model = Sequential()
        model.add(Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], trainable=False))
        model.add(Bidirectional(LSTM(params['bilstm_neuron'])))
        model.add(Dropout(params['dropout_rate']))
        model.add(Dense(params['neuron'], activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model with the specified optimizer and learning rate
        if params['optimizer'] == 'adam':
            optimizer = Adam(learning_rate=params['lr'])
        else:
            optimizer = params['optimizer']

        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True, verbose=1)

        # Train model
        print('\nTraining in Process . . .')
        history = model.fit(X_train, y_train, epochs=100, batch_size=params['batch_size'], validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=1)
        print('Training Done')

        # Evaluate model on test data
        loss, accuracy = model.evaluate(X_test, y_test)
        print(f'Loss on test data: {loss}')
        print(f'Accuracy on test data: {accuracy}')

        print('\nAccuracy and Loss Plot')
        plot_graphs(history)

        # Predict on the test set
        y_pred_prob = model.predict(X_test)
        # Threshold to get predicted classes (0 or 1); flatten the (n, 1)
        # sigmoid output to 1-D so it has the same shape as y_test
        y_pred = (y_pred_prob > 0.5).astype(int).ravel()
        print('\nClassification Report:')
        print(classification_report(y_test, y_pred, digits=4))

        # Metrics of the class encoded as 1
        report = classification_report(y_test, y_pred, output_dict=True)
        precision = report['1']['precision']
        recall = report['1']['recall']
        f1 = report['1']['f1-score']

        # Create and plot the confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        print('\nConfusion Matrix:')
        print(conf_matrix)
        plot_confusion_matrix(conf_matrix)

        # Update results_dict
        results_dict['resampling_technique'].append(resampling_name)
        results_dict['hyperparameters'].append(params)
        results_dict['accuracy'].append(accuracy)
        results_dict['precision'].append(precision)
        results_dict['recall'].append(recall)
        results_dict['f1-score'].append(f1)

        # Check if current result is the best for F1-score
        # NOTE(review): the winner is picked by test-set F1; selecting on the
        # validation set would keep the test score unbiased - confirm intent.
        if f1 > best_f1score:
            best_f1score = f1
            best_result = {'resampling_technique': resampling_name, 'hyperparameters': params, 'f1-score': best_f1score}

# Report the configuration with the highest F1-score
print("\nBest Result:")
print(
    f"Resampling Technique: {best_result['resampling_technique']}",
    f"Hyperparameters: {best_result['hyperparameters']}",
    f"F1-score: {best_result['f1-score']}",
    sep="\n",
)

# Tabulate every run and save the table to disk
results_df = pd.DataFrame(results_dict)
print("\nCompiled Results Table:")
print(results_df)
results_df.to_csv('results2.csv')
