<a href="https://colab.research.google.com/github/sinhajiya/DSE318-NLP-Assignment-Solutions/blob/main/Assignment2/22161_jiyasinha_nlpassignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORTING LIBRARIES

In [None]:
import os
import numpy as np
import pandas as pd
from random import randint, sample, seed, choice
import tensorflow as tf
import re
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight


In [None]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('GPU device not found')
else:
  print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


## LOADING DATASET

In [None]:
!git clone https://github.com/islnlp/Assignment_1_2025

fatal: destination path 'Assignment_1_2025' already exists and is not an empty directory.


In [None]:
!git clone https://github.com/sinhajiya/DSE318-NLP-Assignment-Solutions

Cloning into 'DSE318-NLP-Assignment-Solutions'...
remote: Enumerating objects: 127, done.[K
remote: Total 127 (delta 0), reused 0 (delta 0), pack-reused 127 (from 2)[K
Receiving objects: 100% (127/127), 149.64 MiB | 13.55 MiB/s, done.
Resolving deltas: 100% (25/25), done.


In [None]:
def pull():
  %cd DSE318-NLP-Assignment-Solutions/
  !git pull origin main
  %cd ..

In [None]:
def load_data(name):
  root_fp = f"/content/Assignment_1_2025/{name}"
  train = pd.read_csv(os.path.join(root_fp, "train.csv"))
  val = pd.read_csv(os.path.join(root_fp, "val.csv"))
  train = train.dropna(subset=['Sentence'])
  val = val.dropna(subset=['Sentence'])
  return train, val

In [None]:
def preprocess_text(Sentence):

  # Preprocessing steps:
  # 1. All lower case characters
  # 2. URL removal
  # 3. Multiple dots to single dot
  # 4. Extra spaces to single space
  # 5. Removes non-alphabetic chars

    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    Sentence = Sentence.lower()
    Sentence = re.sub(url_pattern, "", Sentence)
    Sentence = re.sub(r"\.{2,}", ".", Sentence)
    Sentence = re.sub(r"\s+", " ", Sentence).strip()
    Sentence = re.sub(r"[^a-zA-Z\s]", "", Sentence)
    return Sentence

In [None]:
def load_and_preprocess_data(name):

  train, val = load_data(name)
  train["Sentence_preprocessed"] = train["Sentence"].astype(str).apply(preprocess_text)
  val["Sentence_preprocessed"] = val["Sentence"].astype(str).apply(preprocess_text)

  return train, val

In [None]:
def form_vocab(data,isdataframe=True):

  vocab_size = 0
  vocab = set()
  word2index = dict()  # Gives mapping from index to word
  index2word = dict()  # Gives mapping of word to index

  if isdataframe:
    data = data["Sentence_preprocessed"]

  for sentence in data:
    for word in sentence.split():
      if word not in word2index:
        word2index[word] = vocab_size
        index2word[vocab_size] = word
        vocab.add(word)
        vocab_size += 1
  print(f"Vocabulary of {vocab_size} created")
  return vocab, vocab_size, word2index, index2word


## WORD2VEC

In [None]:
def skip_gram(data, window_size,k=5, isdataframe=True):

  seed(42)
  positive_samples = dict()
  negative_samples = dict()
  target_words = set()

  if isdataframe:
    data=data['Sentence_preprocessed']

  vocab, _, _ , _ = form_vocab(data, isdataframe=False)
  # q=1
  for sentence in data:
    words = sentence.split()
    num_words = len(words)
    # print(f"on sentece {q}")
    # q+=1
    for i in range(0,num_words):
        target_words.add(words[i])
        if words[i] not in positive_samples:
          positive_samples[words[i]] = set()
        if words[i] not in negative_samples:
          negative_samples[words[i]] = set()

        for j in range(1,window_size+1):
          if ((i-j)>=0):
            positive_samples[words[i]].add(words[i-j])
          if((i+j)<num_words):
            positive_samples[words[i]].add(words[i+j])
        # print(f"Data type of vocab is {type(vocab)}, words[[i]] is {type(words[i])}, positive_words is {type(positive_samples[words[i]])}")
        negative_words =  vocab - set(words[i]) - positive_samples[words[i]]
        negative_samples[words[i]] = set(sample(list(negative_words), k))
  positive_samples = {target: list(context) for target, context in positive_samples.items()}
  negative_samples = {target: list(context) for target, context in negative_samples.items()}

  return positive_samples, negative_samples, list(target_words)

In [None]:
def convert_to_index(positive_samples, negative_samples,target_words, word2index):

  target_words_index = list()
  positive_samples_index = dict()
  negative_samples_index = dict()

  for k in (target_words):
    w_idx = word2index[k]
    target_words_index.append(w_idx)

    positive_samples_index[w_idx] = [word2index[i] for i in positive_samples[k]]
    negative_samples_index[w_idx] = [word2index[i] for i in negative_samples[k]]

  return target_words_index, positive_samples_index, negative_samples_index

In [None]:
def create_target_and_context_index(data, window_size,word2index, index2word, k=5, isdataframe=True):
  print("The data is of shape:\t", data.shape)
  positive_samples, negative_samples, target_words = skip_gram(data, window_size,k=5, isdataframe=True)
  print("Created the target words and positive and negative context words pair.\n")
  w = choice(target_words)
  idx = word2index[w]
  print("Printing an example...")
  print(f"For word at index {idx}:\t {w}")
  print("It's positive samples are:\t",positive_samples[w])
  print("It's negative samples are:\t",negative_samples[w])
  target_words_index, positive_samples_index, negative_samples_index = convert_to_index(positive_samples, negative_samples, target_words, word2index)
  print(f"\nCreated the target words and context words pair using the index for training.\n ")
  print(index2word[idx], "index:\t", idx)
  print("Postive samples are:\t",positive_samples_index[idx])
  print("Negative samples are:\t",negative_samples_index[idx])

  return target_words_index, positive_samples_index, negative_samples_index

In [None]:
def form_data(target_words_index, positive_samples_index, negative_samples_index, index2word):
  X_train = []
  y_train = []

  for target in target_words_index:

    if target not in positive_samples_index or len(positive_samples_index[target]) == 0:
      print(f"No positive samples for target index {target}: {index2word[target]}")
      continue

    if target not in negative_samples_index or len(negative_samples_index[target]) == 0:
      print(f"No negative samples for target index {target}: {index2word[target]}")
      continue
    target_positive_pairs = np.array([[target,pos] for pos in positive_samples_index[target]],dtype=np.int32)
    X_train.append(target_positive_pairs)
    y_train.append(np.ones(len(target_positive_pairs),dtype=np.int32))

    target_negative_pairs = [[target,neg] for neg in negative_samples_index[target]]
    X_train.append(target_negative_pairs)
    y_train.append(np.zeros(len(target_negative_pairs),dtype=np.int32))

  return np.vstack(X_train), np.concatenate(y_train)

In [None]:
def create_training_data(target_words_index, positive_samples_index, negative_samples_index, index2word):
  X_train,y_train = form_data(target_words_index, positive_samples_index, negative_samples_index, index2word)
  print(f"The total number of target words are:\t{len(target_words_index)}.\nThe total number of positive samples are:\t{len(positive_samples_index)}.\nThe total number of negative samples are:\t{len(negative_samples_index)}")
  print("\nCreated data for training.")
  print("Shape of training data is", X_train.shape)
  print("Printing the head of the training data..\n")
  print(pd.DataFrame(X_train).head)
  return X_train, y_train

In [None]:
def create_word2vec(X_train, y_train, vocab_size, name, index2word, word2index, epochs=10, batch_size=32):

    target_words = X_train[:, 0]
    context_words = X_train[:, 1]

    # Input: (target words,context words)
    target_ip = tf.keras.Input(shape=(),dtype=np.int32,name='target')
    context_ip = tf.keras.Input(shape=(),dtype=np.int32,name='context')

    target_emb_layers = tf.keras.layers.Embedding(vocab_size, 100, name='target_embed')  # Initializes a random embedding for target words.
    context_emb_layers = tf.keras.layers.Embedding(vocab_size, 100, name='context_embed')  # Initializes a random embedding for context words.

    target_emb = target_emb_layers(target_ip)
    context_emb = context_emb_layers(context_ip)

    # Dot product between tsrget and context embedding for finding the similiarity
    dot_product = tf.keras.layers.Lambda(lambda x: tf.reduce_sum(x[0] * x[1], axis=1))([target_emb, context_emb])
    output = tf.keras.layers.Activation("sigmoid")(dot_product)

    model = tf.keras.Model(inputs=[target_ip, context_ip], outputs = output)
    optimizer = tf.keras.optimizers.Adam()

    loss_function = tf.keras.losses.BinaryCrossentropy(from_logits=False) # binary classification
    model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])

    tf.keras.utils.set_random_seed(42)
    model.fit([target_words, context_words], y_train, epochs=epochs, batch_size=batch_size)
    print("Model training complete")

    target_embeddings = model.get_layer("target_embed").get_weights()[0]
    print("The shape of the embedding for target words created is:", target_embeddings.shape)

    context_embeddings = model.get_layer("context_embed").get_weights()[0]
    print("The shape of the embedding for context words created is:", context_embeddings.shape)

    final_embeddings = (target_embeddings + context_embeddings) / 2
    print("Shape of final merged embeddings:", final_embeddings.shape)

    np.save(f"{name}_embeddings.npy", final_embeddings)

    return model, final_embeddings

In [None]:
def load_embeddings(name):
   file_path = f"/content/DSE318-NLP-Assignment-Solutions/Assignment2/embeddings/{name}_embeddings.npy"
   return np.load(file_path, allow_pickle=True).astype(np.float32)

## TRAIN FFNN

In [None]:
def prepare_data_for_FFNN(train, val,word2index):

  X_train = train['Sentence_preprocessed']
  y_train = train['Tag']
  X_val = val['Sentence_preprocessed']
  y_val = val['Tag']
  X_train = [[word2index[word] for word in sentence.split() if word in word2index] for sentence in X_train]
  X_val = [[word2index[word] for word in sentence.split() if word in word2index] for sentence in X_val]
  padlen = max(max(map(len, X_train)), max(map(len, X_val)))
  X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=padlen, padding='post')
  X_val = tf.keras.preprocessing.sequence.pad_sequences(X_val, maxlen=padlen, padding='post')

  return X_train, np.array(y_train), X_val, np.array(y_val)




In [None]:
def ffnn(name,vocab_size,X_train, y_train, epochs = 10, batch_size = 32):

  print("Loading the embeddings..\n")
  embeddings = load_embeddings(name)
  embedding_dim = embeddings.shape[1]

  print("Training the model...\n")
  model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embedding_dim, mask_zero=True,weights=[embeddings], trainable=True),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

  optimizer = tf.keras.optimizers.Adam()
  model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

  class_weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
  class_weight_dict = {i: w for i, w in enumerate(class_weights)}


  model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, class_weight=class_weight_dict)

  print("Model training complete!")

  model_filename = f"{name}_model.keras"
  model.save(model_filename)
  print(f"Model saved as {model_filename}")

  return model

## EVALUATE FFNN

In [None]:
def load_model(name):
   model = tf.keras.models.load_model(f"/content/DSE318-NLP-Assignment-Solutions/Assignment2/models/{name}_model.keras")
   print(f"Model {model} loaded successfully!")
   return model

In [None]:
def evaluate(name, word2index, X, y):
  target_names = [name, f"non_{name}"]
  model = load_model(name)
  y_pred_proba = model.predict(X)
  y_pred = (y_pred_proba > 0.4).astype(int)
  print(classification_report(y, y_pred, target_names=target_names))

## HATE DATASET

In [None]:
hate_train, hate_val = load_and_preprocess_data('hate')

In [None]:
hate_vocab, hate_vocab_size, hate_word2index, hate_index2word = form_vocab(hate_train,isdataframe=True)

Vocabulary of 12934 created


In [None]:
hate_target_words_index, hate_positive_samples_index, hate_negative_samples_index = create_target_and_context_index(data= hate_train, word2index=hate_word2index, index2word=hate_index2word,window_size = 4 , k=10, isdataframe = True)

The data is of shape:	 (3660, 3)
Vocabulary of 12934 created
Created the target words and positive and negative context words pair.

Printing an example...
For word at index 2839:	 maarliek
It's positive samples are:	 ['ne', 'k', 'ki', 'mohenjodaro', 'sath', 'bhi', 'hritik', 'hadonone']
It's negative samples are:	 ['agar', 'lost', 'ajeebogareeb', 'ba', 'bana']

Created the target words and context words pair using the index for training.
 
maarliek index:	 2839
Postive samples are:	 [99, 307, 41, 2840, 302, 201, 2838, 2837]
Negative samples are:	 [486, 10076, 10361, 1927, 572]


In [None]:
X_hate_train, y_hate_train = create_training_data(hate_target_words_index, hate_positive_samples_index, hate_negative_samples_index, hate_index2word)

No positive samples for target index 6804: khudaneapnebandokoamankapaigamlekarbhejahaiyarapekalisensdekar
The total number of target words are:	12934.
The total number of positive samples are:	12934.
The total number of negative samples are:	12934

Created data for training.
Shape of training data is (379572, 2)
Printing the head of the training data..

<bound method NDFrame.head of             0      1
0        1248   1250
1        1248   1247
2        1248   1249
3        1248     23
4        1248   1245
...       ...    ...
379567  10582   7327
379568  10582   6453
379569  10582   3002
379570  10582  12222
379571  10582   1213

[379572 rows x 2 columns]>


In [None]:
# # time - 4m
# hate_word2vec_model, hate_embeddings = create_word2vec(X_hate_train, y_hate_train, hate_vocab_size, 'hate', hate_index2word, hate_word2index, epochs=30, batch_size=128)

Epoch 1/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.0085 - loss: 0.5909
Epoch 2/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.0319 - loss: 0.2712
Epoch 3/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.1013 - loss: 0.2277
Epoch 4/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.1342 - loss: 0.1719
Epoch 5/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.1546 - loss: 0.1104
Epoch 6/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.1654 - loss: 0.0606
Epoch 7/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.1869 - loss: 0.0294
Epoch 8/30
[1m2966/2966[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.2165 - loss: 0.0131
Epoch 9/30
[1m2966/2

In [None]:
X_train_hate, y_train_hate, X_val_hate, y_val_hate = prepare_data_for_FFNN(hate_train, hate_val, hate_word2index)


In [None]:
# hate_ffnn_model = ffnn('hate', hate_vocab_size, X_train_hate, y_train_hate,epochs=30)

Loading the embeddings..

Training the model...

Epoch 1/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5576 - loss: 0.6850
Epoch 2/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6274 - loss: 0.6493
Epoch 3/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6901 - loss: 0.6121
Epoch 4/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7393 - loss: 0.5563
Epoch 5/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8030 - loss: 0.4785
Epoch 6/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8564 - loss: 0.3899
Epoch 7/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9042 - loss: 0.3022
Epoch 8/30
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9375 - loss

In [None]:
evaluate('hate', hate_word2index,X_val_hate, y_val_hate)

Model <Sequential name=sequential, built=True> loaded successfully!
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
              precision    recall  f1-score   support

        hate       0.75      0.70      0.72       309
    non_hate       0.45      0.53      0.49       148

    accuracy                           0.64       457
   macro avg       0.60      0.61      0.61       457
weighted avg       0.66      0.64      0.65       457



## SARCASM

In [None]:
sarcasm_train, sarcasm_val = load_and_preprocess_data('sarcasm')

In [None]:
sarcasm_vocab, sarcasm_vocab_size, sarcasm_word2index, sarcasm_index2word = form_vocab(sarcasm_train,isdataframe=True)


Vocabulary of 14559 created


In [None]:
# sarcasm_target_words_index, sarcasm_positive_samples_index, sarcasm_negative_samples_index = create_target_and_context_index(data= sarcasm_train, word2index=sarcasm_word2index, index2word=sarcasm_index2word,window_size = 4 , k=10, isdataframe = True)


The data is of shape:	 (4200, 3)
Vocabulary of 14559 created
Created the target words and positive and negative context words pair.

Printing an example...
For word at index 6766:	 werna
It's positive samples are:	 ['chahiyay', 'honi', 'ko', 'pehun', 'lainn', 'choordiyann', 'un']
It's negative samples are:	 ['tajziyae', 'khelane', 'respect', 'tarekfatah', 'balley']

Created the target words and context words pair using the index for training.
 
werna index:	 6766
Postive samples are:	 [6765, 1340, 55, 6768, 6769, 6767, 2524]
Negative samples are:	 [11822, 7900, 1890, 11354, 10678]


In [None]:
# X_sarcasm_train, y_sarcasm_train = create_training_data(sarcasm_target_words_index, sarcasm_positive_samples_index, sarcasm_negative_samples_index, sarcasm_index2word)


No positive samples for target index 9746: mosadnebdlaliyalakhlakhshukrahaiisrailecricketnahikheltevarnasirfseriesraddhotiaurmamlathandapadjata
The total number of target words are:	14559.
The total number of positive samples are:	14559.
The total number of negative samples are:	14559

Created data for training.
Shape of training data is (411764, 2)
Printing the head of the training data..

<bound method NDFrame.head of            0      1
0       9393    196
1       9393   1919
2       9393   1300
3       9393     56
4       9393    197
...      ...    ...
411759  6142   8928
411760  6142  11446
411761  6142   7078
411762  6142    645
411763  6142  11852

[411764 rows x 2 columns]>


In [None]:
# sarcasm_word2vec_model, sarcasm_embeddings = create_word2vec(X_sarcasm_train, y_sarcasm_train, sarcasm_vocab_size, 'sarcasm', sarcasm_index2word, sarcasm_word2index, epochs=30, batch_size=128)


Epoch 1/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.0079 - loss: 0.5912
Epoch 2/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.0233 - loss: 0.2800
Epoch 3/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.0825 - loss: 0.2343
Epoch 4/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - accuracy: 0.1199 - loss: 0.1741
Epoch 5/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 0.1431 - loss: 0.1089
Epoch 6/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.1549 - loss: 0.0576
Epoch 7/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.1778 - loss: 0.0268
Epoch 8/30
[1m3217/3217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.2069 - loss: 0.0113
Epoch 9/30
[1m32

In [None]:
X_train_sarcasm, y_train_sarcasm, X_val_sarcasm, y_val_sarcasm = prepare_data_for_FFNN(sarcasm_train, sarcasm_val, sarcasm_word2index)


In [None]:
# sarcasm_ffnn_model = ffnn('sarcasm', sarcasm_vocab_size, X_train_sarcasm, y_train_sarcasm,epochs=30)


Loading the embeddings..

Training the model...

Epoch 1/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.7676 - loss: 0.6508
Epoch 2/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9112 - loss: 0.3561
Epoch 3/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9456 - loss: 0.1864
Epoch 4/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9533 - loss: 0.1237
Epoch 5/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9639 - loss: 0.0893
Epoch 6/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9695 - loss: 0.0667
Epoch 7/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9767 - loss: 0.0486
Epoch 8/30
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9852 - los

In [None]:
evaluate('sarcasm', sarcasm_word2index,X_val_sarcasm, y_val_sarcasm)


Model <Sequential name=sequential_1, built=True> loaded successfully!
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step
              precision    recall  f1-score   support

     sarcasm       0.97      0.98      0.98       474
 non_sarcasm       0.79      0.75      0.77        51

    accuracy                           0.96       525
   macro avg       0.88      0.86      0.87       525
weighted avg       0.96      0.96      0.96       525



## Humor

In [None]:
humor_train, humor_val = load_and_preprocess_data('humor')

In [None]:
humor_vocab, humor_vocab_size, humor_word2index, humor_index2word = form_vocab(humor_train,isdataframe=True)


Vocabulary of 7179 created


In [None]:
# humor_target_words_index, humor_positive_samples_index, humor_negative_samples_index = create_target_and_context_index(data= humor_train, word2index=humor_word2index, index2word=humor_index2word,window_size = 4 , k=10, isdataframe = True)

The data is of shape:	 (2360, 3)
Vocabulary of 7179 created
Created the target words and positive and negative context words pair.

Printing an example...
For word at index 5317:	 place
It's positive samples are:	 ['to', 'special', 'in', 'hell', 'best', 'for', 'salman', 'people', 'bhai']
It's negative samples are:	 ['melted', 'ji', 'dekha', 'phorengay', 'bahu']

Created the target words and context words pair using the index for training.
 
place index:	 5317
Postive samples are:	 [30, 3354, 315, 6961, 974, 118, 249, 314, 54]
Negative samples are:	 [4468, 297, 1100, 5710, 149]


In [None]:
# X_humor_train, y_humor_train = create_training_data(humor_target_words_index, humor_positive_samples_index, humor_negative_samples_index, humor_index2word)


The total number of target words are:	7179.
The total number of positive samples are:	7179.
The total number of negative samples are:	7179

Created data for training.
Shape of training data is (178858, 2)
Printing the head of the training data..

<bound method NDFrame.head of            0     1
0       3179  3180
1       3179    23
2       3179    69
3       3179  3181
4       3179    63
...      ...   ...
178853  6736  1540
178854  6736  4342
178855  6736  5337
178856  6736  6242
178857  6736  5248

[178858 rows x 2 columns]>


In [None]:
# humor_word2vec_model, humor_embeddings = create_word2vec(X_humor_train, y_humor_train, humor_vocab_size, 'humor', humor_index2word, humor_word2index, epochs=30, batch_size=128)


Epoch 1/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0031 - loss: 0.6601
Epoch 2/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.0059 - loss: 0.3497
Epoch 3/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.0165 - loss: 0.2903
Epoch 4/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.0351 - loss: 0.2410
Epoch 5/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0618 - loss: 0.1822
Epoch 6/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0782 - loss: 0.1240
Epoch 7/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0885 - loss: 0.0768
Epoch 8/30
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.1061 - loss: 0.0443
Epoch 9/30
[1m1398/1398

In [None]:
X_train_humor, y_train_humor, X_val_humor, y_val_humor = prepare_data_for_FFNN(humor_train, humor_val, humor_word2index)


In [None]:
# humor_ffnn_model = ffnn('humor', humor_vocab_size, X_train_humor, y_train_humor,epochs=30)


Loading the embeddings..

Training the model...

Epoch 1/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.5002 - loss: 0.7059
Epoch 2/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5432 - loss: 0.6784
Epoch 3/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6056 - loss: 0.6553
Epoch 4/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6652 - loss: 0.6230
Epoch 5/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7271 - loss: 0.5747
Epoch 6/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7874 - loss: 0.5113
Epoch 7/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8328 - loss: 0.4325
Epoch 8/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8730 - loss: 0.3441
Epoch 

In [None]:
evaluate('humor', humor_word2index,X_val_humor, y_val_humor)


Model <Sequential name=sequential_2, built=True> loaded successfully!
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step
              precision    recall  f1-score   support

       humor       0.59      0.45      0.51       119
   non_humor       0.68      0.78      0.73       176

    accuracy                           0.65       295
   macro avg       0.63      0.62      0.62       295
weighted avg       0.64      0.65      0.64       295

