# Importing Required Libraries

In [None]:
from bs4 import BeautifulSoup
import re,string,unicodedata
from string import punctuation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from keras.preprocessing import text, sequence


import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,GRU
import tensorflow as tf


from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split

# Loading dataset

In [None]:
# Location of the headlines dump; it is read with lines=True, i.e. one JSON
# object per line.
DATASET_PATH = "../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json"
df = pd.read_json(DATASET_PATH, lines=True)
df.head()

Dropping link of article

In [None]:
# Remove the article URL column; nothing later in this notebook reads it.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
df = df.drop(columns='article_link', errors='ignore')
df.head()

**Checking for nulls**

In [None]:
# Count of missing values per column.
df.isnull().sum()

**Plot to check class imbalance**

In [None]:
# One bar per label value of `is_sarcastic`, to eyeball class balance.
sns.countplot(x="is_sarcastic", data=df)

# Text Preprocessing

Step by step -

1. `stopwords.words('english')`: This line of code imports a list of English stopwords from the NLTK (Natural Language Toolkit) library. Stopwords are common words like "the," "and," "is," "in," etc., that are often removed from text data because they don't carry significant meaning in many NLP tasks.

2. `set(stopwords.words('english'))`: The stopwords are loaded into a Python set data structure. Using a set is efficient for checking whether a word is a stopword or not because it allows for fast membership tests.

3. `string.punctuation`: This line imports a string containing all the common punctuation marks in English, such as ".", ",", "!", "?", etc.

4. `list(string.punctuation)`: The punctuation marks are converted into a list. 

5. `stop.update(punctuation)`: Finally, the punctuation marks are added to the set of stopwords. This step ensures that both stopwords and punctuation marks are combined into a single set for later use in text preprocessing.

After executing this code, the stop set will contain both English stopwords and punctuation marks.

In [None]:
# Tokens discarded during cleaning: NLTK's English stopwords together with
# every character of string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')) | set(punctuation)

Function that takes input containing HTML, uses BeautifulSoup to remove the HTML tags, and returns the plain text.

In [None]:
def strip_html(text):
    """Return the plain-text content of ``text``.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

Removal of the square brackets

In [None]:
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from ``text``.

    A span starts at ``[`` and ends at the first following ``]``; an opening
    bracket that is never closed is left untouched.

    Args:
        text: input string.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: in a plain literal '\[' is an invalid escape sequence,
    # which newer Python versions report with a warning.
    return re.sub(r'\[[^]]*\]', '', text)

Removal of the URLs

In [None]:
def remove_urls(text):
    """Delete every ``http...`` run from ``text``.

    A run is the literal ``http`` followed by one or more non-whitespace
    characters, so both ``http://`` and ``https://`` links are removed up to
    the next whitespace.

    Named ``remove_urls`` so that it does not rebind
    ``remove_between_square_brackets``, which is the bracket-stripping helper.

    Args:
        text: input string.

    Returns:
        ``text`` with all matching runs removed.
    """
    return re.sub(r'http\S+', '', text)

Removal of the stopwords from text

In [None]:
def remove_stopwords(text):
    """Return ``text`` without the tokens listed in the global ``stop`` set.

    The string is split on whitespace; a token is kept when its lower-cased
    form is not in ``stop``. Kept tokens are re-joined with single spaces and
    keep their original casing.
    """
    kept = (
        token.strip()
        for token in text.split()
        if token.strip().lower() not in stop
    )
    return " ".join(kept)

Cleaning the data

In [None]:
def cleanse_text(text):
    """Run the full cleaning pipeline on one headline string.

    Steps, in order:
      1. strip HTML markup (``strip_html``),
      2. delete ``[...]`` spans,
      3. delete ``http...`` URL runs,
      4. drop stopword and punctuation tokens (``remove_stopwords``).

    Args:
        text: raw headline.

    Returns:
        The cleaned headline as a single space-joined string.
    """
    text = strip_html(text)
    # The bracket and URL patterns are applied here directly so that both
    # steps always run, whichever helper definitions are currently bound in
    # the kernel.
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r'http\S+', '', text)
    return remove_stopwords(text)

**Apply the cleaning function to the `headline` column**

In [None]:
%time
df['headline']=df['headline'].apply(cleanse_text)

# EDA

**WordCloud for Text that is Not Sarcastic**

In [None]:
# Concatenate every headline labelled 0 and build the word cloud from it.
not_sarcastic_text = " ".join(df.loc[df.is_sarcastic == 0, "headline"])
wc = WordCloud(width=1600, height=800, max_words=2000).generate(not_sarcastic_text)

plt.figure(figsize=(12, 10))
plt.title("WordCloud for Not Sarcastic")
plt.imshow(wc, interpolation='bilinear')

**WordCloud for Text that is Sarcastic**

In [None]:
# Concatenate every headline labelled 1 and build the word cloud from it.
sarcastic_text = " ".join(df.loc[df.is_sarcastic == 1, "headline"])
wc = WordCloud(width=1600, height=800, max_words=2000).generate(sarcastic_text)

plt.figure(figsize=(12, 10))
plt.title("WordCloud for Sarcastic")
plt.imshow(wc, interpolation='bilinear')

**Comparing characters in Text classes**

In [None]:
# Histogram of headline length in characters, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, label, colour, title in (
    (ax1, 1, 'red', 'Sarcastic text'),
    (ax2, 0, 'green', 'Not Sarcastic text'),
):
    text_len = df.loc[df['is_sarcastic'] == label, 'headline'].str.len()
    axis.hist(text_len, color=colour)
    axis.set_title(title)
fig.suptitle('Characters in text classes')
plt.show()

**Comparing Words in Text classes**

In [None]:
# Histogram of whitespace-separated tokens per headline, one panel per class.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: sarcastic headlines.
text_len = df.loc[df['is_sarcastic'] == 1, 'headline'].str.split().map(len)
ax1.hist(text_len, color='red')
ax1.set_title('Sarcastic text')

# Right panel: non-sarcastic headlines.
text_len = df.loc[df['is_sarcastic'] == 0, 'headline'].str.split().map(len)
ax2.hist(text_len, color='green')
ax2.set_title('Not Sarcastic text')

fig.suptitle('Words in text classes')
plt.show()

**Comparing Avg Word Length in Each Text Class**

In [None]:
# Distribution of the mean word length per headline, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
for axis, label, colour, title in (
    (ax1, 1, 'red', 'Sarcastic text'),
    (ax2, 0, 'green', 'Not Sarcastic text'),
):
    # Per headline: list holding the length of each token.
    word = df.loc[df['is_sarcastic'] == label, 'headline'].str.split().apply(
        lambda tokens: [len(tok) for tok in tokens]
    )
    # A headline with no tokens left after cleaning has nothing to average;
    # np.mean([]) would produce NaN and a RuntimeWarning, so skip those rows.
    avg_word_len = word[word.map(len) > 0].map(np.mean)
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot
    # (histogram plus KDE on a density scale).
    sns.histplot(avg_word_len, kde=True, stat="density", ax=axis, color=colour)
    axis.set_title(title)
fig.suptitle('Average word length in each text class')
plt.show()

---

# Introduction to Word2Vec

Reformatting the text into the format expected by the gensim library (a list of token lists)

In [None]:
# One token list per headline, split on whitespace.
words = [headline.split() for headline in df.headline.values]
words[:5]

## Deciding on Dimensions of Embedding Vectors

The dimensions of embedding vectors in NLP tasks are a **hyperparameter** that you need to decide before training your NN. The choice of embedding dimensions can have an impact on the performance of your NLP model, and there's no one-size-fits-all answer. Here are some factors and guidelines to consider when deciding the dimensions of embedding vectors:

1. **Size of Vocabulary**: The size of your vocabulary, which is the total number of unique words in your dataset, can influence the choice of embedding dimensions. If you have a relatively small vocabulary, you might choose smaller embedding dimensions (e.g., 50, 100). For larger vocabularies, you may opt for larger dimensions (e.g., 200, 300).

2. **Amount of Training Data**: The amount of training data you have also plays a role. With more training data, you might be able to use larger embedding dimensions because the model has more examples to learn meaningful representations. Conversely, with limited data, you may want to keep the dimensions smaller to prevent overfitting.

3. **Task Specificity**: The specific NLP task you are working on can influence the choice of embedding dimensions. For example:
    For tasks like sentiment analysis or text classification, embeddings in the range of 100-300 dimensions are common.
    For more complex tasks like machine translation or language modeling, larger embeddings (300+ dimensions) may be beneficial.

4. **Pretrained Embeddings**: If you plan to use pretrained word embeddings like Word2Vec, GloVe, or pre-trained embeddings from models like BERT, you should use the same dimensions as the pretrained embeddings to facilitate transfer learning.

5. **Model Architecture**: The choice of model architecture can also affect the embedding dimensions. Some architectures work better with certain embedding dimensions. For example, convolutional neural networks (CNNs) for text classification might benefit from smaller embeddings, while recurrent neural networks (RNNs) or transformers can handle larger embeddings effectively.

6. **Computational Resources**: Consider the computational resources available to you. Larger embedding dimensions require more memory and computational power. Ensure that your hardware can handle the chosen dimensions.

7. **Experimentation**: It's often a good practice to experiment with different embedding dimensions and evaluate their impact on your specific NLP task using validation data. You can perform hyperparameter tuning to find the best embedding dimension for your model's performance.

8. **Visualization**: In some cases, you may want to reduce the dimensions of embeddings (e.g., using techniques like t-SNE or PCA) for visualization and exploration, even if you use larger embeddings for model training.

In summary, there is no fixed rule for choosing the dimensions of embedding vectors, and it often depends on your specific NLP task, data, and available resources. It's a hyperparameter that should be tuned and experimented with to find the best configuration for your particular use case.

The Dimension of the vectors we are attempting to generate

In [None]:
import gensim
# Width of each word vector. Passed to Word2Vec as `vector_size` and reused
# for the weight matrix and for the Keras Embedding layer's `output_dim`.
EMBEDDING_DIM = 200

Creating Word Vectors using Word2Vec

In [None]:
%time
w2v_model = gensim.models.Word2Vec(sentences = words , vector_size=EMBEDDING_DIM , window = 5 , min_count = 1)

Size of vocabulary

In [None]:
# Number of distinct keys the Word2Vec model holds a vector for.
vocabulary = w2v_model.wv.index_to_key
len(vocabulary)

Each of the 38071 vocabulary words is now represented by a 200-dimensional vector (`EMBEDDING_DIM = 200`).

In [None]:
from keras.utils import pad_sequences

Step by Step -

1. `tokenizer = text.Tokenizer(num_words=35000)`: This line initializes a Tokenizer object with a maximum vocabulary size of 35,000 words. The num_words parameter specifies the maximum number of words to keep in the vocabulary, based on word frequency.

2. `tokenizer.fit_on_texts(words)`: This line fits the tokenizer on a list or array of text data called words. During this process, the Tokenizer object learns the vocabulary and assigns a unique integer index to each word in the text data. This is typically a preprocessing step before training a machine learning model.

3. `tokenized_train = tokenizer.texts_to_sequences(words)`: This line converts the text data in the words list into sequences of integers. Each word in the input text is replaced with its corresponding integer index from the tokenizer's vocabulary. The result is stored in the tokenized_train variable.

4. `x = pad_sequences(tokenized_train, maxlen=20)`: This line takes the tokenized sequences in tokenized_train and pads or truncates them to ensure that they all have a fixed length of 20. Padding is added to sequences that are shorter than 20 words, and sequences longer than 20 words are truncated. The resulting x variable contains the padded/truncated sequences


In [None]:
# Learn the word -> integer index mapping from the tokenised headlines.
tokenizer = text.Tokenizer(num_words=35000)
tokenizer.fit_on_texts(words)

# Integer-encode every headline, then pad/truncate each sequence to 20 ids.
tokenized_train = tokenizer.texts_to_sequences(words)
x = pad_sequences(sequences=tokenized_train, maxlen=20)

Adding 1 because of reserved 0 index

Embedding Layer creates one more vector for "UNKNOWN" words, or padded words (0s). This Vector is filled with zeros.

Thus our vocab size increases by 1

In [None]:
# One extra slot for index 0, which the tokenizer never assigns to a word.
vocab_size = 1 + len(tokenizer.word_index)

Creating a function to create weight matrix from word2vec gensim model

In [None]:
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix aligned with the tokenizer's indices.

    Args:
        model: trained Word2Vec model; vectors are read from ``model.wv``.
        vocab: mapping of word -> integer index (e.g. ``tokenizer.word_index``).

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocab) + 1, model.wv.vector_size)``.
        Row ``i`` holds the vector of the word mapped to index ``i``. Row 0
        and the rows of words that have no vector in ``model.wv`` stay zero.
    """
    vectors = model.wv
    # +1 because row 0 is reserved (no word in `vocab` maps to it here).
    weight_matrix = np.zeros((len(vocab) + 1, vectors.vector_size))
    for word, i in vocab.items():
        # The tokenizer and the Word2Vec model can disagree on a token (for
        # example after case normalisation); leave such rows as zeros instead
        # of failing with KeyError.
        if word in vectors:
            weight_matrix[i] = vectors[word]
    return weight_matrix

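Getting the embedding vectors from word2vec and using them as the initial weights of the Keras embedding layer (note that the layer below is created with `trainable=True`, so these weights are fine-tuned during training)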

In [None]:
# Weight matrix whose row i is the word2vec vector of tokenizer index i.
embedding_vectors = get_weight_matrix(model=w2v_model, vocab=tokenizer.word_index)

---

# Training the Word2Vec Model

**Model Creation**

In [None]:
# Defining the neural network: Embedding -> Bi-LSTM -> Bi-GRU -> sigmoid unit.
model = Sequential()
# Embedding layer initialised from the word2vec weight matrix. With
# trainable=True these vectors are updated during training, not frozen.
model.add(Embedding(vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=20, trainable=True))
# Bidirectional LSTM that returns the full sequence so the GRU can consume it.
model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.3 , dropout = 0.3,return_sequences = True)))
# Bidirectional GRU that reduces the sequence to a single vector.
model.add(Bidirectional(GRU(units=32 , recurrent_dropout = 0.1 , dropout = 0.1)))
# Single sigmoid unit: one probability per headline.
model.add(Dense(1, activation='sigmoid'))
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.01), loss='binary_crossentropy', metrics=['acc'])

# Drop the notebook-level reference to the weight matrix. Re-running this cell
# afterwards requires re-running the cell that builds `embedding_vectors`.
del embedding_vectors

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

**Splitting Data**

In [None]:
# Hold out 30% of the padded sequences; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    df['is_sarcastic'],
    test_size=0.3,
    random_state=0,
)

**Model Fitting**

In [None]:
# Train for 10 epochs; the held-out split is passed as validation data, so
# its loss and accuracy are recorded in `history` after every epoch.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(x_test, y_test),
)

**Printing Model Performance**

In [None]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is 'acc'.
train_metrics = model.evaluate(x_train, y_train)
print("Accuracy of the model on Training Data is - " , train_metrics[1]*100)
test_metrics = model.evaluate(x_test, y_test)
print("Accuracy of the model on Testing Data is - " , test_metrics[1]*100)

**Plotting Model Performance**

In [None]:
# Per-epoch metrics recorded by model.fit.
train_acc = history.history['acc']
train_loss = history.history['loss']
val_acc = history.history['val_acc']
val_loss = history.history['val_loss']
# Derive the x-axis from the recorded history instead of hard-coding 10, so
# the plot stays valid when the number of training epochs changes.
epochs = list(range(len(train_acc)))

fig , ax = plt.subplots(1,2)
fig.set_size_inches(20,10)

# Left panel: accuracy on the training and held-out data.
ax[0].plot(epochs , train_acc , 'go-' , label = 'Training Accuracy')
ax[0].plot(epochs , val_acc , 'ro-' , label = 'Testing Accuracy')
ax[0].set_title('Training & Testing Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")

# Right panel: loss on the training and held-out data.
ax[1].plot(epochs , train_loss , 'go-' , label = 'Training Loss')
ax[1].plot(epochs , val_loss , 'ro-' , label = 'Testing Loss')
ax[1].set_title('Training & Testing Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")
plt.show()

# Inference -

**Looks like our Model is Overfitting**

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one
# probability per row. argmax over axis 1 of a one-column array is always 0,
# which would label every headline "not sarcastic"; threshold at 0.5 instead
# and flatten to a 1-D array of 0/1 labels.
pred = model.predict(x_test)
pred = (pred > 0.5).astype(int).ravel()

pred[:5]

**Plotting Confusion Matrix**

In [None]:
# Confusion matrix as a labelled frame (rows: true class, columns: predicted).
class_names = ['Not Sarcastic', 'Sarcastic']
cm = pd.DataFrame(confusion_matrix(y_test, pred), index=class_names, columns=class_names)

plt.figure(figsize=(10, 10))
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor='black',
    linewidth=1,
    annot=True,
    fmt='',
    xticklabels=class_names,
    yticklabels=class_names,
)

----

Use one of the following datasets for Performing - 

https://www.kaggle.com/datasets/nitin194/twitter-sentiment-analysis?select=train_E6oV3lV.csv

or

https://www.kaggle.com/datasets/sunnysai12345/news-summary

---

#### Read and implement GloVe and FastText

---

### Research and Implement : FastText developed by Facebook

---