<a href="https://colab.research.google.com/gist/priyanshusharma16/976ffa987a6063b46cf7730ed15858c0/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing necessary libraries
import re
import string

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Dense, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download necessary NLTK data
# 'stopwords' backs the stopwords.words('english') calls in the text-cleaning cell.
# NOTE(review): 'wordnet' is not referenced in the cells visible here — confirm a later cell needs it.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load data: the file is read as JSON Lines (lines=True), one record per line.
DATA_PATH = '/content/Sarcasm_Headlines_Dataset.json'
data = pd.read_json(DATA_PATH, lines=True)

In [None]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [None]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

(28619, 3)

In [None]:
# Count missing values per column (all zeros means nothing to impute or drop).
data.isna().sum()

is_sarcastic    0
headline        0
article_link    0
dtype: int64

In [None]:
# Class balance of the target label (0 = not sarcastic, 1 = sarcastic).
data['is_sarcastic'].value_counts()

is_sarcastic
0    14985
1    13634
Name: count, dtype: int64

In [None]:
# Number of rows whose headline repeats an earlier row's headline.
data.duplicated(subset='headline').sum()

116

In [None]:
# Drop duplicate headlines, keeping the first occurrence of each.
# drop_duplicates removes exactly the repeated rows; the previous
# drop(<index labels>) form would also remove any other row that happened
# to share an index label with a duplicate.
data = data.drop_duplicates(subset='headline', keep='first')

In [None]:
# Re-check after the drop: this should now be 0.
data.duplicated(subset='headline').sum()

0

In [None]:
# The article URL is not used as a feature, so remove that column.
data = data.drop('article_link', axis='columns')

In [None]:
# Confirm that only is_sarcastic and headline remain.
data.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [None]:
# English stop words plus every ASCII punctuation character, held in one lookup set.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

# Small text-cleaning steps that denoise_text() chains together
def split_into_words(text):
    """Return the whitespace-separated tokens of ``text`` as a list.

    Runs of whitespace count as a single separator, so no empty tokens are
    produced; an empty string yields an empty list.
    """
    return text.split()

def to_lower_case(words):
    """Return a new list holding the lower-cased form of every token in ``words``."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """Delete every ``string.punctuation`` character from each token.

    The list keeps its length: a token made only of punctuation becomes an
    empty string rather than being dropped.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    return [re.sub(punctuation_pattern, '', token) for token in words]

def keep_alphabetic(words):
    """Keep only the tokens made entirely of letters.

    Empty strings and tokens containing digits or other symbols are dropped,
    because ``str.isalpha`` is False for them.
    """
    return list(filter(lambda token: token.isalpha(), words))

# English stop-word set, loaded from NLTK on first use and then reused.
_stop_words_cache = None


def remove_stopwords(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter; comparison is exact, so lower-case them first.
    stop_words : collection of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    global _stop_words_cache
    if stop_words is None:
        # Load the corpus once instead of on every call (this function runs
        # once per headline).
        if _stop_words_cache is None:
            _stop_words_cache = frozenset(stopwords.words('english'))
        stop_words = _stop_words_cache
    return [w for w in words if w not in stop_words]

def to_sentence(words):
    """Glue the tokens back into one string, separated by single spaces."""
    separator = ' '
    return separator.join(words)

#Removing the noisy text
def denoise_text(text):
    """Clean one headline and return the kept tokens joined by single spaces.

    Steps: split on whitespace, lower-case, trim edge punctuation, filter
    stop words, strip all remaining punctuation, keep purely alphabetic
    tokens, filter stop words again, re-join.

    Stop words are filtered twice on purpose. NLTK's English list contains
    apostrophe forms such as "don't"; once punctuation is stripped these
    turn into "dont" and no longer match, so they must be filtered while the
    apostrophe is still present. The second pass catches tokens that only
    become stop words after stripping (e.g. "u.s." -> "us").
    """
    words = split_into_words(text)
    words = to_lower_case(words)
    # Trim only leading/trailing punctuation so that "don't," can still be
    # compared against the stop-word list with its apostrophe intact.
    words = [word.strip(string.punctuation) for word in words]
    words = remove_stopwords(words)
    words = remove_punctuation(words)
    words = keep_alphabetic(words)
    words = remove_stopwords(words)
    return to_sentence(words)

In [None]:
# Clean every headline; the raw text stays in `headline` for comparison.
data['news_headline'] = data['headline'].map(denoise_text)

In [None]:
# Compare each raw `headline` with its cleaned `news_headline` counterpart.
data.head()

Unnamed: 0,is_sarcastic,headline,news_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep totally nails congress falling short g...
2,0,eat your veggies: 9 deliciously different recipes,eat veggies deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close using word streaming...


In [None]:
# Hold out 20% of the cleaned headlines for testing; the fixed random_state
# makes the split repeatable.
features = data['news_headline']
labels = data['is_sarcastic']
train_data, test_data, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Tokenization: learn the word -> integer index mapping from the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
# One extra slot because index 0 is used for padding, not for a word.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Encode both splits as lists of word indices using the fitted tokenizer.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_data, test_data)
)

In [None]:
# Pad both splits at the end ('post') up to the longest training sequence.
max_length = max(len(seq) for seq in train_sequences)
train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (train_sequences, test_sequences)
)

In [None]:
# Train Word2Vec model on the training headlines only.
# Word2Vec is imported with the other dependencies in the first cell.
# NOTE(review): with workers=4 gensim training is not guaranteed to be
# bit-for-bit repeatable between runs; use workers=1 and a fixed seed if
# exact reproducibility matters.
sentences = [sentence.split() for sentence in train_data]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# Prepare embedding matrix: row i holds the Word2Vec vector of the word the
# tokenizer mapped to index i; rows for words without a vector (and row 0)
# stay all-zero.
# Take the width from the trained vectors so it cannot drift from the
# vector_size the model was built with.
embedding_dim = word2vec_model.wv.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

In [None]:
# Convert sequences to Word2Vec embeddings
def convert_to_word2vec(sequences, embedding_matrix, max_length):
    """Replace each word index by its embedding vector.

    Parameters
    ----------
    sequences : sequence of integer sequences
        One row of word indices per example (a padded 2-D array or a list of
        lists). Index 0 is treated as padding and left as a zero vector.
    embedding_matrix : array-like, shape (vocab_size, dim)
        Row ``k`` is the vector for word index ``k``.
    max_length : int
        Number of time steps in the output; rows shorter than this are
        zero-filled at the end.

    Returns
    -------
    numpy.ndarray, shape (len(sequences), max_length, dim), dtype float64

    Raises
    ------
    IndexError
        If a non-padding index sits at a position >= ``max_length`` or is
        outside the embedding matrix.
    """
    embedding_matrix = np.asarray(embedding_matrix)
    # Width comes from the matrix itself, not from a notebook-level global.
    vector_size = embedding_matrix.shape[1]
    word2vec_sequences = np.zeros((len(sequences), max_length, vector_size))
    for i, sequence in enumerate(sequences):
        # dtype=int keeps an empty row usable as an index array.
        indices = np.asarray(sequence, dtype=int)
        # Positions holding a real word (non-zero index); padding is skipped.
        positions = np.flatnonzero(indices)
        word2vec_sequences[i, positions] = embedding_matrix[indices[positions]]
    return word2vec_sequences

In [None]:
# Turn the padded index sequences of both splits into dense vector sequences.
train_word2vec, test_word2vec = (
    convert_to_word2vec(padded, embedding_matrix, max_length)
    for padded in (train_padded, test_padded)
)

In [None]:
# Sanity-check the encoded tensors: overall shapes first, then the first
# example of each split.
train_shape, test_shape = train_word2vec.shape, test_word2vec.shape
print("Shape of train_word2vec:", train_shape)
print("Shape of test_word2vec:", test_shape)
train_sample, test_sample = train_word2vec[0], test_word2vec[0]
print("Sample Word2Vec encoded train sequence:\n", train_sample)
print("Sample Word2Vec encoded test sequence:\n", test_sample)

Shape of train_word2vec: (22802, 106, 100)
Shape of test_word2vec: (5701, 106, 100)
Sample Word2Vec encoded train sequence:
 [[-0.72729731  1.41205347  0.6908955  ... -1.16739058  0.14696582
   0.66753608]
 [-0.19192421  0.36437336  0.18654716 ... -0.30756003  0.04675481
   0.16850248]
 [-0.02013207  0.05818575  0.03467382 ... -0.04615525 -0.00236784
   0.02701318]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
Sample Word2Vec encoded test sequence:
 [[-0.04658628  0.0851171   0.04052103 ... -0.06102676  0.01058459
   0.02753298]
 [-0.06167693  0.11658745  0.04834437 ... -0.09984828  0.02152216
   0.04867287]
 [-0.00674185 -0.00117609  0.00085781 ... -0.00459011  0.00280366
   0.00420339]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0. 