<a href="https://colab.research.google.com/gist/priyanshusharma16/c16e7f9101c487a7349b3be3d9dd10f1/one-hot-encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SpatialDropout1D
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import nltk

# Download necessary NLTK data
# 'stopwords' backs the stopwords.words('english') calls in the cleaning helpers.
nltk.download('stopwords')
# NOTE(review): nothing in the visible cells uses WordNet (e.g. a lemmatizer) —
# confirm this download is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load data: the file is JSON Lines (one record per line), hence lines=True.
DATA_PATH = '/content/Sarcasm_Headlines_Dataset.json'
data = pd.read_json(DATA_PATH, lines=True)

In [5]:
# Preview the first five rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [6]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [7]:
# (rows, columns) of the raw dataset, before any de-duplication.
data.shape

(28619, 3)

In [8]:
# Count missing values per column across the whole dataset
# (the train/test split has not happened yet at this point).
data.isnull().sum()

is_sarcastic    0
headline        0
article_link    0
dtype: int64

In [9]:
# Class balance of the target label (0 = not sarcastic, 1 = sarcastic).
data['is_sarcastic'].value_counts()

is_sarcastic
0    14985
1    13634
Name: count, dtype: int64

In [10]:
# Count headlines that repeat an earlier row (the first occurrence is not counted).
data['headline'].duplicated().sum()

116

In [11]:
# Drop duplicate headlines, keeping the first occurrence of each.
# The original index labels are preserved (no reset), so gaps remain where rows were removed.
data = data.drop_duplicates(subset='headline', keep='first')

In [12]:
# Sanity check: after de-duplication this count should be zero.
data['headline'].duplicated().sum()

0

In [13]:
# The article URL is not used as a feature, so remove that column.
data = data.drop('article_link', axis='columns')

In [14]:
# Confirm that only is_sarcastic and headline remain.
data.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [15]:
# Stop-word vocabulary: NLTK's English list extended with every ASCII punctuation mark.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

# Helper functions for cleaning headline text (tokenising, lower-casing, stripping punctuation, removing stop words)
def split_into_words(text):
    """Tokenise ``text`` on runs of whitespace and return the pieces as a list."""
    return text.split()

def to_lower_case(words):
    """Return a new list holding the lower-cased form of every token in ``words``."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

def remove_punctuation(words):
    """Delete every ASCII punctuation character from each token.

    The output has the same length as ``words``; a token made up solely of
    punctuation comes back as an empty string.
    """
    # Translation table that maps each character of string.punctuation to "deleted".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return [token.translate(drop_punctuation) for token in words]

def keep_alphabetic(words):
    """Keep only the tokens for which ``str.isalpha()`` is true.

    Empty strings and tokens containing digits or other non-letters are dropped.
    """
    return list(filter(lambda token: token.isalpha(), words))

_ENGLISH_STOP_WORDS = None  # lazily-built cache shared by every remove_stopwords call


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(words, stop_words=None):
    """Filter stop words out of a list of tokens.

    Parameters
    ----------
    words : list of str
        Tokens to filter; matching is exact, so tokens should already be lower-cased.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        read from the corpus once and then reused, instead of being rebuilt
        for every headline.

    Returns
    -------
    list of str
        The tokens of ``words`` that are not stop words, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return [w for w in words if w not in stop_words]

def to_sentence(words):
    """Glue the tokens back together into one string, separated by single spaces."""
    separator = ' '
    return separator.join(words)

#Removing the noisy text
def denoise_text(text):
    """Clean one headline and return it as a single space-joined string.

    The text is split on whitespace and then passed, in order, through
    lower-casing, punctuation stripping, the alphabetic-only filter and
    stop-word removal.

    NOTE(review): punctuation is stripped before stop words are filtered, so a
    contraction such as "don't" reaches the stop-word step as "dont" — confirm
    that this ordering is intended.
    """
    tokens = split_into_words(text)
    cleaning_steps = (
        to_lower_case,
        remove_punctuation,
        keep_alphabetic,
        remove_stopwords,
    )
    for step in cleaning_steps:
        tokens = step(tokens)
    return to_sentence(tokens)

In [16]:
# Clean every headline and store the result next to the raw text.
cleaned_headlines = data['headline'].apply(denoise_text)
data['news_headline'] = cleaned_headlines

In [16]:
# Compare each raw headline with its cleaned news_headline counterpart.
data.head()

Unnamed: 0,is_sarcastic,headline,news_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep totally nails congress falling short g...
2,0,eat your veggies: 9 deliciously different recipes,eat veggies deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close using word streaming...


In [17]:
# Hold out 20% of the cleaned headlines for testing; the fixed seed keeps the split repeatable.
train_data, test_data, train_labels, test_labels = train_test_split(
    data['news_headline'],
    data['is_sarcastic'],
    test_size=0.20,
    random_state=42,
)

In [18]:
# Tokenization: the vocabulary is learned from the training headlines only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data)
# One extra slot for index 0, which the padded sequences use as filler.
vocab_size = 1 + len(tokenizer.word_index)

In [19]:
# Map each headline to its list of integer word ids, for train and test alike.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_data, test_data)
)

In [20]:
# `data` itself is unchanged by tokenisation (which ran on train_data / test_data);
# this preview matches the one shown after text cleaning.
data.head()

Unnamed: 0,is_sarcastic,headline,news_headline
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep totally nails congress falling short g...
2,0,eat your veggies: 9 deliciously different recipes,eat veggies deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close using word streaming...


In [21]:
# Pad every sequence with trailing zeros up to the longest training sequence.
max_length = max(map(len, train_sequences))
train_padded, test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_sequences, test_sequences)
)

In [22]:
# Inspect padded sequences: word ids first, then zeros filling the row up to max_length.
print("Sample padded train sequence:\n", train_padded[0])

Sample padded train sequence:
 [   1  168 3003 3986 4515 9420 3276 1671  422  555  278  113    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [23]:
# Same check for the first test row; it is padded to the training max_length as well.
print("Sample padded test sequence:\n", test_padded[0])

Sample padded test sequence:
 [1724  932 8336   15  282 2252   34  216 4849    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [24]:
# Print dimensions of the padded *training* sequences: (number of headlines, max_length).
print(f'shape of padded sequences: {train_padded.shape}')

shape of padded sequences: (22802, 106)


In [25]:
# One-hot encoding
def one_hot_encode(padded, num_classes):
    """Expand a padded matrix of token ids into a dense one-hot tensor.

    Parameters
    ----------
    padded : array-like of int, shape (n_sequences, sequence_length)
        Token ids, with 0 marking padding positions.
    num_classes : int
        Size of the one-hot axis; must be larger than the biggest id in ``padded``.

    Returns
    -------
    numpy.ndarray of float32, shape (n_sequences, sequence_length, num_classes)
        ``out[i, j, padded[i, j]]`` is 1.0 for every non-padding position;
        padding positions stay all-zero.
    """
    padded = np.asarray(padded)
    encoded = np.zeros(padded.shape + (num_classes,), dtype=np.float32)
    # Coordinates of the real (non-padding) tokens; one vectorised assignment
    # replaces the nested Python loops.
    rows, cols = np.nonzero(padded)
    encoded[rows, cols, padded[rows, cols]] = 1.0
    return encoded


# NOTE(review): these tensors are dense. At the shapes printed in this notebook
# (22802 x 106 x 25734 float32) the training tensor is nominally ~250 GB and
# presumably only fits because untouched zero pages are never materialised —
# consider an Embedding layer or a sparse representation instead.
train_one_hot = one_hot_encode(train_padded, vocab_size)
test_one_hot = one_hot_encode(test_padded, vocab_size)

In [26]:
# Inspect one-hot encoded sequences
print("Shape of train_one_hot:", train_one_hot.shape)  # verify the training tensor's dimensions
print("Shape of test_one_hot:", test_one_hot.shape)    # verify the test tensor's dimensions
print("Sample one-hot encoded train sequence:\n", train_one_hot[0])     # first training headline
print("Sample one-hot encoded test sequence:\n", test_one_hot[0])       # first test headline

# Axis meaning of the one-hot tensors:
#   axis 0 - one entry per sequence (headline)
#   axis 1 - position within the sequence (up to max_length)
#   axis 2 - vocabulary index; each position holds the one-hot vector of the word found there


Shape of train_one_hot: (22802, 106, 25734)
Shape of test_one_hot: (5701, 106, 25734)
Sample one-hot encoded train sequence:
 [[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Sample one-hot encoded test sequence:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
