# Data Processing - Encoding

Word embeddings are a way to represent words as dense vectors, capturing semantic relationships between words based on their usage in context.  With word embeddings, users can analyze and process text data, perform sentiment analysis, text classification, language translation, and more. Users can easily integrate word embeddings into their NLP workflows, visualizing the relationships between words and exploring the semantic meaning of text data.

In [1]:
#!pip install scikit-learn
#!pip uninstall nltk
#!pip install nltk
#!pip install scipy

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
import nltk
import pandas as pd
import numpy as np
#nltk.download()

In [4]:
# Importing the dataset excel sheet
df = pd.read_excel('tokenized Dataset.xlsx')
df

Unnamed: 0,cleaned_data,word_token,sent_token,subword_token,char_data,Sentiment,Sarcasm
0,One reviewer mention watch 1 Oz episode youll ...,"['One', 'reviewer', 'mention', 'watch', '1', '...",['One reviewer mention watch 1 Oz episode youl...,"['One', 'Ġreviewer', 'Ġmention', 'Ġwatch', 'Ġ1...","['O', 'n', 'e', ' ', 'r', 'e', 'v', 'i', 'e', ...",positive,not sarcastic
1,wonderful little production. filming technique...,"['wonderful', 'little', 'production', '.', 'fi...","['wonderful little production.', 'filming tech...","['w', 'onder', 'ful', 'Ġlittle', 'Ġproduction'...","['w', 'o', 'n', 'd', 'e', 'r', 'f', 'u', 'l', ...",positive,not sarcastic
2,movie groundbreaking experience! Ive never see...,"['movie', 'groundbreaking', 'experience', '!',...","['movie groundbreaking experience!', 'Ive neve...","['movie', 'Ġground', 'breaking', 'Ġexperience'...","['m', 'o', 'v', 'i', 'e', ' ', 'g', 'r', 'o', ...",positive,sarcastic
3,think wonderful way spend time hot summer week...,"['think', 'wonderful', 'way', 'spend', 'time',...",['think wonderful way spend time hot summer we...,"['think', 'Ġwonderful', 'Ġway', 'Ġspend', 'Ġti...","['t', 'h', 'i', 'n', 'k', ' ', 'w', 'o', 'n', ...",positive,not sarcastic
4,Basically there family little boy Jake think t...,"['Basically', 'there', 'family', 'little', 'bo...",['Basically there family little boy Jake think...,"['B', 'as', 'ically', 'Ġthere', 'Ġfamily', 'Ġl...","['B', 'a', 's', 'i', 'c', 'a', 'l', 'l', 'y', ...",negative,sarcastic
...,...,...,...,...,...,...,...
6492,movie idea character development muscle less b...,"['movie', 'idea', 'character', 'development', ...",['movie idea character development muscle less...,"['movie', 'Ġidea', 'Ġcharacter', 'Ġdevelopment...","['m', 'o', 'v', 'i', 'e', ' ', 'i', 'd', 'e', ...",negative,sarcastic
6493,guess run budget decent script.,"['guess', 'run', 'budget', 'decent', 'script',...",['guess run budget decent script.'],"['gu', 'ess', 'Ġrun', 'Ġbudget', 'Ġdecent', 'Ġ...","['g', 'u', 'e', 's', 's', ' ', 'r', 'u', 'n', ...",negative,sarcastic
6494,need plot explosion every five minutes?,"['need', 'plot', 'explosion', 'every', 'five',...",['need plot explosion every five minutes?'],"['need', 'Ġplot', 'Ġexplosion', 'Ġevery', 'Ġfi...","['n', 'e', 'e', 'd', ' ', 'p', 'l', 'o', 't', ...",negative,sarcastic
6495,award generic action movie ever made?,"['award', 'generic', 'action', 'movie', 'ever'...",['award generic action movie ever made?'],"['aw', 'ard', 'Ġgeneric', 'Ġaction', 'Ġmovie',...","['a', 'w', 'a', 'r', 'd', ' ', 'g', 'e', 'n', ...",negative,sarcastic


In [5]:
df['Sentiment'].value_counts()

Sentiment
negative    4184
positive    2300
neutral       13
Name: count, dtype: int64

In [6]:
df['Sarcasm'].value_counts()

Sarcasm
sarcastic        3518
not sarcastic    2979
Name: count, dtype: int64

In [7]:
def random_forest(X_train,Y_train,X_test,Y_test):
    # Train and evaluate the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(Y_test, y_pred)
    conf_matrix = confusion_matrix(Y_test, y_pred)
    class_report = classification_report(Y_test, y_pred)

    print(f"Evaluation for the given vectors:\n")
    print(f'Accuracy: {accuracy:.2f}')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('Classification Report:')
    print(class_report)


# 1) Label Encoder

Label Encoder: Label encoding assigns a unique integer to each word in the vocabulary. While this method provides a compact representation, it does not capture semantic relationships between words.

## Use: 
Assigning a unique integer to each word in the vocabulary, useful for basic text encoding.

# Pros: 
Compact representation, easy to use.

# Cons: 
Does not capture semantic relationships, may not be suitable for advanced NLP tasks.

In [8]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])

mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Mapping of sentiment labels to numerical values:")
for sentiment, label in mapping.items():
    print(f"{sentiment}: {label}")

df['Sentiment']

Mapping of sentiment labels to numerical values:
negative: 0
neutral: 1
positive: 2


0       2
1       2
2       2
3       2
4       0
       ..
6492    0
6493    0
6494    0
6495    0
6496    0
Name: Sentiment, Length: 6497, dtype: int32

In [9]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

df['Sarcasm'] = label_encoder.fit_transform(df['Sarcasm'])

mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Mapping of sarcasm labels to numerical values:")
for sarcasm, label in mapping.items():
    print(f"{sarcasm}: {label}")

df['Sarcasm']


Mapping of sarcasm labels to numerical values:
not sarcastic: 0
sarcastic: 1


0       0
1       0
2       1
3       0
4       1
       ..
6492    1
6493    1
6494    1
6495    1
6496    1
Name: Sarcasm, Length: 6497, dtype: int32

# 2) One Hot encoder

One Hot Encoder: This technique represents each word as a binary vector where all elements are zero except for the index corresponding to the word's position in the vocabulary, which is set to one. This method is simple but results in high-dimensional sparse vectors.

## Use: 
Representing words as binary vectors, useful for simple text classification tasks.

## Pros: 
Simple to implement, preserves the exact word information.

## Cons: 
Results in high-dimensional sparse vectors, does not capture semantic relationships between words.


In [10]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
encoded_df = pd.DataFrame(encoder.fit_transform(df[['Sarcasm']]), columns=encoder.get_feature_names_out(['Sarcasm']))

print("Mapping of Sarcasm labels to one-hot encoded columns:")
for label, encoded_columns in zip(df['Sarcasm'].unique(), encoded_df.values):
    print(f"{label}: {encoded_columns}")
    
print("\nUnique Sarcasm labels:")
print(df['Sarcasm'].unique())


Mapping of Sarcasm labels to one-hot encoded columns:
0: [1. 0.]
1: [1. 0.]

Unique Sarcasm labels:
[0 1]


In [11]:
test = pd.read_excel('test_data.xlsx')
test

In [12]:
train = pd.read_excel('train_data.xlsx')
train

In [13]:
X=df['word_token']
Y=df['Sarcasm']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42
)
train['train_data']=X_train
train['Sarcasm']=y_train

test['test_data']=X_test
test['Sarcasm']=y_test

In [14]:
train

Unnamed: 0,train_data,Sarcasm
1916,"['Ive', 'already', 'see', 'spinoffs', 'cartoon...",0
947,"['probably', 'one', 'bad', 'movie', 'ever', 'm...",1
877,"['Paint', 'number', 'story', 'mediocre', 'act'...",0
2927,"['first', 'murder', 'scene', 'one', 'best', 'm...",0
6063,"['Bravo', 'another', 'movie', 'hero', 'deep', ...",1
...,...,...
3772,"['love', 'movie', 'manage', 'suck', 'joy', 'li...",1
5191,"['Yet', 'another', 'adventure', 'movie', 'prot...",1
5226,"['Yet', 'another', 'forgettable', 'action', 'f...",1
5390,"['Id', 'rather', 'stick', 'elevator', 'mime', ...",1


In [15]:
test

Unnamed: 0,test_data,Sarcasm
3103,"['First', 'Im', 'firefighter', 'Im', 'kind', '...",0
1419,"['Stargate', 'SG1', 'follow', 'intergalactic',...",0
4761,"['Thank', 'Hollywood', 'yet', 'another', 'come...",1
4690,"['Wow', 'another', 'comedy', 'movie', 'recycle...",1
4032,"['mesmerize', 'exploration', 'human', 'conditi...",0
...,...,...
889,"['fully', 'aware', 'statistical', 'data', 'rea...",0
2850,"['think', 'id', 'check', 'film', 'Im', 'curren...",0
4917,"['Bravo', '!', 'Another', 'comedy', 'leave', '...",1
5198,"['watch', 'privileged', 'people', 'travel', 'e...",1


## 3) Word2Vec
Word2Vec: Word2Vec is a popular word embedding technique that learns continuous representations of words based on their context in a large corpus of text. It captures semantic relationships between words and is able to represent words with similar meanings as vectors that are close to each other in the embedding space.

# Use: 
Learning continuous word representations based on context, useful for capturing semantic relationships between words.

## Pros: 
Captures complex word relationships, provides dense and relatively low-dimensional embeddings.

## Cons: 
Requires a large amount of training data, may not work well for rare words.

In [16]:
from gensim.models import Word2Vec
import gensim

# Create CBOW model
word_cbow = gensim.models.Word2Vec(train['train_data'], min_count=1, vector_size=100, window=5)

# Create Skip Gram model
word_skip = gensim.models.Word2Vec(train['train_data'], min_count=1, vector_size=100, window=5, sg=1)

def average_word_vectors(word_vectors, vector_size=100):
    valid_vectors = [vec for vec in word_vectors if not np.isnan(vec).any()]
    if not valid_vectors:
        return np.zeros(vector_size) 
    else:
        return np.mean(valid_vectors, axis=0)


train['cbow_vectors'] = train['train_data'].apply(lambda x: average_word_vectors([word_cbow.wv[word] for word in x if word in word_cbow.wv]))
test['cbow_vectors'] = test['test_data'].apply(lambda x: average_word_vectors([word_cbow.wv[word] for word in x if word in word_cbow.wv]))

train['skip_vectors'] = train['train_data'].apply(lambda x: average_word_vectors([word_skip.wv[word] for word in x if word in word_skip.wv]))
test['skip_vectors'] =test['test_data'].apply(lambda x: average_word_vectors([word_skip.wv[word] for word in x if word in word_skip.wv]))


In [36]:
train['cbow_vectors']

1916    [0.013314046, -0.44059646, -0.22429062, -0.594...
947     [-0.00744503, -0.44556785, -0.21564841, -0.613...
877     [-0.031009361, -0.44244248, -0.228988, -0.5941...
2927    [-0.0054936158, -0.3918013, -0.20509245, -0.56...
6063    [-0.041897837, -0.3942514, -0.22329089, -0.573...
                              ...                        
3772    [-0.019697953, -0.46055502, -0.23612061, -0.58...
5191    [-0.054630097, -0.41608295, -0.22733516, -0.57...
5226    [-0.034282967, -0.43000963, -0.23234665, -0.57...
5390    [-0.053429805, -0.39407605, -0.23116787, -0.55...
860     [0.013468297, -0.42515242, -0.22746369, -0.612...
Name: cbow_vectors, Length: 5197, dtype: object

In [37]:
random_forest(list(train['cbow_vectors']),train['Sarcasm'],list(test['cbow_vectors']),test['Sarcasm'])

Evaluation for the given vectors:

Accuracy: 0.77
Confusion Matrix:
[[472 132]
 [161 535]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.78      0.76       604
           1       0.80      0.77      0.79       696

    accuracy                           0.77      1300
   macro avg       0.77      0.78      0.77      1300
weighted avg       0.78      0.77      0.77      1300



In [38]:
train['skip_vectors']

1916    [-0.08790181, 0.13337336, 0.20003186, 0.057840...
947     [-0.091433086, 0.1359427, 0.20129208, 0.058086...
877     [-0.085398145, 0.13671964, 0.1906442, 0.051597...
2927    [-0.0883963, 0.1333404, 0.1942192, 0.05912442,...
6063    [-0.08794053, 0.14131702, 0.19568387, 0.059249...
                              ...                        
3772    [-0.09147534, 0.1413895, 0.19707988, 0.0596708...
5191    [-0.07994005, 0.13224454, 0.19289885, 0.049859...
5226    [-0.08860574, 0.13400672, 0.1944464, 0.0613590...
5390    [-0.09076279, 0.14993809, 0.19454056, 0.072606...
860     [-0.08633483, 0.13501793, 0.19534539, 0.056610...
Name: skip_vectors, Length: 5197, dtype: object

In [39]:
random_forest(list(train['skip_vectors']), train['Sarcasm'],list(test['skip_vectors']),test['Sarcasm'])

Evaluation for the given vectors:

Accuracy: 0.79
Confusion Matrix:
[[498 106]
 [164 532]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.79       604
           1       0.83      0.76      0.80       696

    accuracy                           0.79      1300
   macro avg       0.79      0.79      0.79      1300
weighted avg       0.80      0.79      0.79      1300



# 4) TF-IDF
TF-IDF (Term Frequency-Inverse Document Frequency): TF-IDF represents each word based on its frequency in a document relative to its frequency in the entire corpus. It helps to identify the importance of words in a document.

## Use: 
Representing words based on their importance in a document relative to a corpus, useful for information retrieval and text mining tasks.

## Pros: 
Captures word importance, handles common words well.

## Cons: 
Does not capture word semantics directly, may require tuning for optimal performance.

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy(tokens):
    return tokens

def vectorizer_idf(reviews, vectorizer=None):
    vectorizer = TfidfVectorizer(tokenizer=dummy, preprocessor=dummy)
    X = vectorizer.fit_transform(reviews)
    return X, vectorizer

vectorized_idf, vectorizer = vectorizer_idf(train['train_data'])
train['idf_vector'] = list(vectorized_idf.toarray())

vectorized_test_idf = vectorizer.transform(test['test_data'])
test['idf_vector'] = list(vectorized_test_idf.toarray())




In [40]:
train['idf_vector']

1916    [0.35064283887050296, 0.0, 0.7087461636744208,...
947     [0.368280708933376, 0.0, 0.744077350702127, 0....
877     [0.3388110731482772, 0.005159675648729211, 0.6...
2927    [0.3352069005162684, 0.0, 0.6895684810620378, ...
6063    [0.3206714645962336, 0.0, 0.7215107953415255, ...
                              ...                        
3772    [0.3371482651873016, 0.0, 0.7355962149541126, ...
5191    [0.30734298674525307, 0.0, 0.6829844149894513,...
5226    [0.29527348491322486, 0.0, 0.6889714647975247,...
5390    [0.31039716335284884, 0.0, 0.7449531920468372,...
860     [0.3529982794775053, 0.0, 0.7076084232448622, ...
Name: idf_vector, Length: 5197, dtype: object

In [41]:
random_forest(list(train['idf_vector']),train['Sarcasm'],list(test['idf_vector']),test['Sarcasm'])

Evaluation for the given vectors:

Accuracy: 0.80
Confusion Matrix:
[[509  95]
 [167 529]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.84      0.80       604
           1       0.85      0.76      0.80       696

    accuracy                           0.80      1300
   macro avg       0.80      0.80      0.80      1300
weighted avg       0.80      0.80      0.80      1300



In [46]:
train[['train_data','Sarcasm',"cbow_vectors","skip_vectors","idf_vector"]].to_excel('train_data.xlsx', index=False)
train

Unnamed: 0,train_data,Sarcasm,cbow_vectors,skip_vectors,idf_vector
1916,"['Ive', 'already', 'see', 'spinoffs', 'cartoon...",0,"[0.013314046, -0.44059646, -0.22429062, -0.594...","[-0.08790181, 0.13337336, 0.20003186, 0.057840...","[0.35064283887050296, 0.0, 0.7087461636744208,..."
947,"['probably', 'one', 'bad', 'movie', 'ever', 'm...",1,"[-0.00744503, -0.44556785, -0.21564841, -0.613...","[-0.091433086, 0.1359427, 0.20129208, 0.058086...","[0.368280708933376, 0.0, 0.744077350702127, 0...."
877,"['Paint', 'number', 'story', 'mediocre', 'act'...",0,"[-0.031009361, -0.44244248, -0.228988, -0.5941...","[-0.085398145, 0.13671964, 0.1906442, 0.051597...","[0.3388110731482772, 0.005159675648729211, 0.6..."
2927,"['first', 'murder', 'scene', 'one', 'best', 'm...",0,"[-0.0054936158, -0.3918013, -0.20509245, -0.56...","[-0.0883963, 0.1333404, 0.1942192, 0.05912442,...","[0.3352069005162684, 0.0, 0.6895684810620378, ..."
6063,"['Bravo', 'another', 'movie', 'hero', 'deep', ...",1,"[-0.041897837, -0.3942514, -0.22329089, -0.573...","[-0.08794053, 0.14131702, 0.19568387, 0.059249...","[0.3206714645962336, 0.0, 0.7215107953415255, ..."
...,...,...,...,...,...
3772,"['love', 'movie', 'manage', 'suck', 'joy', 'li...",1,"[-0.019697953, -0.46055502, -0.23612061, -0.58...","[-0.09147534, 0.1413895, 0.19707988, 0.0596708...","[0.3371482651873016, 0.0, 0.7355962149541126, ..."
5191,"['Yet', 'another', 'adventure', 'movie', 'prot...",1,"[-0.054630097, -0.41608295, -0.22733516, -0.57...","[-0.07994005, 0.13224454, 0.19289885, 0.049859...","[0.30734298674525307, 0.0, 0.6829844149894513,..."
5226,"['Yet', 'another', 'forgettable', 'action', 'f...",1,"[-0.034282967, -0.43000963, -0.23234665, -0.57...","[-0.08860574, 0.13400672, 0.1944464, 0.0613590...","[0.29527348491322486, 0.0, 0.6889714647975247,..."
5390,"['Id', 'rather', 'stick', 'elevator', 'mime', ...",1,"[-0.053429805, -0.39407605, -0.23116787, -0.55...","[-0.09076279, 0.14993809, 0.19454056, 0.072606...","[0.31039716335284884, 0.0, 0.7449531920468372,..."


In [47]:
test[['test_data','Sarcasm',"cbow_vectors","skip_vectors","idf_vector"]].to_excel('test_data.xlsx', index=False)
test

Unnamed: 0,test_data,Sarcasm,cbow_vectors,skip_vectors,idf_vector
3103,"['First', 'Im', 'firefighter', 'Im', 'kind', '...",0,"[-0.010372605, -0.4567238, -0.22420675, -0.601...","[-0.08752405, 0.13397478, 0.1981876, 0.0568917...","[0.36049630600235416, 0.0, 0.7245442997485739,..."
1419,"['Stargate', 'SG1', 'follow', 'intergalactic',...",0,"[-0.01640415, -0.42824283, -0.22872612, -0.591...","[-0.086026564, 0.13662885, 0.19442646, 0.05281...","[0.3469430496512779, 0.0, 0.6961100932105769, ..."
4761,"['Thank', 'Hollywood', 'yet', 'another', 'come...",1,"[0.013093409, -0.40319425, -0.19710416, -0.573...","[-0.083819784, 0.13904773, 0.19411542, 0.06791...","[0.32716079103618195, 0.0, 0.7046540114625458,..."
4690,"['Wow', 'another', 'comedy', 'movie', 'recycle...",1,"[-0.013250855, -0.41309604, -0.22917916, -0.58...","[-0.09016309, 0.13978377, 0.19731839, 0.054263...","[0.32695772882490065, 0.0, 0.7006237046247871,..."
4032,"['mesmerize', 'exploration', 'human', 'conditi...",0,"[-0.060444143, -0.40848058, -0.24205366, -0.56...","[-0.0845619, 0.14551754, 0.18676989, 0.0542966...","[0.2973630975888576, 0.0, 0.6541988146954868, ..."
...,...,...,...,...,...
889,"['fully', 'aware', 'statistical', 'data', 'rea...",0,"[-0.0037335956, -0.4461988, -0.22805376, -0.60...","[-0.08555071, 0.13806508, 0.19561169, 0.056104...","[0.35472575920999566, 0.0, 0.712496374722223, ..."
2850,"['think', 'id', 'check', 'film', 'Im', 'curren...",0,"[-0.023508983, -0.45118797, -0.23361883, -0.59...","[-0.0873381, 0.13907409, 0.19539632, 0.0538376...","[0.3557137497276089, 0.0, 0.7146909283518013, ..."
4917,"['Bravo', '!', 'Another', 'comedy', 'leave', '...",1,"[-0.0036038794, -0.41219482, -0.21283714, -0.5...","[-0.09664625, 0.14576766, 0.19806981, 0.053484...","[0.32585506917624835, 0.08336784378677009, 0.7..."
5198,"['watch', 'privileged', 'people', 'travel', 'e...",1,"[-0.058767185, -0.4177433, -0.19625926, -0.573...","[-0.091859296, 0.14635305, 0.19235623, 0.05341...","[0.32020592598212944, 0.0, 0.6986311112337369,..."
