In [2]:
import pandas as pd
dataset_path = "/Users/jashanjeetsingh/Downloads/train-balanced-sarcasm.csv"  
df = pd.read_csv(dataset_path)

In [4]:
from transformers import AutoTokenizer

# Initializing the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example of tokenizing a single comment
text_sample = df.loc[0, 'comment']
tokens = tokenizer.tokenize(text_sample)
print("Transformers Word Tokens:", tokens)


Transformers Word Tokens: ['nc', 'and', 'nh', '.']


In [5]:
# INTEGER ENCODING: Encode the tokens into integer IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)


Token IDs: [13316, 1998, 18699, 1012]


In [6]:
#ONE HOT ENCODING:
from sklearn.preprocessing import OneHotEncoder

# Example: Encoding subreddit as categorical data
subreddit_data = df['subreddit'].values.reshape(-1, 1)

encoder = OneHotEncoder()
encoded_subreddit = encoder.fit_transform(subreddit_data)

print("One-Hot Encoded Subreddit Shape:", encoded_subreddit.shape)
print("Example One-Hot Encoded Subreddit Vector:\n", encoded_subreddit.toarray()[0])


One-Hot Encoded Subreddit Shape: (1010826, 14878)
Example One-Hot Encoded Subreddit Vector:
 [0. 0. 0. ... 0. 0. 0.]


In [7]:
#LABEL ENCODING:
from sklearn.preprocessing import LabelEncoder

# Example: Encoding label 'label'
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])

print("Encoded Labels Shape:", encoded_labels.shape)
print("Example Encoded Label:", encoded_labels[0])


Encoded Labels Shape: (1010826,)
Example Encoded Label: 0


In [8]:
#TF-IDF:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example: TF-IDF on 'comment' column
corpus = df['comment'].astype(str).tolist()

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
print("Example TF-IDF Vector:", tfidf_matrix[0])


TF-IDF Matrix Shape: (1010826, 167435)
Example TF-IDF Vector:   (0, 102779)	0.750805316403161
  (0, 11906)	0.1888870658989324
  (0, 101553)	0.6329400075811623


In [13]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Initialize CountVectorizer to compute term frequencies
vectorizer = CountVectorizer()

# Fit the vectorizer on 'comment' column and transform the data
tf_matrix = vectorizer.fit_transform(df['comment'])

# Print term frequencies for the first comment
terms = vectorizer.get_feature_names_out()
term_frequencies = tf_matrix[0].toarray().flatten()
print("Term Frequencies for the first comment:")
for term, freq in zip(terms, term_frequencies):
    print(f"{term}: {freq}")


Term Frequencies for the first comment:
00: 0
000: 0
0000: 0
000000: 0
00000000000000: 0
000000000000000000000000000000000001: 0
000000000000001: 0
000000000000438385: 0
0000000001: 0
0000000013: 0
00000001: 0
00000002: 0
0000001: 0
0000009: 0
000000957: 0
000001: 0
000002: 0
0000034: 0
00000385802: 0
000004: 0
00000991: 0
00001: 0
00001001: 0
00001011: 0
000011: 0
00002: 0
00003: 0
0000488: 0
00005: 0
000064: 0
0001: 0
00010001: 0
00010101: 0
00011001: 0
00011011: 0
00011101: 0
00017: 0
0001p: 0
0002: 0
0003: 0
000372025: 0
000378788: 0
00039be4: 0
0004: 0
00043: 0
000481871: 0
0005: 0
0006: 0
00096: 0
000c: 0
000cc: 0
000fps: 0
000ft: 0
000ge: 0
000hz: 0
000ish: 0
000k: 0
000km: 0
000kn: 0
000lb: 0
000ls: 0
000m: 0
000mb: 0
000mph: 0
000rpm: 0
000s: 0
000th: 0
000v: 0
000webhost: 0
000x: 0
000x10: 0
001: 0
00100000: 0
00100001: 0
00100011: 0
00100111: 0
00101001: 0
00101100: 0
0010111: 0
00101110: 0
0011: 0
00110001: 0
00110100: 0
00110101: 0
00111001: 0
001200316425: 0
00125: 0
0014

In [15]:
#Word 2 Vec:
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np

# Load the dataset
dataset_path = "/Users/jashanjeetsingh/Downloads/train-balanced-sarcasm.csv"  
df = pd.read_csv(dataset_path)

# Drop rows with NaN values in 'comment' column
df = df.dropna(subset=['comment'])

# Convert 'comment' column to string type to handle any unexpected data types
df['comment'] = df['comment'].astype(str)

# Tokenize the comments using NLTK's word_tokenize
df['tokenized_comments'] = df['comment'].apply(lambda x: word_tokenize(x))

# Train Word2Vec model
word2vec_model = Word2Vec(df['tokenized_comments'], vector_size=100, window=5, min_count=1, sg=0)

# Example: Get embedding vector for a word ('example' as an example)
if 'example' in word2vec_model.wv.index_to_key:
    embedding_vector = word2vec_model.wv['example']
    print("Embedding Vector for 'example':\n", embedding_vector)
else:
    print("'example' is not in vocabulary.")



Embedding Vector for 'example':
 [ 0.01382166 -1.191692   -1.1728013  -1.8438952   0.48504487  0.70766836
  1.324519    2.1502738  -1.8727648  -2.5405867  -0.7486824  -1.8108327
  0.7061755   0.22563879 -0.76646465  1.5048137  -0.79704833  0.01178991
  0.30177504  0.13261305 -0.82874554  2.0220487   1.6386011  -0.44844884
  0.6775283   2.7375844  -0.9880348   1.6473573  -1.0938685  -1.3688025
 -2.1775541   0.5349621   1.2013484  -0.01297436  1.0768281  -0.9414396
 -0.8581815  -1.8578545   0.36113575 -0.47645798 -0.701168   -0.9203087
  0.33881637  0.6852281  -1.6280978   0.1595596   1.0138016  -1.1484293
  0.3772921   0.78870165 -0.8016688  -0.34851936 -0.9454754  -2.309311
  0.10994098  2.4513593   3.3316891  -1.7010701   1.0078634  -1.6867412
 -1.4918989   1.2023363   2.047361   -0.08912967  1.3525152   0.35034034
 -1.1890601  -2.7892401  -4.0665965   1.6753843   0.3104056  -0.9957514
 -1.4758937  -1.3957589   0.7308066  -2.6009665  -2.2755072  -1.4627526
  0.40592554  1.0063097  -0.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Initialize CountVectorizer to compute term frequencies
vectorizer = CountVectorizer()

# Fit the vectorizer on 'comment' column and transform the data
tf_matrix = vectorizer.fit_transform(df['comment'])

# Print term frequencies for the first comment
terms = vectorizer.get_feature_names_out()
term_frequencies = tf_matrix[0].toarray().flatten()
print("Term Frequencies for the first comment:")
for term, freq in zip(terms, term_frequencies):
    print(f"{term}: {freq}")


Term Frequencies for the first comment:
00: 0
000: 0
0000: 0
000000: 0
00000000000000: 0
000000000000000000000000000000000001: 0
000000000000001: 0
000000000000438385: 0
0000000001: 0
0000000013: 0
00000001: 0
00000002: 0
0000001: 0
0000009: 0
000000957: 0
000001: 0
000002: 0
0000034: 0
00000385802: 0
000004: 0
00000991: 0
00001: 0
00001001: 0
00001011: 0
000011: 0
00002: 0
00003: 0
0000488: 0
00005: 0
000064: 0
0001: 0
00010001: 0
00010101: 0
00011001: 0
00011011: 0
00011101: 0
00017: 0
0001p: 0
0002: 0
0003: 0
000372025: 0
000378788: 0
00039be4: 0
0004: 0
00043: 0
000481871: 0
0005: 0
0006: 0
00096: 0
000c: 0
000cc: 0
000fps: 0
000ft: 0
000ge: 0
000hz: 0
000ish: 0
000k: 0
000km: 0
000kn: 0
000lb: 0
000ls: 0
000m: 0
000mb: 0
000mph: 0
000rpm: 0
000s: 0
000th: 0
000v: 0
000webhost: 0
000x: 0
000x10: 0
001: 0
00100000: 0
00100001: 0
00100011: 0
00100111: 0
00101001: 0
00101100: 0
0010111: 0
00101110: 0
0011: 0
00110001: 0
00110100: 0
00110101: 0
00111001: 0
001200316425: 0
00125: 0
0014

In [18]:
#Jashan