In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download the NLTK resources used by the preprocessing cell:
#   punkt / punkt_tab - tokenizer models behind word_tokenize; newer NLTK
#                       releases load 'punkt_tab' instead of the pickled
#                       'punkt' models, so both are fetched
#   stopwords         - stop-word lists (the English one is used)
#   wordnet           - data used by WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Load the sarcasm dataset (JSON Lines: one record per line).
df = pd.read_json('/content/drive/MyDrive/datasets/nlp/ca2/sarcasm.json', lines=True)

# Normalize case, then split every headline into a list of word tokens.
df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].apply(word_tokenize)

# Drop tokens that consist solely of punctuation characters.
# `word not in string.punctuation` is a substring test against one long string,
# so multi-character tokens such as "...", "--", "``" or "''" slip through it;
# checking every character against a set removes those as well.
punctuation_chars = set(string.punctuation)
df['headline'] = df['headline'].apply(
    lambda x: [word for word in x if not all(ch in punctuation_chars for ch in word)]
)

# Remove English stop words.
stop_words = set(stopwords.words('english'))
df['headline'] = df['headline'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatize each remaining token (lemmatize is called without a POS tag).
lemmatizer = WordNetLemmatizer()
df['headline'] = df['headline'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [None]:
# Preview the first rows to sanity-check the tokenized, filtered headlines.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,"[thirtysomething, scientist, unveil, doomsday,...",https://www.theonion.com/thirtysomething-scien...
1,0,"[dem, rep., totally, nail, congress, falling, ...",https://www.huffingtonpost.com/entry/donna-edw...
2,0,"[eat, veggie, 9, deliciously, different, recipe]",https://www.huffingtonpost.com/entry/eat-your-...
3,1,"[inclement, weather, prevents, liar, getting, ...",https://local.theonion.com/inclement-weather-p...
4,1,"[mother, come, pretty, close, using, word, 'st...",https://www.theonion.com/mother-comes-pretty-c...


In [None]:
# Features are the preprocessed token lists; the target is the `is_sarcastic` label.
X, y = df['headline'], df['is_sarcastic']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip /content/glove.6B.zip

In [None]:
import numpy as np

def load_glove_vectors(glove_file: str) -> dict:
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings file.

    Returns:
        Dict mapping each word to a 1-D ``float32`` NumPy array. If a word
        appears on more than one line, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("Loading GloVe vectors...")
    word_embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines (e.g. a stray empty line at the
            # end of the file) carry no entry; indexing values[0] on them
            # would raise IndexError, so skip them.
            if not values:
                continue
            word = values[0]
            embedding = np.array(values[1:], dtype='float32')
            word_embeddings[word] = embedding
    print("GloVe vectors loaded.")
    return word_embeddings

# Path to the 300-dimensional GloVe vectors file (presumably extracted from
# glove.6B.zip by the unzip cell - verify it exists before running).
glove_file = '/content/glove.6B.300d.txt'
# Load the whole word -> vector table into memory for the lookups below.
word_embeddings = load_glove_vectors(glove_file)

Loading GloVe vectors...
GloVe vectors loaded.


In [None]:
def create_headline_representation(headline, word_embeddings, embedding_dim=300):
    """Average the embeddings of the headline tokens that have a known vector.

    Args:
        headline: Iterable of tokens.
        word_embeddings: Mapping from token to its embedding vector.
        embedding_dim: Length of the embedding vectors.

    Returns:
        A float64 array of length ``embedding_dim`` holding the mean of the
        vectors that were found, or all zeros when no token has a vector.
    """
    # Look up the vectors first; tokens missing from the vocabulary are ignored.
    known_vectors = [word_embeddings[token] for token in headline if token in word_embeddings]

    # Accumulate in a float64 buffer, one vector at a time.
    total = np.zeros(embedding_dim)
    for vector in known_vectors:
        total += vector

    if not known_vectors:
        return total
    return total / len(known_vectors)

def _embed_headlines(headlines):
    """Return an array with one averaged 300-d embedding row per headline."""
    return np.array([
        create_headline_representation(tokens, word_embeddings, 300)
        for tokens in tqdm(headlines)
    ])


# Same embedding step for both splits.
X_train_glove = _embed_headlines(x_train)
X_test_glove = _embed_headlines(x_test)

print("Shape of training headline representations:", X_train_glove.shape)
print("Shape of testing headline representations:", X_test_glove.shape)

100%|██████████| 22895/22895 [00:01<00:00, 21478.44it/s]
100%|██████████| 5724/5724 [00:00<00:00, 27165.20it/s]

Shape of training headline representations: (22895, 300)
Shape of testing headline representations: (5724, 300)





In [None]:
# Fit a logistic-regression classifier on the averaged headline embeddings
# (fit returns the estimator itself, so construction and training can be chained).
logreg = LogisticRegression(max_iter=500).fit(X_train_glove, y_train)

# Predict labels for the held-out headlines.
y_pred = logreg.predict(X_test_glove)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the metrics in a fixed order.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.740041928721174
Precision: 0.7243303571428571
Recall: 0.72271714922049
F1-score: 0.7235228539576365
