In [1]:
import numpy as np
import pandas as pd
import json
from random import shuffle, sample
import os

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the JSONL data
def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

# Preprocess the data
def preprocess_data(df):
    df['label'] = df['score_delta'].apply(lambda x: 1 if x < 0 else 0) 
    return df

In [3]:
train_data = preprocess_data(load_data("data/train_sm.jsonl"))
test_data = preprocess_data(load_data("data/test_sm.jsonl"))
validation_data = preprocess_data(load_data("data/validation_sm.jsonl"))

print(train_data.head())

                                             message receiver_annotation  \
0  I see! Do you see an issue with me taking denm...                True   
1                                   Okay let me know                True   
2  Rgr.  Stand ready to support whatever you decide.                True   
3       Sidebar- what’re you gonna do about england?        NOANNOTATION   
4  Yea I’m here. I’m with you on that. Not cuttin...                True   

   sender_annotation  score_delta  label  
0               True            0      0  
1               True           -1      1  
2               True           -2      1  
3               True           -3      1  
4               True           -2      1  


In [4]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['message'])

X_train_seq = tokenizer.texts_to_sequences(train_data['message'])
X_test_seq = tokenizer.texts_to_sequences(test_data['message'])
X_validation_seq = tokenizer.texts_to_sequences(validation_data['message'])

max_seq_len = 200
X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_len, padding='post')
X_validation_padded = pad_sequences(X_validation_seq, maxlen=max_seq_len, padding='post')

y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])
y_validation = np.array(validation_data['label'])

model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_seq_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_train_padded, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_validation_padded, y_validation)
)

y_test_pred_prob = model.predict(X_test_padded)
y_test_pred = (y_test_pred_prob > 0.5).astype(int)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

y_validation_pred_prob = model.predict(X_validation_padded)
y_validation_pred = (y_validation_pred_prob > 0.5).astype(int)
print("Validation Accuracy:", accuracy_score(y_validation, y_validation_pred))
print("Validation Classification Report:\n", classification_report(y_validation, y_validation_pred))


Epoch 1/10




[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 30ms/step - accuracy: 0.6701 - loss: 0.6399 - val_accuracy: 0.7218 - val_loss: 0.5968
Epoch 2/10
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.6826 - loss: 0.6298 - val_accuracy: 0.7218 - val_loss: 0.6081
Epoch 3/10
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.6822 - loss: 0.6291 - val_accuracy: 0.7218 - val_loss: 0.5967
Epoch 4/10
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.6781 - loss: 0.6316 - val_accuracy: 0.7218 - val_loss: 0.5976
Epoch 5/10
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 30ms/step - accuracy: 0.6774 - loss: 0.6306 - val_accuracy: 0.7218 - val_loss: 0.6004
Epoch 6/10
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.6825 - loss: 0.6274 - val_accuracy: 0.7218 - val_loss: 0.5987
Epoch 7/10
[1m411/411[0m 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Validation Accuracy: 0.7217514124293786
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.72      1.00      0.84      1022
           1       0.00      0.00      0.00       394

    accuracy                           0.72      1416
   macro avg       0.36      0.50      0.42      1416
weighted avg       0.52      0.72      0.61      1416



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
import gensim.downloader as api

# Load GloVe and FastText embeddings outside the class
print("Loading GloVe embeddings...")
glove_vectors = api.load('glove-wiki-gigaword-100')
fasttext_vectors = None

Loading GloVe embeddings...


In [6]:
import pandas as pd
from data_processor import DataProcessor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import nltk 

  from .autonotebook import tqdm as notebook_tqdm


In [7]:

def jsonl_to_dataframe(jsonl_file):
    data = []
    with open(jsonl_file, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return pd.DataFrame(data)

train_df = jsonl_to_dataframe('data/train_sm.jsonl')
test_df = jsonl_to_dataframe('data/test_sm.jsonl')
validation_df = jsonl_to_dataframe('data/validation_sm.jsonl')
train_df.head()

Unnamed: 0,message,receiver_annotation,sender_annotation,score_delta
0,I see! Do you see an issue with me taking denm...,True,True,0
1,Okay let me know,True,True,-1
2,Rgr. Stand ready to support whatever you decide.,True,True,-2
3,Sidebar- what’re you gonna do about england?,NOANNOTATION,True,-3
4,Yea I’m here. I’m with you on that. Not cuttin...,True,True,-2


In [8]:
nltk.download('punkt_tab')

processor = DataProcessor(train_df, glove_vectors, fasttext_vectors)
X_train = processor.fit_transform(vectorization_method='glove')
y_train = np.array(train_data['label'])

processor_test = DataProcessor(test_df, glove_vectors, fasttext_vectors)
X_test = processor_test.fit_transform(vectorization_method='glove')
y_test = np.array(test_data['label'])

processor_val = DataProcessor(validation_df, glove_vectors, fasttext_vectors)
X_validation = processor_val.fit_transform(vectorization_method='glove')
y_validation = np.array(validation_data['label'])


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Akshat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



Completed fit_transform with method: glove

Completed fit_transform with method: glove

Completed fit_transform with method: glove


In [9]:
print(X_train.shape, y_train.shape)

(13132, 50, 100) (13132,)


In [None]:
model = Sequential([
    LSTM(64, input_shape=(50, 100), return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_validation, y_validation)
)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Epoch 1/15


  super().__init__(**kwargs)


[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6743 - loss: 0.6446 - val_accuracy: 0.7218 - val_loss: 0.5984
Epoch 2/15
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.6809 - loss: 0.6281 - val_accuracy: 0.7218 - val_loss: 0.5946
Epoch 3/15
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6753 - loss: 0.6330 - val_accuracy: 0.7218 - val_loss: 0.5933
Epoch 4/15
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6814 - loss: 0.6257 - val_accuracy: 0.7218 - val_loss: 0.5992
Epoch 5/15
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6785 - loss: 0.6294 - val_accuracy: 0.7218 - val_loss: 0.5944
Epoch 6/15
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6815 - loss: 0.6245 - val_accuracy: 0.7210 - val_loss: 0.5955
Epoch 7/15
[1m411/411[0m [32m━━━━━━━

In [11]:
y_test_pred_prob = model.predict(X_test)
y_test_pred = (y_test_pred_prob > 0.5).astype(int)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))

y_validation_pred_prob = model.predict(X_validation)
y_validation_pred = (y_validation_pred_prob > 0.5).astype(int)
print("Validation Accuracy:", accuracy_score(y_validation, y_validation_pred))
print("Validation Classification Report:\n", classification_report(y_validation, y_validation_pred))

[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Test Accuracy: 0.6909886902590295
Test Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.99      0.82      1901
           1       0.29      0.01      0.01       840

    accuracy                           0.69      2741
   macro avg       0.49      0.50      0.41      2741
weighted avg       0.57      0.69      0.57      2741

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Validation Accuracy: 0.719632768361582
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.72      1.00      0.84      1022
           1       0.20      0.00      0.01       394

    accuracy                           0.72      1416
   macro avg       0.46      0.50      0.42      1416
weighted avg       0.58      0.72      0.61      1416



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Embedding

model = Sequential([
    SimpleRNN(64, return_sequences=False, activation='tanh', input_shape=(50, 100)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data = (X_validation, y_validation)  
)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Epoch 1/25
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6671 - loss: 0.6416 - val_accuracy: 0.7218 - val_loss: 0.5945
Epoch 2/25
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6892 - loss: 0.6253 - val_accuracy: 0.7218 - val_loss: 0.5991
Epoch 3/25
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6795 - loss: 0.6328 - val_accuracy: 0.7218 - val_loss: 0.5990
Epoch 4/25
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6865 - loss: 0.6247 - val_accuracy: 0.7218 - val_loss: 0.6013
Epoch 5/25
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6744 - loss: 0.6324 - val_accuracy: 0.7218 - val_loss: 0.5976
Epoch 6/25
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6806 - loss: 0.6274 - val_accuracy: 0.7218 - val_loss: 0.6000
Epoch 7/25
[1m411/411[0m 