In [1]:
import numpy as np
import pandas as pd
import json
from random import shuffle, sample
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the conversation-level training split; lines=True parses one JSON object per line.
train_df = pd.read_json('data/train.jsonl', lines=True)
# Preview the first rows (each cell of the per-message columns holds a whole list).
train_df.head()

Unnamed: 0,messages,sender_labels,receiver_labels,speakers,receivers,absolute_message_index,relative_message_index,seasons,years,game_score,game_score_delta,players,game_id
0,[Germany!\n\nJust the person I want to speak w...,"[True, True, True, True, True, True, True, Tru...","[True, True, True, True, NOANNOTATION, NOANNOT...","[italy, germany, italy, germany, italy, italy,...","[germany, italy, germany, italy, germany, germ...","[74, 76, 86, 87, 89, 92, 97, 117, 119, 121, 12...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[Spring, Spring, Spring, Spring, Spring, Sprin...","[1901, 1901, 1901, 1901, 1901, 1901, 1901, 190...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[italy, germany]",1
1,[Hello there! What's your general plan for thi...,"[True, False, True, False, True, True, True, T...","[True, True, True, True, True, NOANNOTATION, T...","[austria, italy, austria, italy, italy, austri...","[italy, austria, italy, austria, austria, ital...","[1, 67, 71, 73, 98, 99, 101, 179, 181, 185, 18...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[Spring, Spring, Spring, Spring, Spring, Sprin...","[1901, 1901, 1901, 1901, 1901, 1901, 1901, 190...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 4, 4, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 1, -1, -...","[italy, austria]",1
2,[Buongiorno! \nBe kinda nice to know if you're...,"[True, True, False, True, True, True, True, Tr...","[True, False, True, False, True, True, NOANNOT...","[russia, italy, russia, italy, russia, italy, ...","[italy, russia, italy, russia, italy, russia, ...","[11, 50, 52, 57, 61, 66, 77, 85, 96, 102, 116,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[Spring, Spring, Spring, Spring, Spring, Sprin...","[1901, 1901, 1901, 1901, 1901, 1901, 1901, 190...","[4, 3, 4, 3, 4, 3, 4, 3, 3, 3, 4, 3, 3, 4, 4, ...","[1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1...","[italy, russia]",1
3,[Hey italy! good luck this game. I'm guessing ...,"[True, False, True, True, True, True, True, Tr...","[NOANNOTATION, True, True, False, True, True, ...","[england, italy, england, england, england, it...","[italy, england, italy, italy, italy, england,...","[32, 95, 106, 107, 108, 110, 113, 125, 126, 12...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[Spring, Spring, Spring, Spring, Spring, Sprin...","[1901, 1901, 1901, 1901, 1901, 1901, 1901, 190...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[italy, england]",1
4,[Hello Italy what’s up what are your thoughts ...,"[True, False, False, True, True, True, True, T...","[NOANNOTATION, True, True, True, True, True, N...","[turkey, italy, italy, italy, turkey, italy, t...","[italy, turkey, turkey, turkey, italy, turkey,...","[45, 94, 103, 150, 154, 178, 192, 194, 195, 19...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[Spring, Spring, Spring, Spring, Fall, Fall, F...","[1901, 1901, 1901, 1901, 1901, 1901, 1901, 190...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 1...","[italy, turkey]",1


In [3]:
def to_single_message_format(gamefile, seed=None):
    """Flatten a conversation-level JSONL file into one dict per message.

    Each line of ``gamefile`` is a JSON object whose per-message fields
    (``messages``, ``sender_labels``, ``receiver_labels``, ``speakers``,
    ``receivers``, ``absolute_message_index``, ``relative_message_index``,
    ``seasons``, ``years``, ``game_score``, ``game_score_delta``) are parallel
    lists. The lists are zipped together and every position becomes one record
    that also carries the conversation's ``game_id``.

    Args:
        gamefile: Path to the conversation-level JSONL file (read as UTF-8).
        seed: Optional seed for the final shuffle. ``None`` (the default)
            keeps the previous behaviour of shuffling with the module-level
            ``random`` state; any other value gives a reproducible order.

    Returns:
        A shuffled list of message dicts. ``year``, ``game_score`` and
        ``game_score_delta`` are converted to ``int``.

    Raises:
        KeyError: If a conversation lacks one of the expected fields.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Local import so the function does not depend on an extra notebook-level import.
    from random import Random

    messages = []
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere,
    # which would corrupt non-ASCII text such as curly apostrophes.
    with open(gamefile, encoding='utf-8') as inh:
        for ln in inh:
            if not ln.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            conversation = json.loads(ln)
            # NOTE: zip stops at the shortest list, so a conversation with
            # mismatched list lengths is silently truncated.
            for (msg, sender_label, receiver_label, speaker, receiver,
                 abs_index, rel_index, season, year, game_score,
                 game_score_delta) in zip(
                    conversation['messages'], conversation['sender_labels'],
                    conversation['receiver_labels'], conversation['speakers'],
                    conversation['receivers'],
                    conversation['absolute_message_index'],
                    conversation['relative_message_index'],
                    conversation['seasons'], conversation['years'],
                    conversation['game_score'],
                    conversation['game_score_delta']):
                messages.append({
                    'message': msg,
                    'receiver_annotation': receiver_label,
                    'sender_annotation': sender_label,
                    'speaker': speaker,
                    'receiver': receiver,
                    'absolute_message_index': abs_index,
                    'relative_message_index': rel_index,
                    'season': season,
                    'year': int(year),
                    'game_score': int(game_score),
                    'game_score_delta': int(game_score_delta),
                    'game_id': conversation['game_id']
                })
    if seed is None:
        # Unseeded path: identical to the original, honours random.seed() set by the caller.
        shuffle(messages)
    else:
        # Dedicated generator so a fixed seed does not disturb the global random state.
        Random(seed).shuffle(messages)
    return messages

In [4]:
def write_single_messages(messages, outfile):
    """Write every message dict to ``outfile`` as one JSON document per line.

    The file is opened in write mode, so existing content is replaced.
    """
    # Lazily serialise so records are encoded only while the file is open.
    serialized_lines = (json.dumps(record)+'\n' for record in messages)
    with open(outfile, "w") as sink:
        sink.writelines(serialized_lines)

In [5]:
# Directory holding both the conversation-level inputs and the flattened outputs.
ROOT = "data"
# Flatten each split to one message per line. Every call shuffles the message order
# and overwrites the corresponding *_sm.jsonl file.
write_single_messages(to_single_message_format(os.path.join(ROOT, 'validation.jsonl')) , os.path.join(ROOT, 'validation_sm.jsonl'))
write_single_messages(to_single_message_format(os.path.join(ROOT, 'train.jsonl')) , os.path.join(ROOT, 'train_sm.jsonl'))
write_single_messages(to_single_message_format(os.path.join(ROOT, 'test.jsonl')) ,  os.path.join(ROOT, 'test_sm.jsonl'))

---

In [12]:
# Load the JSONL data
def load_data(file_path):
    """Parse a UTF-8 JSONL file and return its records as a DataFrame (one row per line)."""
    with open(file_path, 'r', encoding='utf-8') as f:
        records = [json.loads(raw_line) for raw_line in f]
    return pd.DataFrame(records)

# Preprocess the data
def preprocess_data(df):
    """Add a binary ``label`` column: 1 where the score delta is negative, else 0.

    The delta is read from ``score_delta`` when that column exists (the name
    this function originally expected) and otherwise from ``game_score_delta``,
    the name written by ``to_single_message_format``. Without the fallback, a
    freshly generated ``*_sm.jsonl`` file raises ``KeyError`` here.

    Args:
        df: DataFrame with a ``score_delta`` or ``game_score_delta`` column.

    Returns:
        The same DataFrame object, modified in place with the new ``label`` column.

    Raises:
        KeyError: If neither delta column is present.
    """
    for delta_column in ('score_delta', 'game_score_delta'):
        if delta_column in df.columns:
            break
    else:
        raise KeyError("expected a 'score_delta' or 'game_score_delta' column")
    # Vectorised comparison; missing values compare False and therefore get label 0,
    # exactly as the original per-row lambda did. int64 matches the original dtype.
    df['label'] = (df[delta_column] < 0).astype('int64')
    return df


# Load each flattened (one message per line) split and attach the binary `label` column.
train_data = preprocess_data(load_data("data/train_sm.jsonl"))
test_data = preprocess_data(load_data("data/test_sm.jsonl"))
validation_data = preprocess_data(load_data("data/validation_sm.jsonl"))

# Quick sanity check of the labelled training frame.
print(train_data.head())

                                             message receiver_annotation  \
0  I see! Do you see an issue with me taking denm...                True   
1                                   Okay let me know                True   
2  Rgr.  Stand ready to support whatever you decide.                True   
3       Sidebar- what’re you gonna do about england?        NOANNOTATION   
4  Yea I’m here. I’m with you on that. Not cuttin...                True   

   sender_annotation  score_delta  label  
0               True            0      0  
1               True           -1      1  
2               True           -2      1  
3               True           -3      1  
4               True           -2      1  


In [13]:
# TF-IDF features: the vocabulary (capped at 500 terms) is fitted on the training
# messages only, then reused unchanged for the test and validation splits.
vectorizer = TfidfVectorizer(max_features=500)
X_train_text = vectorizer.fit_transform(train_data['message'])
X_test_text, X_validation_text = (
    vectorizer.transform(split['message']) for split in (test_data, validation_data)
)

y_train, y_test, y_validation = (
    train_data['label'], test_data['label'], validation_data['label']
)

# Baseline classifier with default hyper-parameters.
model = LogisticRegression()
model.fit(X_train_text, y_train)

y_test_pred = model.predict(X_test_text)
y_validation_pred = model.predict(X_validation_text)

# Same two metrics for both held-out splits, test first.
for accuracy_label, report_label, y_true, y_pred in (
    ("Test Accuracy:", "Test Classification Report:\n", y_test, y_test_pred),
    ("Validation Accuracy:", "Validation Classification Report:\n", y_validation, y_validation_pred),
):
    print(accuracy_label, accuracy_score(y_true, y_pred))
    print(report_label, classification_report(y_true, y_pred))

Test Accuracy: 0.6847865742429771
Test Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.96      0.81      1901
           1       0.40      0.05      0.10       840

    accuracy                           0.68      2741
   macro avg       0.55      0.51      0.45      2741
weighted avg       0.61      0.68      0.59      2741

Validation Accuracy: 0.7161016949152542
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.97      0.83      1022
           1       0.41      0.05      0.08       394

    accuracy                           0.72      1416
   macro avg       0.57      0.51      0.46      1416
weighted avg       0.64      0.72      0.62      1416



In [7]:
def jsonl_to_dataframe(jsonl_file):
    """Read a JSONL file into a DataFrame with one row per JSON line.

    Args:
        jsonl_file: Path to a file containing one JSON object per line.

    Returns:
        A ``pandas.DataFrame`` built from the parsed objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 (matching the other JSONL readers in this notebook) so the
    # result does not depend on the platform's default encoding.
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing empty line) instead of failing in json.loads.
        data = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(data)


In [8]:
# Load the flattened (one message per row) training split.
df = jsonl_to_dataframe('data/train_sm.jsonl')
# Preview the first rows.
df.head()

Unnamed: 0,message,receiver_annotation,sender_annotation,speaker,receiver,absolute_message_index,relative_message_index,season,year,game_score,game_score_delta,game_id
0,Thoughts on builds?,True,True,france,england,345,87,Spring,1902,5,1,1
1,Yup. Gonna head east/south now. Gotta eliminat...,True,True,germany,france,891,76,Winter,1907,8,-2,7
2,"So would I, Italy! I think I'm going to focus ...",False,False,france,italy,88,1,Fall,1901,3,0,8
3,Anonymity is tough to keep up with the discord...,True,True,france,germany,1079,110,Spring,1905,5,0,1
4,It would appear that way,True,True,germany,england,134,8,Fall,1901,3,0,5


In [9]:
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import emoji
import numpy as np
import torch
from torch.utils.data import Dataset
import gensim.downloader as api

# Fetch the NLTK resources needed for stop-word filtering and tokenisation
# (reported as "already up-to-date" when they are present locally).
nltk.download('stopwords')
nltk.download('punkt')

# Load GloVe and FastText embeddings outside the class
# so they are loaded once and can be passed to any number of DataProcessor instances.
# NOTE(review): presumably gensim downloads and caches these models on first use — confirm.
print("Loading GloVe embeddings...")
glove_vectors = api.load('glove-wiki-gigaword-100')
print("Loading FastText embeddings...")
fasttext_vectors = api.load('fasttext-wiki-news-subwords-300')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading GloVe embeddings...
Loading FastText embeddings...


In [10]:
import nltk
# NOTE(review): word_tokenize fails in this environment unless the 'punkt_tab' resource
# is installed, so it is fetched here. Presumably the installed NLTK release loads its
# tokenizer data from 'punkt_tab' rather than 'punkt' — confirm, then move this call
# next to the other nltk.download() calls.
nltk.download('punkt_tab')
import torch

class DataProcessor:
    """Clean the ``message`` column of a DataFrame and turn it into numeric vectors.

    Supported vectorisation methods are ``'tfidf'``, ``'glove'``, ``'fasttext'``
    and ``'bert'`` (see :meth:`vectorize`).

    Note that the DataFrame passed in is stored by reference:
    :meth:`preprocess_dataset` adds a ``cleaned_message`` column to that same object.
    """

    # Translation table that deletes every ASCII punctuation character. Built once
    # instead of once per message. Non-ASCII punctuation (e.g. curly apostrophes)
    # is not part of string.punctuation and is therefore left in the text.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, dataframe, glove_vectors, fasttext_vectors):
        """Store the data and the pre-loaded embedding models.

        Args:
            dataframe: DataFrame with a ``message`` column of strings.
            glove_vectors: Embedding model supporting ``model[token]`` lookup
                (raising ``KeyError`` for unknown tokens) and ``vector_size``.
            fasttext_vectors: Same interface as ``glove_vectors``.
        """
        self.df = dataframe
        self.stop_words = set(stopwords.words('english'))
        self.glove_vectors = glove_vectors
        self.fasttext_vectors = fasttext_vectors
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Fitted TF-IDF vectorizer from the most recent vectorize('tfidf') call, kept
        # so other splits can be transformed with the same vocabulary.
        self.vectorizer = None

    def preprocess_text(self, text):
        """Return ``text`` without emojis, ASCII punctuation and English stop words.

        The text is lower-cased and tokenised with ``word_tokenize``; the
        remaining tokens are joined with single spaces.
        """
        # Remove emojis
        text = emoji.replace_emoji(text, replace='')
        # Remove punctuation
        text = text.translate(self._PUNCTUATION_TABLE)
        # Remove stop words and extra spaces
        kept_tokens = [word for word in word_tokenize(text.lower()) if word not in self.stop_words]
        return ' '.join(kept_tokens)

    def preprocess_dataset(self):
        """Add a ``cleaned_message`` column to ``self.df`` (modifies it in place)."""
        self.df['cleaned_message'] = self.df['message'].apply(self.preprocess_text)

    def vectorize(self, method='tfidf'):
        """Vectorise ``self.df['cleaned_message']`` with the chosen method.

        Args:
            method: One of ``'tfidf'``, ``'glove'``, ``'fasttext'``, ``'bert'``.

        Returns:
            ``'tfidf'``: the matrix returned by ``TfidfVectorizer.fit_transform``.
            ``'glove'`` / ``'fasttext'``: array of shape
            ``(n_texts, max_length, vector_size)``.
            ``'bert'``: array with one [CLS] embedding per text.

        Raises:
            ValueError: If ``method`` is not supported.
        """
        if 'cleaned_message' not in self.df.columns:
            # Allow calling vectorize() directly, without a prior preprocess_dataset().
            self.preprocess_dataset()
        cleaned = self.df['cleaned_message']

        if method == 'tfidf':
            self.vectorizer = TfidfVectorizer()
            vectors = self.vectorizer.fit_transform(cleaned)
        elif method == 'glove':
            vectors = self._get_embeddings(cleaned, self.glove_vectors)
        elif method == 'fasttext':
            vectors = self._get_embeddings(cleaned, self.fasttext_vectors)
        elif method == 'bert':
            vectors = self._get_bert_embeddings(cleaned)
        else:
            raise ValueError("Unsupported vectorization method")
        return vectors

    def _get_bert_embeddings(self, texts, batch_size=32):
        """Return the ``bert-base-uncased`` [CLS] embedding of every text.

        Texts are encoded in mini-batches of ``batch_size``. Sending the whole
        corpus through the model in one forward pass needs memory proportional
        to ``n_texts * longest_sequence`` and exhausts RAM/GPU memory on
        anything but tiny datasets.

        Args:
            texts: Iterable of strings (e.g. a pandas Series).
            batch_size: Number of texts per forward pass.

        Returns:
            Array of shape ``(n_texts, hidden_size)``.
        """
        from transformers import BertTokenizer, BertModel

        # Load pre-trained BERT model and tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased').to(self.device)
        model.eval()  # inference only: make sure dropout is disabled

        text_list = list(texts)
        batch_embeddings = []
        with torch.no_grad():
            for start in range(0, len(text_list), batch_size):
                # Tokenize one mini-batch; padding is to the longest text in the batch.
                encoded_inputs = tokenizer(
                    text_list[start:start + batch_size],
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                    max_length=512
                ).to(self.device)
                outputs = model(**encoded_inputs)
                # Use the [CLS] token representation as the sentence embedding
                batch_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

        if not batch_embeddings:
            # No texts: return an empty array that still has the embedding width.
            return np.zeros((0, model.config.hidden_size), dtype=np.float32)
        return np.concatenate(batch_embeddings, axis=0)

    def _get_embeddings(self, texts, embedding_model, max_length=50):
        """Look up a word vector for each token, padded/truncated to ``max_length``.

        Args:
            texts: Iterable of strings.
            embedding_model: Object supporting ``model[token]`` (raising
                ``KeyError`` for unknown tokens) and exposing ``vector_size``.
            max_length: Number of token positions kept per text.

        Returns:
            Array of shape ``(n_texts, max_length, vector_size)``. Positions
            past the end of a text, and out-of-vocabulary tokens, are zeros.
        """
        text_list = list(texts)
        # Pre-allocate the zero-filled result; padding and OOV rows need no extra work.
        embeddings = np.zeros((len(text_list), max_length, embedding_model.vector_size))
        for row, text in enumerate(text_list):
            for col, token in enumerate(word_tokenize(text)[:max_length]):
                try:
                    embeddings[row, col] = embedding_model[token]
                except KeyError:
                    # Out-of-vocabulary token: keep the zero vector.
                    continue
        return embeddings

    def fit_transform(self, vectorization_method='tfidf'):
        """Clean the messages, vectorise them and return the vectors."""
        self.preprocess_dataset()
        vectors = self.vectorize(method=vectorization_method)
        print("\nCompleted fit_transform with method:", vectorization_method)
        return vectors

# Example usage: clean the messages in `df` and embed them with GloVe.
# fit_transform() adds a 'cleaned_message' column to `df` as a side effect.
processor = DataProcessor(df, glove_vectors, fasttext_vectors)
vectors = processor.fit_transform(vectorization_method='glove')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



Completed fit_transform with method: glove


In [13]:
import json
import pandas as pd
from feature_extractor import FeatureExtractor

def read_jsonl(file_path):
    """Return the parsed JSON objects of a UTF-8 JSONL file as a list, in file order."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.loads(raw_line) for raw_line in file]

def extract_features_from_messages(data):
    """Build one feature dict per entry: text features plus the entry's metadata fields.

    The text features come from ``FeatureExtractor.extract_features`` applied
    to ``entry['message']``; the metadata fields listed below are then copied
    from the entry into the same dict.
    """
    # Metadata copied verbatim from every entry, in this order.
    passthrough_fields = (
        'receiver_annotation',
        'sender_annotation',
        'speaker',
        'receiver',
        'absolute_message_index',
        'relative_message_index',
        'season',
        'year',
        'game_score',
        'game_score_delta',
        'game_id',
    )
    extractor = FeatureExtractor()
    rows = []
    for entry in data:
        row = extractor.extract_features(entry['message'])
        row.update({field: entry[field] for field in passthrough_fields})
        rows.append(row)
    return rows

def create_dataframe(features_list):
    """Turn a list of feature dicts into a DataFrame with one row per dict."""
    frame = pd.DataFrame(data=features_list)
    return frame

def process_jsonl_to_dataframe(file_path):
    """Read a JSONL file, extract per-message features and return them as a DataFrame."""
    # Pipeline: raw records -> feature dicts -> DataFrame.
    return create_dataframe(extract_features_from_messages(read_jsonl(file_path)))

# Example usage
# NOTE: this rebinds `df`, which earlier cells used for the raw flattened training frame.
file_path = 'data/train_sm.jsonl'  # Replace with your actual file path
df = process_jsonl_to_dataframe(file_path)
# Plain-text preview of the first rows of the feature frame.
print(df.head())

   sentiment_polarity  avg_sentence_length  avg_word_length  type_token_ratio  \
0            0.321429                 18.0         3.500000          0.888889   
1            0.650000                  4.0         3.416667          0.916667   
2           -0.194444                  9.0         5.111111          1.000000   
3            0.000000                  0.0         0.000000          0.000000   
4           -0.312500                  5.5         4.181818          1.000000   

   function_word_count  pronoun_usage  third_person_pronoun_count  \
0                    2              2                           0   
1                    4              1                           0   
2                    2              0                           0   
3                    0              0                           0   
4                    2              1                           0   

   flesch_reading_ease  flesch_kincaid_grade  comma_count  ...  \
0                87.05          

In [14]:
# Show the first 20 rows of the feature frame.
df.head(20)

Unnamed: 0,sentiment_polarity,avg_sentence_length,avg_word_length,type_token_ratio,function_word_count,pronoun_usage,third_person_pronoun_count,flesch_reading_ease,flesch_kincaid_grade,comma_count,...,sender_annotation,speaker,receiver,absolute_message_index,relative_message_index,season,year,game_score,game_score_delta,game_id
0,0.321429,18.0,3.5,0.888889,2,2,0,87.05,5.6,0,...,True,italy,austria,1215,301,Fall,1902,4,-1,2
1,0.65,4.0,3.416667,0.916667,4,1,0,84.68,4.4,0,...,True,england,italy,335,70,Fall,1901,3,0,3
2,-0.194444,9.0,5.111111,1.0,2,0,0,45.42,9.2,0,...,True,italy,england,225,32,Fall,1901,3,0,3
3,0.0,0.0,0.0,0.0,0,0,0,206.84,-15.7,0,...,True,england,germany,1336,284,Fall,1905,6,-1,3
4,-0.3125,5.5,4.181818,1.0,2,1,0,78.25,4.8,0,...,True,germany,italy,2122,260,Fall,1907,7,-3,1
5,0.0,8.0,4.125,1.0,2,0,0,80.28,4.1,0,...,True,turkey,italy,919,42,Fall,1902,4,0,9
6,0.0,5.0,3.8,1.0,1,1,0,117.16,-1.9,0,...,True,germany,england,2975,456,Spring,1907,10,2,2
7,0.033333,11.0,3.909091,1.0,5,2,0,85.69,4.0,1,...,True,russia,italy,834,37,Winter,1906,4,-1,7
8,0.55,9.0,3.888889,0.888889,10,1,0,91.61,3.8,0,...,True,france,italy,729,27,Spring,1903,5,0,3
9,0.6,13.0,3.333333,0.74359,8,5,1,78.08,7.0,4,...,True,austria,russia,350,44,Spring,1902,5,0,10


In [15]:
# (rows, columns) of the feature frame.
df.shape

(13132, 33)