In [59]:
import json
import nltk
import re
import string
import pandas as pd
import numpy as np
from sklearn import svm
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [60]:
import warnings
warnings.filterwarnings('ignore')

In [61]:
target_columns = ['messages', 'sender_labels', 'receiver_labels',
                  'speakers', 'receivers', 'absolute_message_index',
                  'seasons', 'years', 'game_id']

stopwords = nltk.corpus.stopwords.words('english')
punctuations = string.punctuation + '\’\”.'

In [62]:
def read_messages_from_file(filename):
    games = []

    with open('data/' + filename) as f:
        for line in f:
            game = json.loads(line)
            games.append(game)
            
    df = pd.DataFrame(games)
    df = df[target_columns]
    df = df.set_index('game_id').apply(pd.Series.explode).reset_index()
    df = df.drop(df.loc[df['sender_labels'].isnull()].index)
    return df

In [63]:
train_df = read_messages_from_file('train.jsonl')
train_df.loc[train_df['game_id'] == 1]

Unnamed: 0,game_id,messages,sender_labels,receiver_labels,speakers,receivers,absolute_message_index,seasons,years
0,1,Germany!\n\nJust the person I want to speak wi...,True,True,italy,germany,74,Spring,1901
1,1,"You've whet my appetite, Italy. What's the sug...",True,True,germany,italy,76,Spring,1901
2,1,👍,True,True,italy,germany,86,Spring,1901
3,1,It seems like there are a lot of ways that cou...,True,True,germany,italy,87,Spring,1901
4,1,"Yeah, I can’t say I’ve tried it and it works, ...",True,NOANNOTATION,italy,germany,89,Spring,1901
...,...,...,...,...,...,...,...,...,...
2614,1,"Hi turkey, how do you feel about your neighbor...",True,True,france,turkey,26,Spring,1901
2615,1,Well yeah depends on the neighbors,True,True,turkey,france,42,Spring,1901
2616,1,Any additional thoughts after negotiations by ...,True,True,france,turkey,140,Spring,1901
2617,1,No not yet everyone seems rather tepid,True,True,turkey,france,155,Fall,1901


In [6]:
train_df['sender_labels'].unique()

array([True, False], dtype=object)

In [37]:
labels_col = train_df['sender_labels']
len(labels_col.unique())

# len(train_df['sender_labels'].unique()) == 2

2

In [7]:
def remove_emojis(message):
    emoji_pattern = re.compile(
                pattern = u"[\U0001F600-\U0001F64F"     # emoticons
                            "\U0001F300-\U0001F5FF"     # symbols & pictographs
                            "\U0001F680-\U0001F6FF"     # transport & map symbols
                            "\U0001F1E0-\U0001FAD6]+",  # flags (iOS)
                flags = re.UNICODE)
    return emoji_pattern.sub(r'', message)

In [8]:
def clean_message(message):
    message = (message.replace('\n', ' ')
                      .replace('-', ' - ')
                      .replace('...', ' ')
                      .replace('???', '?')
              )
    message = remove_emojis(message)
    
    return message.strip()

In [9]:
train_df['messages'] = train_df['messages'].apply(lambda x: clean_message(x))
train_df = train_df.drop(train_df.loc[train_df['messages'] == ''].index)

In [10]:
train_df['messages']

0        Germany!  Just the person I want to speak with...
1        You've whet my appetite, Italy. What's the sug...
3        It seems like there are a lot of ways that cou...
4        Yeah, I can’t say I’ve tried it and it works, ...
5        I am just sensing that you don’t like this ide...
                               ...                        
13131    Is there any way of me actually ending this co...
13132        Can we agree on peace? What are your demands?
13133    Neutrality in exchange for current holdings, S...
13134    Thats a bit too much, can I keep Spain and i h...
13135                                           Any deals?
Name: messages, Length: 13091, dtype: object

In [11]:
# Check if labels are balanced
train_df['sender_labels'].value_counts()  #highly unbalanced

True     12502
False      589
Name: sender_labels, dtype: int64

In [12]:
# train_labels = train_df['sender_labels']
# train_df = train_df.drop(['sender_labels'], axis=1)

In [13]:
# train_df.shape, train_labels.shape

In [14]:
# train_labels = train_labels.values.reshape((len(train_labels), 1)).astype(int)

In [15]:
# train_labels.shape

In [16]:
# oversampler = SMOTE(random_state=2)
# X, y = oversampler.fit_resample(train_df, train_labels.ravel())

In [17]:
def tokenize_and_remove_stopwords(message):
    tokens = nltk.tokenize.casual.casual_tokenize(message, reduce_len=True)
    tokens = [t for t in tokens if t not in stopwords and t not in punctuations]

    return tokens

In [18]:
def create_vectorizer(training_data):
    vectorizer = TfidfVectorizer(max_df=0.90,
                                 max_features=100000,
                                 min_df=0.05,
                                 stop_words=stopwords,
                                 use_idf=True,
                                 tokenizer=tokenize_and_remove_stopwords,
                                 ngram_range=(1,3)
                                )
    vectorizer.fit_transform(training_data)
    return vectorizer

In [19]:
def train_message_classifier(X_train, y_train):
    classifier = svm.SVC(C=1, kernel='linear', decision_function_shape='ovo')
    classifier.fit(X_train, y_train)
    return classifier

In [20]:
vectorizer = create_vectorizer(train_df['messages'])
training_matrix = vectorizer.transform(train_df['messages']).todense()

In [21]:
oversampler = SMOTE(sampling_strategy='minority', k_neighbors=5)

train_labels = train_df['sender_labels'].astype(int)
X, y = oversampler.fit_resample(training_matrix, train_labels.ravel())

In [22]:
from collections import Counter
counter = Counter(y)
counter

Counter({1: 12502, 0: 12502})

In [23]:
classifier = train_message_classifier(X, y)

In [24]:
def evaluate(classifier, messages, real_labels):
    validation_matrix = vectorizer.transform(messages).todense()
    predictions = classifier.predict(validation_matrix)
    predictions = [bool(p) for p in list(predictions)]

    df = pd.DataFrame({
        'Messages': messages,
        'Prediction': predictions,
        'Reality': real_labels,
    })
    
#     print(type(predictions), type(real_labels))
    print(classification_report(list(real_labels), predictions))
    
    return df

### Validate

In [29]:
val_df = read_messages_from_file('validation.jsonl')

In [30]:
val_results = evaluate(classifier, val_df['messages'], val_df['sender_labels'])

              precision    recall  f1-score   support

       False       0.06      0.59      0.11        56
        True       0.97      0.62      0.76      1360

    accuracy                           0.62      1416
   macro avg       0.52      0.61      0.43      1416
weighted avg       0.94      0.62      0.73      1416



In [31]:
val_results.loc[val_results['Reality'] == False]

Unnamed: 0,Messages,Prediction,Reality
163,"Neither of us have a good reason to stab you, ...",False,False
165,"I’ll do that, it keeps turkey from snagging Gre",True,False
167,"So am I, I’d expect they go through me before ...",True,False
304,"I would say Trieste, Greece to me, and then Se...",True,False
333,"On the other hand, I’m not sure I want to atta...",False,False
404,I also thought I would let you know that I hea...,False,False
405,So... I guess your new? What do you think abou...,False,False
421,"While that might be true, what would you sugge...",True,False
429,I am willing to concede it to you if you coope...,True,False
430,"Hmm. I messaged austria, and they didn't respo...",False,False


### Test

In [32]:
test_df = read_messages_from_file('test.jsonl')
evaluate(classifier, test_df['messages'], test_df['sender_labels'])

              precision    recall  f1-score   support

       False       0.11      0.48      0.18       240
        True       0.93      0.63      0.75      2501

    accuracy                           0.61      2741
   macro avg       0.52      0.55      0.46      2741
weighted avg       0.86      0.61      0.70      2741



Unnamed: 0,Messages,Prediction,Reality
0,"Hi Italy! Just opening up communication, and I...",False,True
1,"Well....that's a great question, and a lot of ...",False,True
2,"Well, if you want to attack France in the Medi...",False,True
3,"Hello, I'm just asking about your move to Tyro...",True,True
4,Totally understandable - but did you notice th...,False,True
...,...,...,...
2736,"Interesting, I didn't mean to take Naples- I f...",False,False
2737,Interesting choice to move to Albania as it me...,True,True
2738,*Austria can retreat to Greece,True,True
2739,"This game is over, spending more than 2 minute...",False,True


In [33]:
# Test on own message
test_message1 = "Austria won't move ADR out, they'll support Ven with it I imagine." \
               "I'm not quite sure whether I can trust you so it may take me a turn or two to see." \
               "I'll attack Ven and if you can support with Rom that would be most appreciated. "
test_label1 = False
evaluate(classifier, [test_message1], [test_label1])

              precision    recall  f1-score   support

       False       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



Unnamed: 0,Messages,Prediction,Reality
0,"Austria won't move ADR out, they'll support Ve...",False,False


In [34]:
test_message2 = "Sorry I also meant should I move to Rom so you can take Nap with APU? " \
                "otherwise you will need to lose two armies."
test_label2 = True
evaluate(classifier, [test_message2], [test_label2])

              precision    recall  f1-score   support

        True       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



Unnamed: 0,Messages,Prediction,Reality
0,Sorry I also meant should I move to Rom so you...,True,True


In [529]:
remove_emojis('👍🤦a')


'a'

In [376]:
def read_messages_from_file(filename):
    games = []

    with open(filename) as f:
        for line in f:
            game = json.loads(line)
            games.append(game)
            
    messages = []
    labels = []
    print('Features', games[0].keys())

    for m in [game.get('messages') for game in games]:
        messages.extend(m)

    for label in [game.get('sender_labels') for game in games]:
        labels.extend(label)
        
    return messages, labels

In [377]:
training_messages, training_labels = read_messages_from_file('train.jsonl')

Features dict_keys(['messages', 'sender_labels', 'receiver_labels', 'speakers', 'receivers', 'absolute_message_index', 'relative_message_index', 'seasons', 'years', 'game_score', 'game_score_delta', 'players', 'game_id'])


In [378]:
len(training_messages), len(training_labels)

(13132, 13132)

In [272]:
set(training_labels)

{False, True}

In [273]:
training_messages[:3]

['Germany!\n\nJust the person I want to speak with. I have a somewhat crazy idea that I’ve always wanted to try with I/G, but I’ve never actually convinced the other guy to try it. And, what’s worse, it might make you suspicious of me. \n\nSo...do I suggest it?\n\nI’m thinking that this is a low stakes game, not a tournament or anything, and an interesting and unusual move set might make it more fun? That’s my hope anyway.\n\nWhat is your appetite like for unusual and crazy?',
 "You've whet my appetite, Italy. What's the suggestion?",
 '👍']

In [274]:
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [275]:
punctuations = string.punctuation + '\’\”.'

In [294]:
def tokenize_and_stem(message):
    message = (message.replace('\n', ' ')
                      .replace('-', ' - ')
                      .replace('...', ' ')
                      .replace('???', '?')
              )
    message = remove_emojis(message)
    tokens = nltk.tokenize.casual.casual_tokenize(message, reduce_len=True)
    tokens = [t for t in tokens if t not in stopwords and t not in punctuations]
#     lemma = [lemmatizer.lemmatize(token) for token in tokens]
    
#     print(message, '\n' ,tokens)
    return tokens

In [295]:
def create_vectorizer(training_data):
    vectorizer = TfidfVectorizer(max_df=0.90,
                                 max_features=100000,
                                 min_df=0.05,
                                 stop_words=stopwords,
                                 use_idf=True,
                                 tokenizer=tokenize_and_stem,
                                 ngram_range=(1,3)
                                )
    vectorizer.fit_transform(training_data)
    return vectorizer

In [296]:
def train_message_classifier(X_train, y_train):
    classifier = svm.SVC(C=1, kernel='linear', decision_function_shape='ovo')
    classifier.fit(X_train, y_train)
    return classifier

In [297]:
vectorizer = create_vectorizer(training_messages)
# print(np.array(vectorizer.get_feature_names()))
training_matrix = vectorizer.transform(training_messages).todense()
classifier = train_message_classifier(training_matrix, training_labels)

In [369]:
def evaluate(classifier, messages, real_labels):
    validation_matrix = vectorizer.transform(messages).todense()
    predictions = classifier.predict(validation_matrix)

    
    print(classification_report(real_labels, predictions))
    print('\n')

#     print(list(predictions), list(real_labels))
    df = pd.DataFrame({
        'Messages': messages,
        'Prediction': list(predictions),
        'Reality': real_labels,
    })
#     return df
    return df.loc[df['Prediction'] != df['Reality']]

### Validate

In [370]:
validation_messages, validation_labels = read_messages_from_file('validation.jsonl')

In [371]:
len(validation_messages), len(validation_labels)

(1416, 1416)

In [372]:
wrong_predictions = evaluate(classifier, validation_messages, validation_labels)
wrong_predictions.head()

              precision    recall  f1-score   support

       False       0.00      0.00      0.00        56
        True       0.96      1.00      0.98      1360

    accuracy                           0.96      1416
   macro avg       0.48      0.50      0.49      1416
weighted avg       0.92      0.96      0.94      1416





Unnamed: 0,Messages,Prediction,Reality
163,"Neither of us have a good reason to stab you, ...",True,False
165,"I’ll do that, it keeps turkey from snagging Gre",True,False
167,"So am I, I’d expect they go through me before ...",True,False
304,"I would say Trieste, Greece to me, and then Se...",True,False
333,"On the other hand, I’m not sure I want to atta...",True,False


In [74]:
# from test set
test_message = validation_messages[25]
test_label = validation_labels[25]
print(test_message, test_label)

Our interests are aligned in both sides of the map, so let's just coordinate closely True


In [75]:
test_matrix = vectorizer.transform([test_message]).todense()
evaluate(classifier, test_matrix, [test_label])

Prediction: [ True]
              precision    recall  f1-score   support

        True       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [76]:
# own message
test_message = "I'm going for Ven this round. Get your army ready!"
test_label = False

test_matrix = vectorizer.transform([test_message]).todense()
evaluate(classifier, test_matrix, [test_label])


Prediction: [ True]
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       1.0
        True       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



In [58]:
class Music:
    COLS = ['hi', 'hello']
    
    def trycall(self):
        print('Normal method')
        print(self.COLS)
#         self.stop()
        self.__class__.stop()
#         self.play()
        
    @classmethod
    def play(cls):
        print("*playing music*")
        cls.stop()
    
    
    def stop():
        print("stop playing")

# Music.play()
music = Music()
music.trycall()

Normal method
['hi', 'hello']
stop playing
