In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, InputLayer, Dropout, Dense, Flatten, Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

from tensorflow.keras.layers import concatenate
from sklearn.model_selection import train_test_split

In [2]:
full_df = pd.read_csv('politeness_strategies_merge.csv')

In [3]:
X = full_df['Input.full_text'].to_list()
y = full_df['affcon_rapport'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

In [4]:
max_length = 100

from transformers import AutoTokenizer, TFAutoModel, AutoConfig, TFAutoModelForPreTraining 

model_name = 'bert-base-uncased'
config = AutoConfig.from_pretrained(model_name)
#"microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

auto_model = TFAutoModelForPreTraining.from_pretrained(model_name, config=config)

All model checkpoint layers were used when initializing TFBertForPreTraining.

All the layers of TFBertForPreTraining were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForPreTraining for predictions without further training.


In [22]:
Y_train_class = to_categorical(y_train)
X_train_text = tokenizer(
    text=X_train,
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

In [23]:
Y_test_class = to_categorical(y_train)
X_test_text = tokenizer(
    text=X_test,
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

In [7]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto',
    baseline=None, restore_best_weights=False
)

Using TensorFlow backend.


In [24]:
# LSTM

input_ids_in = tf.keras.layers.Input(shape=(100,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(100,), name='masked_token', dtype='int32') 

embedding_layer = auto_model(input_ids_in)[0]
#cls_token = embedding_layer[:,0,:]
X = tf.keras.layers.BatchNormalization()(embedding_layer)
X = tf.keras.layers.LSTM(64, return_sequences=True)(X)
X = tf.keras.layers.LSTM(32, return_sequences=True)(X)
X = tf.keras.layers.LSTM(16, return_sequences=True)(X)
X = tf.keras.layers.LSTM(16)(X)
X = tf.keras.layers.Dense(24, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(24, activation='relu')(X)
X = tf.keras.layers.Dense(2, activation='sigmoid')(X)
model = tf.keras.Model(inputs=input_ids_in, outputs = X)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

#model.summary()

history = model.fit(X_train_text['input_ids'], Y_train_class, epochs=1, batch_size=16, 
                    validation_split=0.2)
#loss, accuracy, f1_score, precision, recall = model.evaluate(X_test_text['input_ids'], Y_test_class, verbose=0)
#print(precision, recall, f1_score)



In [27]:
import json

with open('betrayal_data.json') as f:
    betrayal_data = json.load(f)

In [None]:
# All seasons in a game as one
betrayal_arr = []
rapport_arr = []

def predict_rapport(vm):
    bow = vm['frequent_words']
    tokenized_words = tokenizer(
        text = ' '.join(bow),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids = False,
        return_attention_mask = False,
        verbose = True)
    
    #print(bow)
    #print(tokenized_words)
    
    y_predict = model.predict(tokenized_words['input_ids'])
    rapport_or_not = np.argmax(y_predict)
    
    return rapport_or_not

for season in betrayal_data:
    betrayal = season['betrayal']
    victim_rapport = []
    betrayer_rapport = []

    for s in season['seasons']:
        victim_messages = s['messages']['victim']
        for vm in victim_messages:
            try:
                rapport_or_not = predict_rapport(vm)
                victim_rapport.append(rapport_or_not)
            except:
                victim_rapport.append(0)
                
            rapport_obj = {}
            rapport_obj['text'] = ' '.join(vm['frequent_words'])
            rapport_obj['rapport'] = rapport_or_not
            rapport_arr.append(rapport_obj)

        betrayer_messages = s['messages']['betrayer']
        for vm in betrayer_messages:    
            try:
                rapport_or_not = predict_rapport(vm)
                betrayer_rapport.append(rapport_or_not)
            except:
                betrayer_rapport.append(0)
                
            rapport_obj = {}
            rapport_obj['text'] = ' '.join(vm['frequent_words'])
            rapport_obj['rapport'] = rapport_or_not
            rapport_arr.append(rapport_obj)

    bet_obj = {}
    bet_obj['victim_rapport'] = victim_rapport
    bet_obj['betrayer_rapport'] = betrayer_rapport
    bet_obj['betrayal'] = betrayal
    bet_obj['victim_rapport_score'] = sum(victim_rapport)
    bet_obj['betrayer_rapport_score'] = sum(betrayer_rapport)
    
    #print(bet_obj)
    betrayal_arr.append(bet_obj)

In [25]:
import pandas as pd

df = pd.DataFrame(betrayal_arr)
df.head()

NameError: name 'betrayal_arr' is not defined

In [61]:
df.to_csv('betrayal_with_rapport.csv', index=False)

In [62]:
rapport_df = pd.DataFrame(rapport_arr)
rapport_df.to_csv('betrayal_with_rapport_texts.csv', index=False)

In [20]:
# Testing
freq_words = ['?', 'going', 'for', 'ser', 'balance', 'a', 'to', 'of', 'give', '.', 'the', 'supply', 'in', 'mos', 'you', 'you', 'it', 'as', 'stp', 'rather', '...', 'to', 'to', 'retreat', 'be', 'could', 'out', '.', "i'd", ',', 'about', 'your', 'appreciated', 'lose', 'or', 'would', 'center', ',', 'germany', 'after', 'me', 'thanks', 'this', 'a', 'bud', 'i', 'there', 'and', 'perhaps']
word_string = ' '.join(freq_words)

# tokenized_words = tokenizer(
#     text=word_string,
#     add_special_tokens=True,
#     max_length=max_length,
#     pad_to_max_length=True,
#     truncation=True,
#     padding=True, 
#     return_tensors='tf',
#     return_token_type_ids = False,
#     return_attention_mask = False,
#     verbose = True)

tokenized_words = tokenizer(
    text=word_string,
    return_tensors='tf',
    padding='max_length',
    max_length=100
    )

In [21]:
tokenized_words['input_ids'].shape

TensorShape([1, 100])

In [37]:
freq_words = ['just', 'bot', ',', '.', 'take', 'unit', 'war', 'retreat', "didn't", 'to', 'turn', 'really', 'mos', 'from', '.', ',', 'nwy', 'prefer', 'using', 'stp', 'if', 'that', 'of', 'i', 'can', 'and', 'me', 'in', "i'd", 'your', 'this']
word_string = ' '.join(freq_words)

tokenized_words = tokenizer(
    text=word_string,
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
tokenized_words['input_ids'].shape

TensorShape([1, 40])

In [57]:
y_predict = model.predict(tokenized_words['input_ids'])

InvalidArgumentError:  Incompatible shapes: [1,57,768] vs. [1,100,768]
	 [[node functional_11/tf_bert_for_pre_training_1/bert/embeddings/add (defined at C:\Users\lynne\anaconda3\envs\tf-gpu\lib\site-packages\transformers\modeling_tf_bert.py:187) ]] [Op:__inference_predict_function_274044]

Errors may have originated from an input operation.
Input Source operations connected to node functional_11/tf_bert_for_pre_training_1/bert/embeddings/add:
 functional_11/tf_bert_for_pre_training_1/bert/embeddings/Identity (defined at C:\Users\lynne\anaconda3\envs\tf-gpu\lib\site-packages\transformers\modeling_tf_bert.py:183)	
 functional_11/tf_bert_for_pre_training_1/bert/embeddings/position_embeddings/embedding_lookup/Identity_1 (defined at C:\Users\lynne\anaconda3\envs\tf-gpu\lib\site-packages\transformers\modeling_tf_bert.py:185)

Function call stack:
predict_function


In [42]:
y_predict

array([[0.3813222, 0.5405293]], dtype=float32)

In [26]:
a = [1,1,0,0]
sum(a)

2