In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, InputLayer, Dropout, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

from tensorflow.keras.layers import concatenate

In [2]:
full_df = pd.read_csv('politeness_strategies_merge.csv')
politeness_df = pd.read_csv('politeness_strategies_affcon_4_categories.csv')
liwc_df = pd.read_csv('LIWC2015 Results_affcon_4_categories.csv')

In [3]:
from sklearn.model_selection import train_test_split

X_data = full_df['Input.full_text']
y_data = full_df['affcon_rapport']
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, train_size=0.8)
#X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25,train_size =0.75)

In [4]:
import re
import numpy as n
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
#nltk.download('punkt')

In [5]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
#REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.replace('\n', ' ').lower()# lowercase text
    #text = REPLACE_IP_ADDRESS.sub('', text)
    text = REPLACE_BY_SPACE_RE.sub(' ',text)# replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('',text)# delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([w for w in text.split() if not w in STOPWORDS])# delete stopwords from text
    return text

In [6]:
# Dictionary of all words from train corpus with their counts.
words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
DICT_SIZE = 10000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [7]:
POPULAR_WORDS[:10]

['to', 'I', 'you', 'the', 'and', 'a', 'in', 'that', 'of', 'with']

In [11]:
from scipy import sparse as sp_sparse

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape)

X_train shape  (9442, 10000) 
X_val shape  (3148, 10000)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test set and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\S+)')

    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_val_tfidf = tfidf_vectorizer.transform(X_val)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    return X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_

X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression


def train_classifier(X_train, y_train, C, regularisation):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.

    model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
    return model

classifier_mybag = train_classifier(X_train_mybag, y_train, C = 4, regularisation = 'l2')
classifier_tfidf = train_classifier(X_train_tfidf, y_train, C = 4, regularisation = 'l2')

y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)



In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))
    
print('Bag-of-words\n')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)
print('\nTfidf\n')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)

Bag-of-words

Accuracy:  0.5552731893265566
F1-score macro:  0.52554430205319
F1-score micro:  0.5552731893265566
F1-score weighted:  0.5478786945225212
Precision macro:  0.6073985162378301
Precision micro:  0.6073985162378301
Precision weighted:  0.6073985162378301

Tfidf

Accuracy:  0.5635324015247777
F1-score macro:  0.5136687864859273
F1-score micro:  0.5635324015247777
F1-score weighted:  0.5429537667468394
Precision macro:  0.6047829021440269
Precision micro:  0.6047829021440269
Precision weighted:  0.6047829021440269


### Read df for rapport

In [3]:
full_df = pd.read_csv('politeness_strategies_merge_rapport.csv')

### Text Neural Network

In [4]:
max_length = 100

from transformers import AutoTokenizer, TFAutoModel, AutoConfig, TFAutoModelForPreTraining 

model_name = 'bert-base-uncased'
config = AutoConfig.from_pretrained(model_name)
#"microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

auto_model = TFAutoModelForPreTraining.from_pretrained(model_name, config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




All model checkpoint weights were used when initializing TFBertForPreTraining.

All the weights of TFBertForPreTraining were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForPreTraining for predictions without further training.


In [26]:
y_class = to_categorical(full_df['affcon_rapport'])
x_text = tokenizer(
    text=full_df['Input.full_text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
#x_text = full_df['Input.full_text'].apply(lambda x: tokenizer.encode(x).ids)

In [51]:
import keras
X_subtrain_tokened=full_df['Input.full_text'].apply(lambda x:tokenizer.encode(x))
x_train = keras.preprocessing.sequence.pad_sequences(X_subtrain_tokened, maxlen=30)
y_train = full_df['affcon_rapport']

In [57]:
y_labels = np.asarray(y_train).astype('float32').reshape((-1,1))

In [38]:
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
t_auto_model = auto_model.layers[0]
inputs = {'input_ids': input_ids}
t_model = t_auto_model(inputs)[1]
b = Dropout(config.hidden_dropout_prob, name='pooled_output')(t_model)
b = Dense(len(full_df['affcon_rapport'].value_counts()), activation='softmax')(b)
text_model = Model(inputs= input_ids, outputs=b)

In [39]:
text_model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [68]:
embedding_dim=128
vocab_size=9940
max_len = 30
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 30, 128)           1272320   
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 30, 128)           98816     
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 30, 64)            41216     
_________________________________________________________________
bidirectional_26 (Bidirectio (None, 30, 32)            10368     
_________________________________________________________________
bidirectional_27 (Bidirectio (None, 32)                6272      
_________________________________________________________________
dense_13 (Dense)             (None, 24)                792       
_________________________________________________________________
dense_14 (Dense)             (None, 1)                

In [69]:
history=model.fit(x=x_train, y=y_labels,
    validation_split=0.1,
    epochs=3, batch_size=32, shuffle=True) 

Epoch 1/3


InvalidArgumentError:  indices[21,20] = 15488 is not in [0, 9940)
	 [[node sequential_6/embedding_6/embedding_lookup (defined at <ipython-input-69-6e95d5090cb0>:3) ]] [Op:__inference_train_function_145137]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_6/embedding_6/embedding_lookup:
 sequential_6/embedding_6/embedding_lookup/137069 (defined at C:\Users\lynne\Anaconda3\lib\contextlib.py:112)

Function call stack:
train_function


In [13]:
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Compile the model
text_model.compile(
    optimizer = optimizer,
loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
text_model.fit(
    x=[x_text['input_ids']], y=y_class,
    validation_split=0.1,
    epochs=3, batch_size=8, shuffle=True)

Epoch 1/3
Epoch 2/3

### Concatenate numerical & Text

In [None]:
numerical_df = full_df.drop(columns=['affcon_rapport', 'Input.full_text'])

In [None]:
inputB = Input(shape=(numerical_df.shape[1],))
c = Dense(len((full_df['affcon_rapport'].value_counts()), activation='relu')(inputB)
c = Dense(4, activation='relu')(c)
c = Dense(len((full_df['affcon_rapport'].value_counts()), activation='linear')(c)
numeric_model = Model(inputs=inputB, outputs=c)

In [None]:
combined = concatenate([text_model.output, numeric_model.output])

In [None]:
z = Dense(len(full_df['affcon_rapport'].value_counts()), activation="relu")(combined)
z = Dense(1, activation="linear")(z)

In [None]:
model = Model(inputs=[text_model.input, numeric_model.input], outputs=z)

In [None]:
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Compile the model
model.compile(
    optimizer = optimizer,
loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(
    x=[x_text['input_ids'], numerical_df], y=y_class,
    validation_split=0.1,
    epochs=3, batch_size=8, shuffle=True)