# Detecting toxicity with relational encoding. 

## Packages

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_addons as tfa

from random import choice
from numpy import linalg as LA
from scipy.stats import rankdata

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras import backend as K 
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Activation, Dense, Layer, BatchNormalization, Conv2D
from tensorflow.keras.layers import Flatten, Reshape, Dropout, LayerNormalization, GlobalAveragePooling1D

# Single seed shared by every random number generator the notebook relies on.
globalSeed=768

import random
from numpy.random import seed 

# NumPy's global generator drives the np.random.shuffle calls in this notebook.
seed(globalSeed)

# The stdlib generator drives random.choice inside the augmentation helpers;
# seed it as well, otherwise the augmented training data differs between runs.
random.seed(globalSeed)

tf.compat.v1.set_random_seed(globalSeed)
 

# Loading the data

In [None]:
# Training table: the 'txt' and 'offensiveness_score' columns are read later on.
TrainingDataSet = pd.read_csv('../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv')
# Competition validation table; in the visible part of this file it is only previewed.
ValidationDataSet = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
# Comments that must be scored for the submission (their 'text' column is encoded later).
comentstoRate = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Preview the first rows of the training table.
TrainingDataSet.head()

In [None]:
# Preview the first rows of the validation table.
ValidationDataSet.head()

In [None]:
# Preview the first rows of the comments that have to be scored.
comentstoRate.head()

# Relational encoding 

In [None]:
Characters = ['A','a','B', 'b','C', 'c','D', 'd','E', 'e','F', 'f','G', 'g','H', 'h', 'I','i','J','j','K','k','L','l', 'M','m','N', 'n','O', 'o',
              'P', 'p','Q','q','R', 'r', 'S','s','T', 't','U', 'u','V', 'v','W','w','X', 'x','Y', 'y','Z','z','?',' ','!',',','.', '"', "'", '-']

charactersLength = len(Characters)

In [None]:
def MakeRelationalMatrix(Text,Characters):
    '''
    Encode a text as a normalised, symmetric character-adjacency matrix.

    Every pair of consecutive characters that both belong to ``Characters``
    adds one count to cell (first, second) and one to cell (second, first).
    A repeated character therefore adds two counts to its diagonal cell.
    Pairs with a character outside the alphabet are ignored.

    Parameters
    ----------
    Text : str
        Text to encode.
    Characters : list
        Alphabet; the position of a character is its row/column index.

    Returns
    -------
    numpy.ndarray
        ``(len(Characters), len(Characters))`` float matrix scaled so that its
        largest entry is 1, or all zeros when no valid pair is found.
    '''
    size = len(Characters)
    # A dict makes both the membership test and the index lookup O(1) per character.
    charToIndex = {char: position for position, char in enumerate(Characters)}
    counts = np.zeros((size, size))

    symbols = list(Text)
    for currentChar, forwardChar in zip(symbols, symbols[1:]):
        row = charToIndex.get(currentChar)
        col = charToIndex.get(forwardChar)
        if row is None or col is None:
            continue
        counts[row, col] += 1
        counts[col, row] += 1

    # Dividing by the largest entry is the whole normalisation: any earlier
    # positive rescaling (such as by a spectral norm) cancels out in this
    # division, so no eigendecomposition is needed.
    peak = counts.max() if counts.size else 0.0
    if peak == 0.0:
        return counts

    return counts / peak

## Examples of relational encoding 

In [None]:
# Show the encoding of 25 randomly picked training comments on a 5x5 grid.
fig,axes = plt.subplots(5,5,figsize=(20,20))
flataxes = axes.ravel()

# Shuffled row positions; the first 25 select the comments that get plotted.
index = np.arange(len(TrainingDataSet))
np.random.shuffle(index)

for position, axis in enumerate(flataxes):
    sampleText = TrainingDataSet['txt'].iloc[index[position]]
    axis.imshow(MakeRelationalMatrix(sampleText, Characters))

# Data

## Encoding the dataset

In [None]:
# Encode the untouched training comments as single-channel square images.
TrainingDataBase = np.array(
    [MakeRelationalMatrix(comment, Characters) for comment in TrainingDataSet['txt']]
).reshape((-1, charactersLength, charactersLength, 1))

# Same encoding for the comments that have to be scored.
SubmissionData = np.array(
    [MakeRelationalMatrix(comment, Characters) for comment in comentstoRate['text']]
).reshape((-1, charactersLength, charactersLength, 1))

## Data augmentation

In [None]:
def RandomCapitalization(string):
    '''
    Return ``string`` with each character independently upper- or lower-cased.

    One ``random.choice`` draw is made per character, in order.
    '''
    recased = []
    for char in string:
        transform = choice((str.upper, str.lower))
        recased.append(transform(char))
    return ''.join(recased)

def RemoveRandomCharacter(string):
    '''
    Drop every occurrence of one randomly drawn symbol from ``string``.

    The symbol is drawn from the module-level ``Characters`` list and
    lower-cased before matching, so upper-case letters in ``string`` are kept.
    '''
    target = choice(Characters).lower()
    kept = [char for char in string if char not in target]
    return ''.join(kept)

def RemoveCharacters(string,remove=10):
    '''
    Delete up to ``remove`` characters at random positions of ``string``.

    Parameters
    ----------
    string : str
        Text to degrade.
    remove : int, optional
        Number of positions to delete. The default is 10. A string that is
        not longer than ``remove`` comes back empty.

    Returns
    -------
    str
        The surviving characters in their original order.
    '''
    index = np.arange(len(string))
    np.random.shuffle(index)
    # Build the set of deleted positions once, so each character is tested in
    # O(1) instead of scanning the NumPy slice for every position.
    dropped = set(index[0:remove].tolist())
    return ''.join(char for position, char in enumerate(string) if position not in dropped)

In [None]:
def _EncodeAugmented(transform):
    # Apply `transform` to each training comment, in dataset order, and encode the result.
    encoded = [MakeRelationalMatrix(transform(val), Characters) for val in TrainingDataSet['txt']]
    return np.array(encoded).reshape((-1, charactersLength, charactersLength, 1))


# Deterministic case variants.
TrainingData00 = _EncodeAugmented(lambda text: text.lower())
TrainingData01 = _EncodeAugmented(lambda text: text.upper())
TrainingData02 = _EncodeAugmented(lambda text: text.title())

# Random case changes and random deletions.
TrainingData03 = _EncodeAugmented(RandomCapitalization)
TrainingData04 = _EncodeAugmented(RemoveRandomCharacter)
TrainingData05 = _EncodeAugmented(RemoveCharacters)

# Combinations: positions are deleted first, then the second perturbation is applied.
TrainingData06 = _EncodeAugmented(lambda text: RandomCapitalization(RemoveCharacters(text)))
TrainingData07 = _EncodeAugmented(lambda text: RemoveRandomCharacter(RemoveCharacters(text)))

In [None]:
# The base encoding followed by its eight augmented variants, in a fixed order.
encodedSets = [
    TrainingDataBase,
    TrainingData00,
    TrainingData01,
    TrainingData02,
    TrainingData03,
    TrainingData04,
    TrainingData05,
    TrainingData06,
    TrainingData07,
]
TrainingData = np.vstack(encodedSets)

# Every variant keeps the score of its source comment, so the score column is
# repeated once per stacked set.
scores = np.array(TrainingDataSet['offensiveness_score'])
TrainingTarget = np.tile(scores, len(encodedSets))

# Stacked MLP mixer network

## Network Construction

In [None]:
class SpatialAttention(Layer):
    '''
    Custom spatial attention layer.

    The input is reduced along axis 3 with a max and a mean, the two maps are
    concatenated and passed through a single-filter convolution, and the
    sigmoid of the result rescales the input element-wise.

    Parameters
    ----------
    size : int
        Kernel size of the convolution that produces the attention map.
    **kwargs
        Standard Keras layer arguments (``name``, ``dtype``, ...), forwarded
        to ``Layer.__init__``.
    '''
    
    def __init__(self,size, **kwargs):
        # The base-class arguments must reach Layer.__init__, otherwise
        # from_config cannot restore the layer name/dtype on reload.
        super(SpatialAttention, self).__init__(**kwargs)
        self.size = size
        self.kwargs = kwargs
        
    def get_config(self):
        # Include the constructor argument so a saved model can be rebuilt.
        cfg = super().get_config()
        cfg.update({'size': self.size})
        return cfg    

    def build(self, input_shapes):
        # One filter: the two pooled maps are collapsed into one attention map.
        self.conv = Conv2D(filters=1, kernel_size=self.size, strides=1, padding='same')
        
    def call(self, inputs):
        # Max and mean over axis 3, stacked along that same axis.
        pooled_channels = tf.concat(
            [tf.math.reduce_max(inputs, axis=3, keepdims=True),
            tf.math.reduce_mean(inputs, axis=3, keepdims=True)],
            axis=3)

        scale = self.conv(pooled_channels)
        scale = tf.math.sigmoid(scale)

        # Values in (0, 1) that weight every position of the input.
        return inputs * scale

class Patches(Layer):
    '''
    Split an image batch into non-overlapping square patches.

    Taken from
    https://keras.io/examples/vision/mlp_image_classification/

    Parameters
    ----------
    patch_size : int
        Side of each square patch; also used as the stride.
    num_patches : int
        Number of patches per image, used to reshape the output.
    **kwargs
        Standard Keras layer arguments (``name``, ``dtype``, ...), forwarded
        to ``Layer.__init__``.
    '''
    def __init__(self, patch_size, num_patches, **kwargs):
        # Accept and forward the base-class arguments so from_config works.
        super(Patches, self).__init__(**kwargs)
        self.patch_size = patch_size
        self.num_patches = num_patches
    
    def get_config(self):
        # Include the constructor arguments so a saved model can be rebuilt.
        cfg = super().get_config()
        cfg.update({
            'patch_size': self.patch_size,
            'num_patches': self.num_patches,
        })
        return cfg    
        
    @tf.autograph.experimental.do_not_convert
    def call(self, images,**kwargs):
        
        batch_size = tf.shape(images)[0]
        # Stride equal to the patch size with VALID padding: patches do not
        # overlap and incomplete border patches are dropped.
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        patch_dims = patches.shape[-1]
        # Flatten the patch grid into a sequence: [batch, num_patches, patch_dims].
        patches = tf.reshape(patches, [batch_size, self.num_patches, patch_dims])
        return patches


class MLPMixerLayer(Layer):
    '''
    MLP-Mixer block: a patch-mixing MLP followed by a channel-mixing MLP, each
    with a skip connection.

    Taken from
    https://keras.io/examples/vision/mlp_image_classification/

    Parameters
    ----------
    num_patches : int
        Number of patches per sample; width of the patch-mixing MLP.
    hidden_units : int
        Output width of the channel-mixing MLP.
    dropout_rate : float
        Kept for interface compatibility and serialisation; no Dropout layer
        is currently part of the block.
    *args, **kwargs
        Forwarded to ``Layer.__init__``.
    '''
    def __init__(self, num_patches, hidden_units, dropout_rate, *args, **kwargs):
        super(MLPMixerLayer, self).__init__(*args, **kwargs)

        # Stored so get_config can report them and the layer can be rebuilt.
        self.num_patches = num_patches
        self.hidden_units = hidden_units
        self.dropout_rate = dropout_rate

        # Patch-mixing MLP (applied on the transposed input).
        self.mlp1 = keras.Sequential(
            [
                Dense(units=num_patches,use_bias=False),
                BatchNormalization(),
                tfa.layers.GELU(),
                Dense(units=num_patches,use_bias=False),
                BatchNormalization()
            ]
        )
        # Channel-mixing MLP (applied on each patch).
        self.mlp2 = keras.Sequential(
            [
                Dense(units=num_patches,use_bias=False),
                BatchNormalization(),
                tfa.layers.GELU(),
                Dense(units=hidden_units,use_bias=False),
                BatchNormalization(),
            ]
        )
        # The same normalisation layer is applied before both MLPs.
        self.normalize = LayerNormalization(epsilon=1e-6)
        
    def get_config(self):
        # Include the constructor arguments so a saved model can be rebuilt.
        cfg = super().get_config()
        cfg.update({
            'num_patches': self.num_patches,
            'hidden_units': self.hidden_units,
            'dropout_rate': self.dropout_rate,
        })
        return cfg    
    
    @tf.autograph.experimental.do_not_convert
    def call(self, inputs,**kwargs):
        # Apply layer normalization.
        x = self.normalize(inputs)
        # Transpose inputs from [num_batches, num_patches, hidden_units] to [num_batches, hidden_units, num_patches].
        x_channels = tf.linalg.matrix_transpose(x)
        # Apply mlp1 on each channel independently.
        mlp1_outputs = self.mlp1(x_channels)
        # Transpose mlp1_outputs from [num_batches, hidden_dim, num_patches] to [num_batches, num_patches, hidden_units].
        mlp1_outputs = tf.linalg.matrix_transpose(mlp1_outputs)
        # Add skip connection.
        x = mlp1_outputs + inputs
        # Apply layer normalization.
        x_patches = self.normalize(x)
        # Apply mlp2 on each patch independently.
        mlp2_outputs = self.mlp2(x_patches)
        # Add skip connection.
        x = x + mlp2_outputs
        return x


def MakeMixerBlock(inputs,blocks,patch_size,num_patches,embedding_dim,dropout_rate):
    '''
    Build one mixer stage: patch extraction, linear embedding, mixer blocks,
    average pooling over patches, batch normalisation and a reshape of the
    pooled vector into a square single-channel map.

    Parameters
    ----------
    inputs : keras tensor
        Input of the mixer stage.
    blocks : keras sequential model
        Mixer blocks applied to the embedded patches.
    patch_size : int
        Side of the square image patch.
    num_patches : int
        Number of patches per image.
    embedding_dim : int
        Size of the embedding dimension. Its square root is truncated to an
        integer to obtain the side of the output map.
    dropout_rate : float
        Not used by the current implementation.

    Returns
    -------
    representation : keras tensor
        Output of shape (side, side, 1) per sample, with
        side = int(sqrt(embedding_dim)).
    '''
    # Side of the square map that the pooled embedding is folded into.
    side = np.sqrt(embedding_dim).astype(int)

    tokens = Patches(patch_size, num_patches)(inputs)
    embedded = Dense(units=embedding_dim)(tokens)
    mixed = blocks(embedded)
    pooled = GlobalAveragePooling1D()(mixed)
    normalised = BatchNormalization()(pooled)

    return Reshape((side, side, 1))(normalised)

def MakeMixerCoder(InputShape,Units,NumBlocks,DropoutRate=0.2,PatchSize=4,UpSampling=False):
    '''
    Stack one mixer stage per entry of ``Units``, each preceded by a spatial
    attention layer.

    Parameters
    ----------
    InputShape : tuple
        Input shape of the network.
    Units : array-like
        Embedding dimension of each mixer stage.
    NumBlocks : int
        Number of mixer blocks per stage.
    DropoutRate : float, optional
        Dropout rate handed to the mixer blocks. The default is 0.2.
    PatchSize : int, optional
        Side of the patches each stage splits its input into.
        The default is 4.
    UpSampling : bool, optional
        When True the stages are built in reverse ``Units`` order and a
        sigmoid is applied to the output. The default is False.

    Returns
    -------
    InputFunction : keras input tensor
        Input of the network.
    localCoder : keras functional model
        Coder model mapping the input to the last stage output.
    '''
    
    EmbeddingDimentions = Units[::-1] if UpSampling else Units
        
    # The patch count of the first stage is derived from the first embedding
    # size, so the input side is expected to equal sqrt(EmbeddingDimentions[0]).
    currentSize = np.sqrt(EmbeddingDimentions[0]).astype(int)
    num_patches = (currentSize//PatchSize)**2
    InputFunction = Input(shape = InputShape)
    X = SpatialAttention(3)(InputFunction)
    MBlocks = keras.Sequential(
        [MLPMixerLayer(num_patches, EmbeddingDimentions[0], DropoutRate) for _ in range(NumBlocks)]
        )
    
    # Feed the attention-weighted tensor into the first stage. Passing the raw
    # input here would leave the attention layer disconnected from the model.
    X = MakeMixerBlock(X,MBlocks,PatchSize,num_patches,EmbeddingDimentions[0],DropoutRate)

    for k in range(1,len(EmbeddingDimentions)):
        
        # Each stage receives the square map produced by the previous one.
        currentSize = np.sqrt(EmbeddingDimentions[k-1]).astype(int)
        num_patches = (currentSize//PatchSize)**2
        
        X = SpatialAttention(3)(X)
        X = BatchNormalization()(X)
        
        MBlocks =  keras.Sequential(
            [MLPMixerLayer(num_patches, EmbeddingDimentions[k], DropoutRate) for _ in range(NumBlocks)]
            )
        X = MakeMixerBlock(X,MBlocks,PatchSize,num_patches,EmbeddingDimentions[k],DropoutRate)

    if UpSampling:
        Output = Activation('sigmoid')(X)
        localCoder = Model(inputs=InputFunction,outputs=Output)
        
    else:
        localCoder = Model(inputs=InputFunction,outputs=X)
    
    return InputFunction,localCoder


def MakeRegressor(InputShape,Units,NumBlocks,**kwargs):
    '''
    Build the mixer coder and put a one-unit regression head on top of it.

    Parameters
    ----------
    InputShape : tuple
        Input shape of the network.
    Units : array-like
        Embedding dimensions handed to MakeMixerCoder.
    NumBlocks : int
        Number of mixer blocks per stage.
    **kwargs
        Extra keyword arguments forwarded to MakeMixerCoder.

    Returns
    -------
    InputNetwork : keras input tensor
        Input of the full model.
    Regressor : keras functional model
        Coder followed by Flatten, Dense(1), BatchNormalization and tanh.
    '''
    InputNetwork, Network = MakeMixerCoder(InputShape, Units, NumBlocks, **kwargs)

    # Shape of the coder output without the batch axis.
    featureShape = Network.layers[-1].output_shape[1:]

    InputReg = Input(shape=featureShape)
    head = Flatten()(InputReg)
    head = Dense(1, use_bias=False)(head)
    head = BatchNormalization()(head)
    head = Activation('tanh')(head)
    Reg = Model(inputs=InputReg, outputs=head)

    # Chain coder and head into a single end-to-end model.
    Regressor = Model(inputs=InputNetwork, outputs=Reg(Network(InputNetwork)))

    return InputNetwork, Regressor

## Network Hyperparameters

In [None]:
# Embedding size of each mixer stage: the squared side of a map whose side is
# the alphabet length divided by 1, 2, 4 and 8.
arch = [(charactersLength // divisor) ** 2 for divisor in (1, 2, 4, 8)]

# Optimiser settings; `decay` is handed to Adam in the training loop.
lr = 1e-4
minlr = 1e-6
epochs = 22
batch_size = 64
decay = 2*(lr-minlr)/epochs

# Build one model only to print its layer summary.
_, Regressor = MakeRegressor((charactersLength, charactersLength, 1), arch, 1)

Regressor.summary()

# Training

## Data

In [None]:
# Hold out 10% of the stacked rows for evaluation.
# NOTE(review): TrainingData stacks nine encodings of every comment and this
# split is row-wise, so variants of one comment can end up on both sides of
# the split; confirm whether the hold-out scores are meant to be independent.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(TrainingData,TrainingTarget,test_size=0.1, random_state=42)

## Training and cross validation

In [None]:
# One trained model per fold; the models are averaged for the submission.
RegressorModels = []

folds = 5
kf = KFold(n_splits=folds,shuffle=True,random_state=65)

for k, (train_index, test_index) in enumerate(kf.split(Xtrain)):
    
    localXtrain, localXtest = Xtrain[train_index], Xtrain[test_index]
    localYtrain, localYtest = Ytrain[train_index], Ytrain[test_index]
        
    # A fresh model per fold, so no weights are shared between folds.
    _,Regressor = MakeRegressor((charactersLength,charactersLength,1),arch,1)
    Regressor.compile(Adam(learning_rate=lr,decay=decay),loss='mse')
    # Train on this fold's training rows and validate on its held-out rows.
    # Fitting on the full Xtrain would give every fold the same data. Xtest
    # stays untouched for the diagnostics that follow.
    history =  Regressor.fit(localXtrain,localYtrain,epochs=epochs,batch_size=batch_size,
                             validation_data=(localXtest,localYtest))
    
    Regressor.save('./model'+str(k)+'.h5')
    RegressorModels.append(Regressor)

In [None]:
# Predicted versus true score on the hold-out rows, one panel per fold model.
fig,axs = plt.subplots(1,folds,figsize=(35,5))

for panel, model in enumerate(RegressorModels):
    predictions = model.predict(Xtest).reshape((-1,))
    axs[panel].plot(predictions, Ytest, 'bo')

In [None]:
# Distribution of predicted scores (solid) over the true hold-out scores
# (semi-transparent), one panel per fold model.
fig,axs = plt.subplots(1,folds,figsize=(35,5))

for panel, model in enumerate(RegressorModels):
    predictions = model.predict(Xtest).reshape((-1,))
    axs[panel].hist(predictions, bins=50, color='blue')
    axs[panel].hist(Ytest, bins=50, color='blue', alpha=0.5)

# Submission

In [None]:
# Score the submission comments with every fold model and keep each result.
fig,axs = plt.subplots(1,folds,figsize=(35,5))
container = []

for panel, model in enumerate(RegressorModels):
    preds = model.predict(SubmissionData).reshape((-1,))
    container.append(preds)
    # Histogram of this model's scores, for a quick visual comparison.
    axs[panel].hist(preds)


In [None]:
# Ensemble score: average of the fold models' predictions for each comment.
finalPreds = np.mean(np.array(container), axis=0)
plt.hist(finalPreds)

In [None]:
# Start from the sample submission and overwrite its 'score' column.
# NOTE(review): assumes its rows are in the same order as comments_to_score.csv — confirm.
submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
#submission['score'] = finalPreds
# Submit ordinal ranks of the averaged predictions instead of the raw values.
submission['score'] = rankdata(finalPreds, method='ordinal')
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index=False)