In [None]:
import re
import gc
import scipy
import numpy as np
import pandas as pd
from copy import deepcopy
from string import printable
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Sklearn
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Transformers
from transformers import BertTokenizer
from transformers import TFBertModel, BertModel

# Keras
from keras import Model
from tensorflow.keras.optimizers import Adam
from keras.utils.vis_utils import plot_model
from keras.layers import Input, Dropout, Dense
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Constants I'll be using in the notebook
# Seed shared by the KFold splitter, the Ridge regressor and the downsampling step.
RANDOM_STATE = 201
# Rebinds the name imported from `wordcloud` to a set; plot_wordcloud() passes it to WordCloud.
STOPWORDS = set(STOPWORDS)
# Local directory handed to BertTokenizer / TFBertModel `from_pretrained`.
BERT_DIR = '../input/huggingface-bert/bert-base-uncased'

# Global matplotlib style for every figure drawn below.
plt.style.use('ggplot')

# Introduction
### What We Know So Far
I've read some [Discussions](https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion) and [Notebooks](https://www.kaggle.com/c/jigsaw-toxic-severity-rating/code) for this and previous competitions that helped me understand more about the problem I'm tackling. Here is what I know:
- We don't have a proper validation set to use and must come up with a creative validation technique.
- There is something off about the "*validation_data.csv*" as discussed [here](https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/287350) and [here](https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/287140) (we also notice that later on), which results in the best validation score maxing out at **82.4%**! This is because it seems ***some comments are incorrectly tagged*** (annotators are humans after all) and ***the validation set contains duplicate (less_toxic, more_toxic) pairs*** (different annotators have different opinions).
- There is no original training data for this competition and we should rely on the datasets mentioned in the competition overview and others out there.
- The objective is to rank the given comments in "*comments_to_score.csv*" based on their toxicity.
- Some of our validation data exist in the training data!

### Strategy
The choice of data is important. For that matter I've decided to use [jigsaw-multilingual-toxic-comment-classification](https://www.kaggle.com/julian3833/jigsaw-multilingual-toxic-comment-classification) and [ruddit jigsaw dataset](https://www.kaggle.com/rajkumarl/ruddit-jigsaw-dataset). My primary dataset is the first one which has labeled comments with multiple toxicity types. How we choose to combine them into an overall toxicity level is extremely important.

# Utility Scripts
## Weights
Here I have defined a dictionary that will map toxicity types to their corresponding weights. These weights are one of the most important parameters in the entire notebook:

In [None]:
# Toxicity weights - These weights are later used to combine all toxicity types into one
# Keys are label column names; each value is the coefficient that column gets in the
# weighted sum stored in the `toxicity` column. Keys missing from a dataframe are
# skipped where the sum is computed.
toxicity_weights = {
    'toxic': 1,
    'severe_toxic': 2,
    'obscene': 1,
    'threat': 1,
    'insult': 1,
    'identity_hate': 2,
    'sexual_explicit': 1
}

# Label column names in insertion order; the plotting helpers iterate over this list.
toxicity_types = list(toxicity_weights.keys())

## Text-Cleaning Methods
As newer language models and techniques come into play, text-cleaning is becoming less and less a necessity and more like an optional feature. But let's not forget that text-cleaning can still be of great importance in many models and scenarios. I have defined a number of functions that will help clean parts of our texts and have later on used a few I believed to be helpful.

In [None]:
# Shortest "<...>" span on a single line ('.' does not match newlines without re.DOTALL).
HTML_TAG_PATTERN = r"<.*?>"
# E-mail address matcher. Its character classes only list lower-case letters, so
# upper-case addresses match only if the caller passes re.IGNORECASE.
EMAIL_PATTERN = r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'
# URL matcher with four alternatives: http(s)://host.tld..., www.host.tld..., and the
# same two forms with a shorter host part. Each match runs up to the next whitespace.
URL_PATTERN = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"

def remove_html_tags(string: str, replace_with: str = '') -> str:
    """Replace every match of HTML_TAG_PATTERN in `string` with `replace_with`.

    Only the tags themselves are substituted; the text between them is kept.
    """
    cleaned = re.sub(HTML_TAG_PATTERN, replace_with, string)
    return cleaned

def remove_special_characters(string: str) -> str:
    """Return `string` with every character outside `string.printable` dropped."""
    kept = [char for char in string if char in printable]
    return ''.join(kept)

def remove_urls(string: str, replace_with: str = '') -> str:
    """Replace every match of URL_PATTERN in `string` with `replace_with`."""
    url_regex = re.compile(URL_PATTERN)
    return url_regex.sub(replace_with, string)

def remove_emails(string: str, replace_with: str = '') -> str:
    """Replace every e-mail address in `string` with `replace_with`.

    EMAIL_PATTERN only lists lower-case letters in its character classes, so the
    substitution is done case-insensitively; otherwise addresses containing
    capitals (e.g. "John.Doe@Example.com") would be left in the text.
    """
    return re.sub(EMAIL_PATTERN, replace_with, string, flags = re.IGNORECASE)

def remove_IPs(text):
    """Delete dotted-quad sequences (four groups of 1-3 digits) from `text`, e.g. 71.228.77.211."""
    dotted_quad = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    return re.sub(dotted_quad, '', text)

def remove_times(text):
    """Strip timestamp/date fragments from `text`.

    The patterns are applied one after another in the order listed below, from
    the most specific full timestamps down to a bare "(UTC)" marker; each match
    is replaced with the empty string.
    """
    timestamp_patterns = (
        r'\d{1,2}:\d{2},? \d{1,2} [a-zA-Z]+,? \d{4} \(UTC\)',    # 04:09, 11 Jul, 2003
        r'\d{1,2}:\d{2},? [a-zA-Z]+ \d{1,2},? \d{4} \(UTC\)',    # 16:47, Jul 23, 2004
        r'\d{1,2}:\d{2},? \d{4} [a-zA-Z]+ \d{1,2},? \(UTC\)',    # 22:07, 2004 Dec 30
        r'\d{1,2} [a-zA-Z]+ \d{4},? \d{1,2}:\d{2} \(UTC\)',      # 29 June 2005 22:08
        r'\d{1,2}:\d{2},? \d{1,2} [a-zA-Z]+,?',                  # 21:31, 6 April
        r'\d{1,2}:\d{2},? \d{1,2},?',                            # 17:52, 12
        r'\d{1,2}:\d{1,2}-\d{1,2}-\d{1,2}',                      # 01:05-09-09
        r'\d{1,2}:\d{2}',                                        # 17:52
        r'\d{1,2} [a-zA-Z]+,? \d{4}',                            # 4 May, 2006
        r'\(UTC\)',                                              # (UTC)
    )
    for pattern in timestamp_patterns:
        text = re.sub(pattern, '', text)
    return text

def shorten_consecutive_repetitions(text):
    """Normalise character repetitions and spacing in `text`.

    Steps, in order:
      1. any character repeated three or more times in a row is cut down to two;
      2. runs of two or more spaces collapse to one and the ends are stripped;
      3. every run of '*', '!' or '?' is surrounded by single spaces.
    """
    # 1. "soooo" -> "soo"
    collapsed = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # 2. squeeze multiple spaces, trim both ends
    squeezed = re.sub(r'[ ]{2,}',' ', collapsed).strip()

    # 3. pad punctuation runs with spaces ("what??" -> "what ?? ")
    return re.sub(r'([*!?]+)',r' \1 ', squeezed)

In [None]:
def clean_text(text):
    """Run the full text-cleaning chain on one comment and return the result.

    The helpers are applied in a fixed order; afterwards colons are padded with
    spaces and a handful of punctuation characters are turned into spaces.
    """
    cleaning_steps = (
        shorten_consecutive_repetitions,  # Cut runs of 3+ identical characters down to 2
        remove_special_characters,        # Drop non-ASCII / non-printable characters
        remove_html_tags,                 # Remove HTML tags (not their contents)
        remove_emails,                    # Remove e-mail addresses
        remove_urls,                      # Remove URLs
        remove_times,                     # Remove timestamps (see remove_times)
        remove_IPs,                       # Remove IP addresses
    )
    for step in cleaning_steps:
        text = step(text)

    # Add a space before and after every ':'
    text = text.replace(':', ' : ')

    # Turn | = newline " _ - / ~ into spaces
    return re.sub(r'[|=\n"_\-/~]', ' ', text)

## Fit, Validate and Predict
#### Validation Strategy
We don't have straightforward validation data as we normally would, so I will be using "*validation_data.csv*", which has two text columns: *less_toxic* and *more_toxic*. We predict a score for each column and compare them pair-wise to see how many pairs we ordered correctly. The fraction of correctly ordered pairs is our validation score.

I will also be using *RMSE* alongside *Accuracy*. This is because *Accuracy* can be a little misleading here due to the fact that our data is heavily biased (read more [here](https://machinelearningmastery.com/failure-of-accuracy-for-imbalanced-class-distributions/))

In [None]:
# Calculate RMSE and pair-wise Accuracy metrics
def validate(pipe, X_val, y_val, pairs_df = None):
    ''' Score an already-fitted pipeline.

    Pipe must have been fitted before being passed to this function.

    Parameters
    ----------
    pipe : object with a `predict` method.
    X_val, y_val : held-out texts and their target scores, used for the RMSE.
    pairs_df : optional frame with `less_toxic` and `more_toxic` text columns used
        for the pair-wise accuracy. Defaults to the notebook-level `val_df`.

    Returns
    -------
    (lt_pred, mt_pred, accuracy, rmse) where `lt_pred` / `mt_pred` are the
    predictions for the two pair columns, `accuracy` is the fraction of pairs
    with `lt_pred < mt_pred`, and `rmse` is the root-mean-squared error on
    (X_val, y_val).
    '''
    if pairs_df is None:
        pairs_df = val_df

    # RMSE, computed with NumPy so it does not depend on the `squared` keyword of
    # sklearn's mean_squared_error (removed in recent scikit-learn releases).
    # np.sqrt keeps the result a NumPy scalar, so callers can still use `.round()`.
    fold_pred = np.asarray(pipe.predict(X_val), dtype = float)
    residuals = np.asarray(y_val, dtype = float) - fold_pred
    rmse = np.sqrt(np.mean(residuals ** 2))

    # Accuracy: share of pairs where the "less toxic" text gets the lower score
    lt_pred = pipe.predict(pairs_df['less_toxic'])
    mt_pred = pipe.predict(pairs_df['more_toxic'])
    accuracy = (lt_pred < mt_pred).mean()

    return lt_pred, mt_pred, accuracy, rmse

#### Training and Predicting in batches
This part is a bit tricky. I use folds instead of letting the model see all the data at once. In each fold I'll select a subset of data and use it to train the model, validate it and make a prediction based on the selected subset of data. Then return the prediction results from validation and the prediction itself.

I'll later use the validation predictions to calculate optimal ensemble weights for the final prediction.

In [None]:
def fit_validate_predict(pipe, X, y, folds = 5, verbose = True):
    """Fit `pipe` once per K-fold split and collect validation / test predictions.

    For every fold the pipeline is refitted on the training part of (X, y),
    scored with `validate` on the held-out part and on the `val_df` pairs, and
    used to predict `test_df['text']`.

    X and y are indexed with integer index arrays, so they must support that
    (the notebook passes NumPy arrays).

    Returns (lt_preds, mt_preds, preds, accuracies, rmses); the first three are
    2-D arrays with one column per fold, the last two hold one value per fold.
    """
    n_pairs = val_df.shape[0]
    n_test = test_df.shape[0]

    # One slot / column per fold
    accuracies = np.zeros(folds)
    rmses = np.zeros(folds)
    lt_preds = np.zeros((n_pairs, folds))
    mt_preds = np.zeros((n_pairs, folds))
    preds = np.zeros((n_test, folds))

    # Plain (non-stratified) shuffled K-fold; shuffle defaults to False
    splitter = KFold(n_splits = folds, shuffle = True, random_state = RANDOM_STATE)

    for fold, (train_index, val_index) in enumerate(splitter.split(X, y)):

        # Train the pipeline on this fold's training part
        pipe.fit(X[train_index], y[train_index])

        # Score on the held-out part (RMSE) and on the val_df pairs (accuracy)
        lt_pred, mt_pred, accuracy, rmse = validate(pipe, X[val_index], y[val_index])
        accuracies[fold] = accuracy
        rmses[fold] = rmse
        lt_preds[:, fold] = lt_pred
        mt_preds[:, fold] = mt_pred

        # Predict the competition comments with this fold's model
        preds[:, fold] = pipe.predict(test_df['text'])

        if verbose:
            print(f"FOLD #{fold + 1}) Accuracy: {accuracy.round(4)}, RMSE: {rmse.round(4)}")

    return lt_preds, mt_preds, preds, accuracies, rmses

## Visualizations
Since I'll (probably) be using multiple datasets in this notebook and run pretty much the same analysis over them, I'll define a few methods to avoid code duplication:

In [None]:
# Plots, for each toxicity type present in the given dataframe, how many rows are flagged with 1
def plot_toxic_types_dist(df):
    """Bar chart of the number of rows equal to 1 in each toxicity column of `df`.

    Only the types from `toxicity_types` that exist as columns of `df` are shown.
    Both the labels and the counts come from `df` itself, so they always line up.
    """
    present_types = [col for col in toxicity_types if col in df.columns]

    # Counting with a boolean mask yields 0 (instead of a KeyError) when a
    # column contains no 1s at all.
    counts = [int((df[col] == 1).sum()) for col in present_types]

    plt.figure(figsize = (20, 5))
    plt.title('Toxicity Categories Count')
    plt.bar(present_types, counts, label = 'Number of occurrences')
    plt.legend()
    plt.show()

# Plots the distribution of values in the `toxicity` column of the given dataframe
def plot_toxicity_dist(df):
    """Bar chart with one green bar per distinct `toxicity` value, sized by its row count."""
    level_counts = df['toxicity'].value_counts()
    levels, occurrences = level_counts.keys(), level_counts.values

    plt.figure(figsize = (20, 5))
    plt.title('Toxicity Level Distribution')
    plt.bar(levels, occurrences, color = 'g')
    plt.show()

# Plots the wordcloud for each toxicity type of the given data frame (Stopwords are removed)
def plot_wordcloud(df):
    """Draw one word cloud per toxicity type found in `df`.

    For every name in `toxicity_types` that is a column of `df`, the `text` of
    all rows where that column is non-zero is joined and rendered. The grid has
    two columns and as many rows as needed, so types missing from `df` are
    skipped and no present type is dropped.
    """
    present_types = [col for col in toxicity_types if col in df.columns]
    if not present_types:
        return

    n_cols = 2
    n_rows = -(-len(present_types) // n_cols)   # ceiling division

    wordcloud = WordCloud(stopwords = STOPWORDS)
    # Height scales with the row count; three rows give the original (20, 10) figure.
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize = (20, 10 * n_rows / 3), squeeze = False)
    panels = axes.ravel()

    for panel, col in zip(panels, present_types):
        corpus = ' '.join(df.loc[df[col] != 0, 'text'].tolist())
        panel.set_title(col)
        panel.axis("off")
        if corpus.strip():                      # nothing to render for an empty corpus
            wordcloud.generate(corpus)
            panel.imshow(wordcloud)

    # Hide any unused trailing panel (odd number of types)
    for panel in panels[len(present_types):]:
        panel.axis("off")

    plt.tight_layout(pad = 0)
    plt.show()

# Jigsaw Rate Severity of Toxic Comments
This is our original dataset for the competition. The files are:
- *comments_to_score.csv*: The dataset that is used for the final predictions.
- *validation_data.csv*: The dataset that is used to validate the models.
- *sample_submission.csv*: A sample submission file.

In [None]:
# Competition files: annotated (less_toxic, more_toxic) pairs and the comments to rank.
val_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

# Quick summary of the test set; duplicates are counted on the `text` column only.
print(f'test_df\n- Shape: {test_df.shape}\n- Columns: {list(test_df.columns)}')
print(f'- Duplicates: {test_df.duplicated(subset = "text").sum()}\n')

# Quick summary of the validation set; a duplicate is a repeated (less_toxic, more_toxic) pair.
print(f'val_df\n- Shape: {val_df.shape}\n- Columns: {list(val_df.columns)}')
print(f'- Duplicates: {val_df.duplicated(subset = ["less_toxic", "more_toxic"]).sum()}!')

### Removing Duplicates in Validation Set
As mentioned before, the validation data contains duplicates. We could remove them using the following code:

In [None]:
# # Get the dupicate items
# vals_duplicate_df = val_df[['less_toxic', 'more_toxic']]

# # Drop the duplicate paires except the first occurrence (Remove the worker column as well)
# val_df = vals_duplicate_df.loc[~vals_duplicate_df.duplicated(keep = 'first')]

# print(f"- New shape: {val_df.shape}")

### Text-Cleaning

In [None]:
# val_df: both pair columns are overwritten in place with their cleaned text
val_df['less_toxic'] = val_df['less_toxic'].apply(clean_text)
val_df['more_toxic'] = val_df['more_toxic'].apply(clean_text)
print('- Validation set cleaned.')

# test_df: the raw `text` column is overwritten in place with the cleaned text
test_df['text'] = test_df['text'].apply(clean_text)
print('- Test set cleaned.')

# jigsaw toxic comment classification challenge
The "*jigsaw-toxic-comment-train.csv*" contains data from *train.csv* and *test.csv* of the *jigsaw-toxic-comment-classification-challenge* competition combined. (The test data and their corresponding labels have been merged)

**NOTE #1**: I will be changing the columns names to match the original dataset columns' names.

In [None]:
# Load the toxic-comment training data and rename `id` / `comment_text` so the
# column names match the competition files (`comment_id`, `text`).
jtc_df = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv').rename(
    columns = {
        'id': 'comment_id',
        'comment_text': 'text'
    }
)

# Quick summary; duplicates are counted on the `text` column only.
print(f'jtc_df\n- Shape: {jtc_df.shape}')
print(f'- Columns: {list(jtc_df.columns)}')
print(f'- Duplicates: {jtc_df.duplicated("text").sum()}')

### Combining Toxicity Types
Here I'll create a new column "*toxicity*" that is the weighted sum of all toxicity types.

In [None]:
# Combine all toxicity types into one weighted sum, using the coefficients from
# `toxicity_weights`; types that are not columns of jtc_df are skipped.
jtc_df['toxicity'] = sum([jtc_df[type] * coef for type, coef in toxicity_weights.items() if type in jtc_df])

# Optional rescaling to [0, 1] (disabled: `toxicity` therefore stays a raw weighted sum)
# jtc_df['toxicity'] = jtc_df['toxicity'] / jtc_df['toxicity'].max()

### Downsampling & Text-Cleaning
Our data is heavily unbalanced ([why is that bad?](https://machinelearningmastery.com/what-is-imbalanced-classification/)) and must be fixed. There are a few tricks we can pull off:
- The weights can be adjusted in a way to try to balance out the data (Not recommended - We have enough data for downsampling, don't sacrifice the weights to balance the data!)
- Downsampling can drop the portion of data from the problematic side (Most effective)

In [None]:
# Downsample: keep every row with toxicity > 0 and a random sample of the
# remaining rows that is 1.5x as large. The result is NOT shuffled: the sampled
# non-toxic rows come first, followed by all toxic rows.
jtc_df = pd.concat([
    jtc_df[jtc_df['toxicity'] <= 0].sample(n = int((jtc_df['toxicity'] > 0).sum() * 1.5),random_state = RANDOM_STATE),
    jtc_df[jtc_df['toxicity'] > 0]
])
print(f"- New shape: {jtc_df.shape}")

# Clean: overwrite `text` in place with the cleaned text
jtc_df['text'] = jtc_df['text'].apply(clean_text)

### Exploratory Data Analysis
Explore the dataset further using the following functions:

In [None]:
# plot_toxic_types_dist(jtc_df)
# plot_toxicity_dist(jtc_df)
# plot_wordcloud(jtc_df)

# ruddit jigsaw dataset
The second training dataset used is the [ruddit-jigsaw-dataset](https://www.kaggle.com/rajkumarl/ruddit-jigsaw-dataset), specifically the "*ruddit_with_text.csv*" file. There are a few things worth paying attention to:
- Deleted comments are marked as *[deleted]*. Do we keep them? If comment is deleted by the user then it won't have any useful information, but if it's deleted by the community, that would raise a question: why?
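- I also rescaled the toxicity scores to be between 0 and 1 (note that the *toxicity* column of the previous dataset is only between 0 and 1 if the normalization line in the "Combining Toxicity Types" cell is uncommented; otherwise it is a raw weighted sum)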

**NOTE #1**: The *offensiveness_score* is probably different from *toxicity*, but I will rename the column to match the other dataframes.

In [None]:
# Load the ruddit data, rename `txt` / `offensiveness_score` to match the other
# dataframes and select only the columns we need
rjd_df = pd.read_csv('../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv').rename(
    columns = {
        'txt': 'text',
        'offensiveness_score': 'toxicity'
    }
)[['comment_id', 'text', 'toxicity']]

# Change scale: min-max rescale the scores to [0, 1]
rjd_df['toxicity'] = (rjd_df['toxicity'] - rjd_df['toxicity'].min()) / (rjd_df['toxicity'].max() - rjd_df['toxicity'].min())

# Summary of the ruddit frame itself (this used to print jtc_df's numbers under the rjd_df label)
print(f'rjd_df\n- Shape: {rjd_df.shape}')
print(f'- Columns: {list(rjd_df.columns)}')
print(f'- Duplicates: {rjd_df.duplicated("text").sum()}')

### Removing Invalid Entries & Text-Cleaning
Looking at the below histogram, I believe the comments are removed by the authors themselves not the community (there is not much pattern for the deleted comments) and I will remove them entirely.

In [None]:
# Get duplicated texts (every repeat of a text after its first occurrence).
# NOTE(review): this mask is used as a stand-in for the '[deleted]' placeholder rows;
# it also includes any other repeated text and excludes the first '[deleted]' row.
duplicates = rjd_df['text'].duplicated(keep = 'first')

# Plot distribution of toxicity scores for those rows
plt.figure(figsize = (10, 5))
plt.hist(rjd_df.loc[duplicates, 'toxicity'])
plt.show()

# Drop the deleted comments; .copy() gives an independent frame so the column
# assignment below does not write into a slice of the original
rjd_df = rjd_df.loc[rjd_df['text'] != '[deleted]'].copy()
print(f"- New shape: {rjd_df.shape}")   # report the ruddit frame (this used to print val_df's shape)

# Text cleaning: overwrite `text` in place with the cleaned text
rjd_df['text'] = rjd_df['text'].apply(clean_text)

# Ensemble: Ridge() & TfidfVectorizer()
## Creating the Pipeline

In [None]:
# Feature extraction: a FeatureUnion holding a single TF-IDF vectorizer over
# character n-grams (3 to 5 characters, built within word boundaries). Terms that
# occur in more than half of the documents or in fewer than 3 documents are dropped.
features = FeatureUnion([
    ('vect', TfidfVectorizer(analyzer = 'char_wb', max_df = 0.5, min_df = 3, ngram_range = (3, 5))),
])

# Define pipeline: TF-IDF features followed by a Ridge regressor.
# This object is a template; each training run below works on a deepcopy of it.
pipe = Pipeline([
    ("features", features),
    ('ridge', Ridge(random_state = RANDOM_STATE))
])

## TfidfVectorizer & Ridge

In [None]:
# Toxic-comment data (jtc_df): 5-fold fit / validate / predict.
# Each returned *_preds array has one column per fold.
jtc_lt_preds, jtc_mt_preds, jtc_preds, jtc_accuracies, jtc_rmses = fit_validate_predict(
    pipe = deepcopy(pipe),            # Don't train on the original pipeline
    X = np.array(jtc_df['text']),
    y = np.array(jtc_df['toxicity']),
    folds = 5,
)

# Per-fold metrics are rounded to 4 decimals first and then averaged
print(f"\n- Avg Accuracy: {jtc_accuracies.round(4).mean()}\n- Avg RMSE: {jtc_rmses.round(4).mean()}")

In [None]:
# ruddit data (rjd_df): 5-fold fit / validate / predict.
# Each returned *_preds array has one column per fold.
rjd_lt_preds, rjd_mt_preds, rjd_preds, rjd_accuracies, rjd_rmses = fit_validate_predict(
    pipe = deepcopy(pipe),            # Don't train on the original pipeline
    X = np.array(rjd_df['text']),
    y = np.array(rjd_df['toxicity']),
    folds = 5,
)

# Per-fold metrics are rounded to 4 decimals first and then averaged
print(f"\n- Avg Accuracy: {rjd_accuracies.round(4).mean()}\n- Avg RMSE: {rjd_rmses.round(4).mean()}")

## BERT Model

In [None]:
# Sequence length for BERT: every text is padded / truncated to this many tokens
SEQ_LEN = 60
# Learning rate handed to the Adam optimizer in get_BERT_model
LEARNING_RATE = 1e-5

# Loading transformers modules from the local BERT_DIR directory
BERT_tokenizer = BertTokenizer.from_pretrained(BERT_DIR)
BERT_base = TFBertModel.from_pretrained(BERT_DIR)

#### Encoding texts for BERT

In [None]:
# Function that encodes texts into IDs and Masks for BERT to understand
def get_BERT_input(text, max_length) :
    """Tokenize every entry of `text` with the notebook-level `BERT_tokenizer`.

    Parameters
    ----------
    text : iterable of str (the notebook passes a pandas Series).
    max_length : every sequence is padded or truncated to exactly this many tokens.

    Returns
    -------
    [input_ids, attention_masks] : two NumPy arrays with one row per text.
    """
    input_ids = []
    attention_masks = []

    for sample in text:
        encoded = BERT_tokenizer.encode_plus(
            sample,
            add_special_tokens = True,
            max_length = max_length,
            padding = 'max_length',     # replaces the deprecated pad_to_max_length = True
            return_attention_mask = True,
            truncation = True           # Truncate to max_length
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return [np.array(input_ids), np.array(attention_masks)]

In [None]:
# Encode the training texts; rows are in the same order as jtc_df
jtc_bert_input = get_BERT_input(jtc_df['text'], SEQ_LEN)
print('- jtc_df Done.')

# Encode both sides of every validation pair separately
val_less_toxic_bert_input = get_BERT_input(val_df['less_toxic'], SEQ_LEN)
val_more_toxic_bert_input = get_BERT_input(val_df['more_toxic'], SEQ_LEN)
print('- val_df Done.')

# Encode the comments that have to be ranked
test_bert_input = get_BERT_input(test_df['text'], SEQ_LEN)
print('- test_df Done.')

#### Defining BERT Model Architecture

In [None]:
def get_BERT_model(model_layer, seq_len, learning_rate = 1e-5):
    """Build and compile a single-output regression model on top of `model_layer`.

    Parameters
    ----------
    model_layer : callable taking (input_ids, attention_mask=...); the notebook
        passes the loaded TFBertModel.
    seq_len : length of the token-id and attention-mask inputs.
    learning_rate : learning rate for the Adam optimizer.

    Returns
    -------
    A compiled Keras `Model` with inputs [input_ids, attention_mask], a dropout
    layer and one sigmoid unit, trained with MSE loss.

    NOTE(review): the sigmoid limits predictions to (0, 1), while jtc_df's
    `toxicity` is a raw weighted sum unless its normalization line is enabled.
    Confirm the target scale before training.
    """

    # Create input IDs and Masks
    input_ids = Input(shape = (seq_len,), dtype = 'int32', name = 'input_ids')
    input_attention_mask = Input(shape = (seq_len,), dtype = 'int32', name = 'attention_mask')

    # Build the model on element 1 of the BERT output
    # (presumably the pooled sequence representation; confirm against the transformers docs)
    output = model_layer(input_ids, attention_mask = input_attention_mask)[1]
    output = Dropout(0.2)(output)
    output = Dense(units = 1, activation = 'sigmoid')(output)

    # Add input and output layers
    model = Model(inputs = [input_ids, input_attention_mask], outputs = output)

    # Compile the model. `learning_rate` is the supported keyword; the old `lr`
    # alias is deprecated and rejected by newer Keras releases.
    model.compile(Adam(learning_rate = learning_rate), loss = 'mse', metrics = [])
    return model

In [None]:
# Build the regression model around the loaded BERT base and print its layer summary
model = get_BERT_model(BERT_base, SEQ_LEN, LEARNING_RATE)
model.summary()

#### Training BERT Model

In [None]:
EPOCHS = 10
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2

# Checkpoints: keep only the weights of the epoch with the lowest val_loss
checkpoint = ModelCheckpoint(
    'model.h5',
    monitor = 'val_loss',
    verbose = 1,
    save_best_only = True,
    save_weights_only = True
)
# Learning-rate decay. min_lr must lie BELOW the starting rate (LEARNING_RATE = 1e-5),
# otherwise the callback can never lower it; the patience must be shorter than the
# early-stopping patience, otherwise training stops before a reduction can happen.
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    verbose = 1,
    patience = 1,
    min_lr = 1e-7
)
early_stop = EarlyStopping(
    monitor = 'val_loss',
    restore_best_weights = True,
    patience = 2,
)

# Keras takes the validation split from the LAST rows of the data, before any
# shuffling. jtc_df is ordered (sampled non-toxic rows first, then all toxic rows),
# so without this permutation the validation part would contain toxic rows only.
shuffle_idx = np.random.RandomState(RANDOM_STATE).permutation(len(jtc_df))
train_inputs = [arr[shuffle_idx] for arr in jtc_bert_input]
train_targets = jtc_df['toxicity'].to_numpy()[shuffle_idx]

# Training the model
history = model.fit(
    train_inputs,
    train_targets,
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    verbose = 1,
    validation_split = VALIDATION_SPLIT,
    callbacks = [reduce_lr, checkpoint, early_stop]
)

#### Validation & Prediction for BERT Model
The *loss* and *val_loss* aren't enough metrics for us. To compare its performance against **Ridge()** I'll also validate it using *val_df*.

In [None]:
# Load best weights (the checkpoint written by ModelCheckpoint during training)
model.load_weights('model.h5')

# Make predictions for both sides of every validation pair
bert_lt_preds = model.predict(val_less_toxic_bert_input)
bert_mt_preds = model.predict(val_more_toxic_bert_input)

# Accuracy = share of pairs where the "less toxic" text got the lower score.
# The two loss figures are plain means over all epochs that were run.
print(f'- Val Accuracy: {(bert_lt_preds < bert_mt_preds).mean()}')
print(f'- Avg Loss: {np.array(history.history["loss"]).mean()}')
print(f'- Avg Val Loss: {np.array(history.history["val_loss"]).mean()}')

# Make prediction for test set
bert_preds = model.predict(test_bert_input)

## Ensemble Modeling
### Finding optimal weights

I have used *scipy.optimize* to find the optimal weights (see the [documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.brute.html)) by brute-forcing all weight combinations on a grid.

**NOTE**: When adding a new dataset, append the corresponding *lt_preds* and *mt_preds* to the end of the two lists in *params*.

In [None]:
# Inputs for the weight search: params[0] holds each model's "less toxic" scores,
# params[1] the matching "more toxic" scores, in the same model order.
# .mean(axis = 1) averages the per-fold columns (the BERT predictions are reduced
# over axis 1 the same way) so that every entry is a 1-D vector.
params = (
    [jtc_lt_preds.mean(axis = 1), rjd_lt_preds.mean(axis = 1), bert_lt_preds.mean(axis = 1)],
    [jtc_mt_preds.mean(axis = 1), rjd_mt_preds.mean(axis = 1), bert_mt_preds.mean(axis = 1)]
)

def func(x, *params):
    """Objective for the weight search: negative pair-wise accuracy of a blend.

    `x` holds one weight per model; `params[0]` / `params[1]` hold the models'
    "less toxic" / "more toxic" prediction vectors. The weighted sums of both
    sides are compared element-wise and the fraction of pairs ordered correctly
    is returned with a minus sign, so that a minimizer maximizes the accuracy.
    """
    less_toxic_preds, more_toxic_preds = params[0], params[1]

    blended_lt = 0
    blended_mt = 0
    for idx in range(len(x)):
        weight = x[idx]
        blended_lt = blended_lt + weight * less_toxic_preds[idx]
        blended_mt = blended_mt + weight * more_toxic_preds[idx]

    pair_accuracy = (blended_lt < blended_mt).mean()
    return -1 * pair_accuracy

# Exhaustive grid search over one weight per model. Each slice(0, 1, 0.1) defines the
# grid for one weight (presumably 0.0 to 0.9, end excluded; confirm in the scipy docs).
# finish = None skips any refinement step, so the result is the best grid point itself;
# full_output = True makes resbrute[0] the best weights and resbrute[1] the objective there.
resbrute = scipy.optimize.brute(
    func,
    ranges = ([slice(0, 1, 0.1) for _ in range(len(params[0]))]),
    args = params,
    full_output = True,
    finish = None
)

# The objective is the negative accuracy, hence the sign flip when reporting it
print(f'- Optimal weights: {resbrute[0]}\n- Global Minimum: {resbrute[1] * -1}')

### Calculate Final Predictions
We have the optimal weights and we have the predictions. So the final prediction can be calculated using the two.

**NOTE**: When adding a new dataset, append the corresponding *_preds* to the end of the *preds* list (in the same order as in *params*).

In [None]:
# Test-set predictions of every model, in the same order as the entries of `params`.
# .mean(axis = 1) averages the per-fold columns into one score per test comment.
preds = [
    jtc_preds.mean(axis = 1),
    rjd_preds.mean(axis = 1),
    bert_preds.mean(axis = 1),
]

# Multiply predictions by their corresponding weights (resbrute[0]), then sum them up
y_pred = np.array([preds[i] * resbrute[0][i] for i in range(len(preds))]).sum(axis = 0)

## Creating the Submission
**NOTE**: The predictions are ranked to get rid of any ties.

In [None]:
# Remove ties: 'ordinal' ranking gives every comment a distinct rank, breaking
# ties by order of appearance
y_pred = scipy.stats.rankdata(y_pred, method = 'ordinal')

# Create submission file. The frame is built first and written afterwards, so
# `submission_df` holds the DataFrame (to_csv with a path returns None).
submission_df = pd.DataFrame(data = {
    'comment_id': test_df['comment_id'],
    'score': y_pred
})
submission_df.to_csv('submission.csv', index = False)