# Preliminaries

In [1]:
import pandas as pd
from pathlib import Path

import numpy as np 
import pandas as pd
import scipy.special
from scipy.stats import mode
from sklearn.metrics import classification_report
from copy import deepcopy

In [2]:
out_of_sample_predictions = {
    'in': pd.read_csv('model_outputs/out_of_sample/in_domain.csv', header=[0, 1, 2]),
    'cross': pd.read_csv('model_outputs/out_of_sample/cross_domain.csv', header=[0, 1, 2])
}

trained_predictions = {
    'in': pd.read_csv('model_outputs/trained/in_domain.csv', header=[0, 1, 2]),
    'cross': pd.read_csv('model_outputs/trained/cross_domain.csv', header=[0, 1, 2])
}

df_train_olid_small = pd.read_csv('data/processed/olid-train-small.csv')
df_train_hasoc = pd.read_csv('data/processed/hasoc-train.csv')
df_test_olid = pd.read_csv('data/processed/olid-test.csv')

df_test_olid = df_test_olid.sort_values('id').set_index('id')
df_train_hasoc = df_train_hasoc.sort_values('id').set_index('id')
df_train_olid_small = df_train_olid_small.sort_values('id').set_index('id')

In [3]:
# Since these are saved pivot tables they have to be processed

for item in [out_of_sample_predictions, trained_predictions]:
    for setting in ['in', 'cross']:
        df = item[setting]
        new_col = []
        new_col.append('id')
        for col in df.columns[1:]:
            value_name, model, _ = col
            new_col.append(model + '.' + value_name)
        
        df.columns = new_col
        item[setting] = df

# Ensemble methods

In [4]:
results = {
    'in': {
        'hard': {'prediction': None, 'probability': None},
        'soft': {'prediction': None, 'probability': None},
        'meta': {'prediction': None, 'probability': None}
    },
    'cross': {
        'hard': {'prediction': None, 'probability': None},
        'soft': {'prediction': None, 'probability': None},
        'meta': {'prediction': None, 'probability': None}
    },
}

## Hard Majority

In [5]:
for setting in ['in', 'cross']:
    df = trained_predictions[setting]
    # take the mode of the model predictions
    hard_majority_pred, _ = mode(df[[col for col in df if '.predicted' in col]].values, axis=1, keepdims=False)
    results[setting]['hard']['prediction'] = hard_majority_pred

## Soft Majority

In [6]:
for setting in ['in', 'cross']:
    df = trained_predictions[setting]
    
    probas = np.concatenate((
        [1 - df[[col for col in df if '.probabilities' in col]].values], # P(no hate) (= 1 - P(hate))
        [df[[col for col in df if '.probabilities' in col]].values]      # P(hate)
    ))
    # convert shape from (classes, instances, models) to (instances, classes, models)
    probas = np.moveaxis(probas, 0, 1)

    soft_majority_prob = probas.mean(axis=2) # take the mean probability across the models
    soft_majority_pred = soft_majority_prob.argmax(axis=1) # predict class corresponding to max mean probability
    
    results[setting]['soft']['prediction'] = soft_majority_pred
    results[setting]['soft']['probability'] = soft_majority_prob

## Meta Model

### Training Data

In [7]:
train_data = {
    'in': {'X': None, 'y': None},
    'cross': {'X': None, 'y': None}
}

for setting in ['in', 'cross']:
    # Get X
    df = deepcopy(out_of_sample_predictions[setting]) # NOTE: Here we are using out of sample predictions
    df = df[['id', *[col for col in df.columns if '.probabilities' in col]]]
    df = df.sort_values('id').set_index('id')
    X_meta_train_predictions = df
    
    if setting == 'in':
        df = deepcopy(df_train_olid_small[df_train_olid_small.columns[2:]])
    elif setting == 'cross':
        df = deepcopy(df_train_hasoc[df_train_hasoc.columns[2:]])
        
    X_meta_train_features = df
    X_meta_train = X_meta_train_predictions.join(X_meta_train_features)
    train_data[setting]['X'] = X_meta_train
    
    # Get y
    if setting == 'in':
        df = deepcopy(df_train_olid_small[['labels']])
    elif setting == 'cross':
        df = deepcopy(df_train_hasoc[['labels']])

    train_data[setting]['y'] = df

### Testing data

In [8]:
test_data = {
    'in': {'X': None, 'y': None},
    'cross': {'X': None, 'y': None}
}

for setting in ['in', 'cross']:
    # Get X
    df = deepcopy(trained_predictions[setting]) # NOTE: Here we are using predictions from models on all data
    df = df[['id', *[col for col in df.columns if '.probabilities' in col]]]
    df = df.sort_values('id').set_index('id')
    X_meta_train_predictions = df
    
    df = deepcopy(df_test_olid[df_test_olid.columns[2:]])
    X_meta_train_features = df

    X_meta_train = X_meta_train_predictions.join(X_meta_train_features)
    test_data[setting]['X'] = X_meta_train
    
    # Get y
    df = deepcopy(df_test_olid[['labels']])
    test_data[setting]['y'] = df

## Training and Testing

In [10]:
import sklearn.ensemble

for setting in ['in', 'cross']:
    model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=1)
    model.fit(train_data[setting]['X'], train_data[setting]['y'])
    results[setting]['meta']['prediction'] = model.predict(test_data[setting]['X'])
    results[setting]['meta']['probability'] = model.predict_proba(test_data[setting]['X'])

  model.fit(train_data[setting]['X'], train_data[setting]['y'])
  model.fit(train_data[setting]['X'], train_data[setting]['y'])


In [11]:
for setting in ['in', 'cross']:
    idxs = []
    labels = []
    models = []
    probabilities = []
    pred = []

    for ensemble in ['soft', 'hard', 'meta']:
        idxs += list(df_test_olid.index.values.ravel())
        labels += list(df_test_olid.labels.values.ravel())
        models += [ensemble] * len(df_test_olid.index.values.ravel())
        if ensemble != 'hard':
            probabilities += list(results[setting][ensemble]['probability'][:,1])
        else:
            probabilities += [None] * len(df_test_olid.index.values.ravel())
        pred += list(results[setting][ensemble]['prediction'])
    
    df = pd.DataFrame({
        'id': idxs,
        'label': labels,
        'model': models,
        'probabilities': probabilities,
        'predicted': pred
    })
    df = pd.pivot_table(df, values= ['label', 'predicted', 'probabilities'], index=['id'], columns=['model'])
    df.to_csv('model_outputs/ensemble/' + setting + '_domain.csv')