In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Read the frame stored under the 'idx_df' key of the HDF5 file and preview it.
hdf_path = './classifier_testing_data_0105_02.h5'
data = pd.read_hdf(hdf_path, key='idx_df')
data.head()

Unnamed: 0,sentence_idx,start,end,shap_value,shap_base_value,sentiment
1,0,0,4,-0.291169,8.962646,"[1.0, 0.0]"
2,0,4,5,0.027174,8.962646,"[1.0, 0.0]"
3,0,5,6,-0.018401,8.962646,"[1.0, 0.0]"
4,0,7,12,0.17922,8.962646,"[1.0, 0.0]"
5,0,12,13,0.107237,8.962646,"[1.0, 0.0]"


In [None]:
# Sentiment value that marks a row as belonging to a non-toxic sentence;
# any other value is treated as toxic.
NON_TOXIC_LABEL = [0.0, 1.0]

# Evaluate the comparison once per row, then derive the two complementary
# 0/1 columns from it (toxic first, so the column order is unchanged).
non_toxic_flags = [1 if label == NON_TOXIC_LABEL else 0 for label in data['sentiment']]
data['toxic_sentence'] = [1 - flag for flag in non_toxic_flags]
data['non_toxic_sentence'] = non_toxic_flags

In [None]:
# Keep a handle on the full frame (sentence_idx/start/end/sentiment included)
# so the predictions can be attached to it later. This is an alias, not a copy;
# it keeps all columns because the following drop returns a new frame and
# rebinds `data` rather than mutating this object.
data_idx = data

In [None]:
# Remove the bookkeeping columns; the remaining columns are the model inputs.
non_feature_cols = ['sentence_idx', 'sentiment', 'start', 'end']
data = data.drop(non_feature_cols, axis=1)

In [None]:
# Quick look at the first row after the bookkeeping columns were dropped.
data.head(1)

Unnamed: 0,shap_value,shap_base_value,toxic_sentence,non_toxic_sentence
1,-0.291169,8.962646,1,0


In [None]:
# Fix the column order explicitly: the frame is later converted to a plain
# array, so columns are consumed by position rather than by name.
cols = [
    'shap_value',
    'shap_base_value',
    'toxic_sentence',
    'non_toxic_sentence',
]
data = data.loc[:, cols]
data.head(1)

Unnamed: 0,shap_value,shap_base_value,toxic_sentence,non_toxic_sentence
1,-0.291169,8.962646,1,0


In [None]:
# The whole feature frame is used as the inference input (no rows are split off here).
X_test=data

In [None]:
# Standardise every column; fit_transform returns a NumPy array, so X_test is
# no longer a DataFrame after this cell.
# NOTE(review): the scaler is fitted on the inference data itself. If the
# checkpoint loaded later was trained on features scaled with a scaler fitted
# on the training set, that fitted scaler should be reused here — confirm.
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)

In [None]:
class TestData(Dataset):
    """Label-free `Dataset` that exposes an indexable collection of feature rows."""

    def __init__(self, X_data):
        # Stored as-is; length and indexing are delegated to this object.
        self.X_data = X_data

    def __len__(self):
        """Number of rows available."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the row stored at position `index`."""
        return self.X_data[index]
    

# Wrap the scaled feature matrix as a float32 tensor dataset.
test_data = TestData(torch.FloatTensor(X_test))

In [None]:
# One row per batch. Shuffling is left at its default (off), so batches come
# out in the same order as the rows of X_test — the predictions are later
# written back to the frame by position.
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
class BinaryClassification(nn.Module):
    """Two-hidden-layer MLP that emits one raw logit per input row.

    No sigmoid is applied inside the network; callers turn the logit into a
    probability themselves.

    Args:
        n_features: Width of each input row. Defaults to 4.
        hidden_size: Width of both hidden layers. Defaults to 64.
        dropout_p: Dropout probability applied before the output layer.
            Defaults to 0.1.

    The defaults reproduce the original fixed architecture, and the attribute
    names are unchanged, so state_dicts saved from it still load.
    """

    def __init__(self, n_features=4, hidden_size=64, dropout_p=0.1):
        super().__init__()
        # Input width is `n_features` (4 by default).
        self.layer_1 = nn.Linear(n_features, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.layer_out = nn.Linear(hidden_size, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, inputs):
        """Map a (batch, n_features) float tensor to (batch, 1) logits."""
        # Each hidden stage is Linear -> ReLU -> BatchNorm (normalisation is
        # applied after the activation).
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda:0


In [None]:
# Rebuild the architecture, then restore the trained weights from disk.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU session also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model = BinaryClassification()
model.load_state_dict(torch.load('shap_model1.pth', map_location=device))
model.to(device)

BinaryClassification(
  (layer_1): Linear(in_features=4, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [None]:
# Run the classifier over every batch and collect hard 0/1 predictions.
batch_preds = []
model.eval()  # inference mode: dropout off, batch norm uses its running statistics
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        logits = model(X_batch)
        # Logit -> probability -> rounded label.
        labels = torch.round(torch.sigmoid(logits))
        batch_preds.append(labels.cpu().numpy())

# Each batch yields a (batch_size, 1) array. Concatenating and flattening gives
# one float per input row for any batch size (squeezing each batch separately
# only produces a flat list when batch_size == 1).
y_pred_list = np.concatenate(batch_preds).ravel().tolist() if batch_preds else []

In [None]:
# Preview only the first predictions; printing the whole list (one entry per
# input row) floods the notebook output.
y_pred_list[:20]

[1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0

In [None]:
# Attach the predictions to the full frame. A plain list is assigned by
# position, so y_pred_list must contain exactly one entry per row of data_idx,
# in row order.
data_idx['result']=y_pred_list

In [None]:
# Inspect the full frame with the predicted `result` column attached.
data_idx

Unnamed: 0,sentence_idx,start,end,shap_value,shap_base_value,sentiment,toxic_sentence,non_toxic_sentence,result,token
1,0,0,4,-0.291169,8.962646,"[1.0, 0.0]",1,0,1.0,that
2,0,4,5,0.027174,8.962646,"[1.0, 0.0]",1,0,0.0,'
3,0,5,6,-0.018401,8.962646,"[1.0, 0.0]",1,0,0.0,s
4,0,7,12,0.179220,8.962646,"[1.0, 0.0]",1,0,0.0,right
5,0,12,13,0.107237,8.962646,"[1.0, 0.0]",1,0,0.0,.
...,...,...,...,...,...,...,...,...,...,...
511919,1999,825,827,-0.090418,8.266350,"[1.0, 0.0]",1,0,0.0,so
511920,1999,828,832,-1.038702,8.266350,"[1.0, 0.0]",1,0,1.0,dumb
511921,1999,833,836,-0.081360,8.266350,"[1.0, 0.0]",1,0,0.0,and
511922,1999,837,845,-0.323090,8.266350,"[1.0, 0.0]",1,0,0.0,ignorant


In [None]:
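# NOTE(review): `data2` is not defined in any visible cell, so this line stays disabled.
# The 'token' column shown in the saved output above appears to come from an
# earlier session rather than from a cell in this notebook — confirm.
#data_idx['token'] = data2['token']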

In [None]:
# Keep only the rows predicted as positive (result == 1.0).
# .copy() makes `final` an independent frame, so adding columns to it later
# neither triggers SettingWithCopyWarning nor depends on how pandas shares
# data with data_idx.
final = data_idx[data_idx.result == 1.0].copy()

In [None]:
# For every selected row, expand its [start, end) offsets into the array of
# positions start, start + 1, ..., end - 1.
# NOTE(review): the entries are NumPy arrays; when written to CSV they are
# rendered with NumPy's string format (space-separated) — confirm that this is
# the format the consumer of the CSV expects.
list_start = final['start'].tolist()
list_end = final['end'].tolist()
list_span = [np.arange(lo, hi) for lo, hi in zip(list_start, list_end)]

In [None]:
# list_span was built from final's own start/end columns in row order, so the
# positional assignment lines each span up with its row.
final['span'] = list_span

In [None]:
# Submission 1: predicted-positive rows restricted to sentences flagged as toxic.
submission1 = final[final.toxic_sentence == 1]
# Remove the working columns, keeping whatever else the frame holds.
# errors='ignore' skips labels that are absent: no live cell creates 'token',
# so without it a fresh Restart & Run All fails here with KeyError.
cols = ['token','start','end','shap_value','shap_base_value','sentiment','toxic_sentence','non_toxic_sentence','result']
submission1 = submission1.drop(columns=cols, errors='ignore')

In [None]:
# Submission 2: every predicted-positive row, regardless of the sentence flag.
submission2 = final
# Remove the working columns, keeping whatever else the frame holds.
# errors='ignore' skips labels that are absent: no live cell creates 'token',
# so without it a fresh Restart & Run All fails here with KeyError.
cols = ['token','start','end','shap_value','shap_base_value','sentiment','toxic_sentence','non_toxic_sentence','result']
submission2 = submission2.drop(columns=cols, errors='ignore')

In [None]:
# Spot-check the first row of the first submission frame before writing it out.
submission1.head(1)

In [None]:
# Write submission 1 to disk. to_csv is called with its defaults, so the frame's
# index labels are written as the first column.
submission1.to_csv('submission1.csv')

In [None]:
# Write submission 2 to disk. to_csv is called with its defaults, so the frame's
# index labels are written as the first column.
submission2.to_csv('submission2.csv')