In [1]:
import pandas as pd
import re   
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import scipy
import numpy as np
import torch
from tqdm import tqdm
import torch.nn as nn
from random import shuffle
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report


In [2]:
# def vect_to_matrix(vect,start_index,n):
#     res = []
#     for k in tqdm(range(n)):
#         coo = scipy.sparse.coo_matrix(vect[start_index+k])

#         values = coo.data
#         indices = np.vstack((coo.row, coo.col))
#         i = torch.LongTensor(indices)
#         v = torch.FloatTensor(values)
#         shape = coo.shape

#         int_list = torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()

#         res.append(int_list)
        
#     return res
    

In [3]:
# Aphost lookup dict
APPO = {
"aren't" : "are not","can't" : "cannot","couldn't" : "could not","didn't" : "did not","doesn't" : "does not",
"don't" : "do not","hadn't" : "had not","hasn't" : "has not","haven't" : "have not","he'd" : "he would",
"he'll" : "he will","he's" : "he is","i'd" : "I would","i'd" : "I had","i'll" : "I will","i'm" : "I am","isn't" : "is not","it's" : "it is","it'll":"it will",
"i've" : "I have","let's" : "let us","mightn't" : "might not","mustn't" : "must not","shan't" : "shall not",
"she'd" : "she would","she'll" : "she will","she's" : "she is","shouldn't" : "should not","that's" : "that is",
"there's" : "there is","they'd" : "they would","they'll" : "they will","they're" : "they are","they've" : "they have",
"we'd" : "we would","we're" : "we are","weren't" : "were not","we've" : "we have","what'll" : "what will","what're" : "what are",
"what's" : "what is","what've" : "what have","where's" : "where is","who'd" : "who would","who'll" : "who will",
"who're" : "who are","who's" : "who is","who've" : "who have","won't" : "will not","wouldn't" : "would not",
"you'd" : "you would","you'll" : "you will","you're" : "you are","you've" : "you have","'re": " are","wasn't": "was not",
"we'll":" will","didn't": "did not","tryin'":"trying"
}

eng_stopwords = set(stopwords.words("english"))
lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()

In [4]:
def clean(comment):
    #Convert to lower case , so that Hi and hi are the same
    comment = comment.lower()
    #remove \n
    comment = re.sub("\\n","",comment)
    # remove leaky elements like ip,user
    comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    #removing usernames
    comment = re.sub("\[\[.*\]","",comment)
    
    #Split the sentences into words

    words = tokenizer.tokenize(comment)
    
    # (')aphostophe  replacement (ie)   you're --> you are  
    words =[APPO[word] if word in APPO else word for word in words]
    words =[lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if not w in eng_stopwords]
    
    clean_sent =" ".join(words)
    # remove any non alphanum, digit character
    clean_sent = re.sub("\W+"," ",clean_sent)
    clean_sent = re.sub("  "," ",clean_sent)
    return(clean_sent)

In [5]:
def train_loop(batch,batch_y,optimizer,loss,net):
    net.train()
    out = net(batch)
    L = loss(out,batch_y)
    optimizer.zero_grad()
    L.backward()
    optimizer.step()
    
    return L

In [6]:
train = pd.read_csv("Project_train.csv")

In [7]:
merge = train.iloc[:,0:2]
corpus = merge.comment_text
clean_corpus = corpus.apply(lambda x :clean(x))

X_train = clean_corpus
y_train = train.toxic

In [8]:
y_train3 = torch.tensor(y_train,dtype = torch.long)

In [9]:
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [50]:
# Running this can take significant amount of time, we saved this
# tensor below from an old run and load it in a cell below

X_train2 = model.encode([X_train.iloc[0]])
for i in tqdm(range(1,X_train.shape[0])):
    passage_embedding_new = model.encode([X_train.iloc[i]])
    X_train2 = np.append(X_train2,passage_embedding_new,0)

100%|██████████| 159570/159570 [2:20:14<00:00, 18.96it/s]  


In [53]:
torch.save(X_train2, "X_train2_bert.pt")

In [11]:
X_train2 = torch.load("X_train2_bert.pt")

In [12]:
X_train3 = torch.from_numpy(X_train2)

In [13]:
X_train4 = X_train3
y_train4 = y_train3

In [14]:
from random import shuffle

In [15]:
def train_model(net, loss, optimizer, out_path, epochs):
    losses = []
    train_reports = []
    test_reports = []
    for j in range(epochs):
        # for i in tqdm(range(0,X_train2.shape[0],16)):
        inds = list(range(0, X_train2.shape[0], 16))
        shuffle(inds)
        for i in tqdm(inds):
            batch = X_train4[i:i+16]
            batch_y = y_train4[i:i+16]
            batch = batch.to(device)
            batch_y = batch_y.to(device)
            L = train_loop(batch,batch_y,optimizer,loss,net)
            losses.append(L)

        y_all_pred = []
        for i in tqdm(range(0,X_train2.shape[0],256)):
            batch = X_train4[i:i+256]
            batch_y = y_train4[i:i+256]
            batch = batch.to(device)
            batch_y = batch_y.to(device)
            y_pred = test_loop(batch,batch_y,net)
            y_all_pred.extend(y_pred.tolist())
        if j == epochs - 1:
            print("Train report")
            print(classification_report(y_train4.tolist(), y_all_pred))
        train_reports.append(str(classification_report(y_train4.tolist(), y_all_pred)))

        y_all_pred = []
        for i in tqdm(range(0,X_test2.shape[0],256)):
            batch = X_test3[i:i+256]
            batch_y = y_test3[i:i+256]
            batch = batch.to(device)
            batch_y = batch_y.to(device)
            y_pred = test_loop(batch,batch_y,net)
            y_all_pred.extend(y_pred.tolist())
        if j == epochs - 1:
            print("Test report")
            print(classification_report(y_test3.tolist(), y_all_pred))
        test_reports.append(str(classification_report(y_test3.tolist(), y_all_pred)))
    torch.save({
        "train_reports": train_reports,
        "test_reports": test_reports,
        "losses": losses,
    }, out_path)

# Test

In [20]:
load_dict1 = torch.load('train_model.temp.pt')

net = load_dict1['model']
# vect = load_dict1['vectorizer']
# tf_idf = load_dict1['tf_idf']

test = pd.read_csv("Project_test.csv")
test_labels = pd.read_csv("Project_test_labels.csv")

test_new = test.merge(test_labels,on = 'id')

test_new = test_new[test_new.toxic != -1]
test_new.reset_index( drop = True, inplace = True)

merge1 = test_new.iloc[:,0:2]
corpus1 = merge1.comment_text
clean_corpus1 = corpus1.apply(lambda x :clean(x))
X_test = clean_corpus1
y_test = test_new.toxic

y_test3 = torch.tensor(y_test,dtype = torch.long)

X_test2 = torch.load("X_test2_bert.pt")

# X_test2 = model.encode([X_test.iloc[0]])
# for i in range(1,X_test.shape[0]):
#     passage_embedding_new = model.encode([X_test.iloc[i]])
#     X_test2 = np.append(X_test2,passage_embedding_new,0)

X_test3 = torch.from_numpy(X_test2)

def test_loop(batch,batch_y,net):
    net.eval()
    with torch.no_grad():
        out = net(batch)
        
    _,y_predicted = torch.topk(out,1,dim = 1) 
    correct = batch_y == y_predicted.squeeze(1)
   
    
    return y_predicted.squeeze(1)

# y_all_pred = []
# for i in tqdm(range(0,X_test2.shape[0],4)):
#     batch = X_test3[i:i+4]
#     batch_y = y_test3[i:i+4]
#     batch = batch.to(device)
#     batch_y = batch_y.to(device)
#     y_pred = test_loop(batch,batch_y,net)
#     y_all_pred.extend(y_pred.tolist())

# print(classification_report(y_test3.tolist(), y_all_pred))

# Experiments

In [21]:
epochs = 1

device = torch.device("cuda:0")
net = nn.Sequential(
    nn.Linear(384,128),
    nn.Dropout(0.5),
    nn.ReLU(),
    nn.Linear(128,2),
    nn.Softmax(dim=1)
)

loss = nn.CrossEntropyLoss(weight = torch.tensor([1,1],dtype= torch.float))

optimizer = torch.optim.Adam(net.parameters(),lr = 1e-3, weight_decay=1e-4)

net = net.to(device)
loss = loss.to(device)

train_model(net, loss, optimizer, "sbert_balanced_mlp_2.pt", epochs)

# device = torch.device("cuda:0")
# net = nn.Sequential(
#     nn.Linear(384,128),
#     nn.Dropout(0.5),
#     nn.ReLU(),
#     nn.Linear(128,64),
#     nn.Dropout(0.5),
#     nn.ReLU(),
#     nn.Linear(64,2),
#     nn.Softmax(dim=1)
# )

# loss = nn.CrossEntropyLoss(weight = torch.tensor([1,1],dtype= torch.float))

# optimizer = torch.optim.Adam(net.parameters(),lr = 1e-3, weight_decay=1e-4)

# net = net.to(device)
# loss = loss.to(device)

# train_model(net, loss, optimizer, "sbert_balanced_mlp_3.pt", epochs)

# device = torch.device("cuda:0")
# net = nn.Sequential(
#     nn.Linear(384,128),
#     nn.Dropout(0.5),
#     nn.ReLU(),
#     nn.Linear(128,2),
#     nn.Softmax(dim=1)
# )

# loss = nn.CrossEntropyLoss(weight = torch.tensor([1,3],dtype= torch.float))

# optimizer = torch.optim.Adam(net.parameters(),lr = 1e-3, weight_decay=1e-4)

# net = net.to(device)
# loss = loss.to(device)

# train_model(net, loss, optimizer, "sbert_weighted_mlp_2.pt", epochs)

# device = torch.device("cuda:0")
# net = nn.Sequential(
#     nn.Linear(384,128),
#     nn.Dropout(0.5),
#     nn.ReLU(),
#     nn.Linear(128,64),
#     nn.Dropout(0.5),
#     nn.ReLU(),
#     nn.Linear(64,2),
#     nn.Softmax(dim=1)
# )

# loss = nn.CrossEntropyLoss(weight = torch.tensor([1,3],dtype= torch.float))

# optimizer = torch.optim.Adam(net.parameters(),lr = 1e-3, weight_decay=1e-4)

# net = net.to(device)
# loss = loss.to(device)

# train_model(net, loss, optimizer, "sbert_weighted_mlp_3.pt", epochs)

100%|██████████| 9974/9974 [00:14<00:00, 711.53it/s]
100%|██████████| 624/624 [00:00<00:00, 2439.18it/s]


Train report
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    144277
           1       0.80      0.60      0.69     15294

    accuracy                           0.95    159571
   macro avg       0.88      0.79      0.83    159571
weighted avg       0.94      0.95      0.94    159571



100%|██████████| 250/250 [00:00<00:00, 2508.62it/s]


Test report
              precision    recall  f1-score   support

           0       0.96      0.95      0.96     57888
           1       0.58      0.64      0.61      6090

    accuracy                           0.92     63978
   macro avg       0.77      0.80      0.78     63978
weighted avg       0.93      0.92      0.92     63978



In [22]:
# for j in range(10):
#     # for i in tqdm(range(0,X_train2.shape[0],16)):
#     inds = list(range(0, X_train2.shape[0], 4))
#     shuffle(inds)
#     for i in tqdm(inds):
#         batch = X_train4[i:i+4]
#         batch_y = y_train4[i:i+4]
#         batch = batch.to(device)
#         batch_y = batch_y.to(device)
#         L = train_loop(batch,batch_y,optimizer,loss,net)
        
#     y_all_pred = []
#     for i in tqdm(range(0,X_train2.shape[0],256)):
#         batch = X_train4[i:i+256]
#         batch_y = y_train4[i:i+256]
#         batch = batch.to(device)
#         batch_y = batch_y.to(device)
#         y_pred = test_loop(batch,batch_y,net)
#         y_all_pred.extend(y_pred.tolist())
#     print("Train report")
#     print(classification_report(y_train4.tolist(), y_all_pred))
        
#     y_all_pred = []
#     for i in tqdm(range(0,X_test2.shape[0],256)):
#         batch = X_test3[i:i+256]
#         batch_y = y_test3[i:i+256]
#         batch = batch.to(device)
#         batch_y = batch_y.to(device)
#         y_pred = test_loop(batch,batch_y,net)
#         y_all_pred.extend(y_pred.tolist())
#     print("Test report")
#     print(classification_report(y_test3.tolist(), y_all_pred))