In [1]:
import re
from random import shuffle

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from tqdm import tqdm


In [2]:
def vect_to_matrix(vect,start_index,n):
    """Convert ``n`` consecutive rows of ``vect`` into dense float32 tensors.

    Parameters
    ----------
    vect
        Row-indexable matrix whose rows are accepted by
        ``scipy.sparse.coo_matrix`` (the notebook passes its TF-IDF matrix).
    start_index : int
        Index of the first row to convert.
    n : int
        Number of rows to convert (rows ``start_index`` .. ``start_index + n - 1``).

    Returns
    -------
    list of torch.Tensor
        One dense float32 tensor per row, each with the shape of that row's
        COO matrix (presumably ``(1, n_features)`` for a scipy sparse row).
    """
    res = []
    for k in tqdm(range(n)):
        coo = scipy.sparse.coo_matrix(vect[start_index + k])

        # (2, nnz) coordinate matrix: first row = row ids, second row = column ids.
        indices = torch.as_tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
        values = torch.as_tensor(coo.data, dtype=torch.float32)

        # torch.sparse_coo_tensor replaces the legacy torch.sparse.FloatTensor
        # constructor; the dense result is the same.
        dense_row = torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape)).to_dense()

        res.append(dense_row)

    return res
    

In [3]:
# Apostrophe (contraction) lookup dict: contraction -> expanded form.
# Every contraction is listed exactly once (a repeated key in a dict literal
# silently keeps only the last value).
# "i'd" is ambiguous ("I would" / "I had"); "I had" is the value used here.
APPO = {
    "aren't": "are not", "can't": "cannot", "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
    "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
    "he'll": "he will", "he's": "he is", "i'd": "I had", "i'll": "I will", "i'm": "I am",
    "isn't": "is not", "it's": "it is", "it'll": "it will",
    "i've": "I have", "let's": "let us", "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
    "she'd": "she would", "she'll": "she will", "she's": "she is", "shouldn't": "should not", "that's": "that is",
    "there's": "there is", "they'd": "they would", "they'll": "they will", "they're": "they are", "they've": "they have",
    # "we'll" must keep its pronoun (it used to expand to just " will").
    "we'd": "we would", "we'll": "we will", "we're": "we are", "weren't": "were not", "we've": "we have",
    "what'll": "what will", "what're": "what are",
    "what's": "what is", "what've": "what have", "where's": "where is", "who'd": "who would", "who'll": "who will",
    "who're": "who are", "who's": "who is", "who've": "who have", "won't": "will not", "wouldn't": "would not",
    "you'd": "you would", "you'll": "you will", "you're": "you are", "you've": "you have", "'re": " are",
    "wasn't": "was not",
    "tryin'": "trying",
}

# Shared NLP helpers used by clean(): tokenizer, verb lemmatizer and the
# English stop-word set (a set gives fast membership tests).
tokenizer = TweetTokenizer()
lem = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))

In [4]:
def clean(comment):
    """Normalise one raw comment into a space-separated string of tokens.

    Steps, in order: lower-case; replace newlines with a space; delete
    IPv4-looking number groups; delete ``[[ ... ]`` spans; tokenise with the
    module-level ``tokenizer``; expand contractions through ``APPO``;
    lemmatise every token as a verb; drop English stop words; collapse every
    run of non-word characters into a single space.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text. It is not stripped, so a leading or trailing space
        can remain.
    """
    # Lower-case so that "Hi" and "hi" are the same token.
    comment = comment.lower()
    # Turn newlines into a space; deleting them outright would glue the last
    # word of one line to the first word of the next.
    comment = re.sub(r"\n", " ", comment)
    # Remove leaky elements such as IP addresses.
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # Remove user-name markup.
    # NOTE(review): the match is greedy, so it runs from the first "[[" to the
    # last "]" in the comment - confirm that is intended.
    comment = re.sub(r"\[\[.*\]", "", comment)

    # Split the sentence into tokens.
    words = tokenizer.tokenize(comment)

    # Apostrophe replacement, e.g. you're --> you are.
    words = [APPO.get(word, word) for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in eng_stopwords]

    clean_sent = " ".join(words)
    # Replace every run of non-word characters by one space. This already
    # rules out consecutive spaces, so no separate double-space pass is needed.
    clean_sent = re.sub(r"\W+", " ", clean_sent)
    return clean_sent

In [5]:
def train_loop(batch,batch_y,optimizer,loss,net):
    """Run one optimisation step on a single mini-batch.

    Switches ``net`` to training mode, computes ``loss(net(batch), batch_y)``,
    back-propagates it and applies one ``optimizer`` step.

    Returns the loss tensor computed for this batch (before the update).
    """
    net.train()

    # Forward pass and loss for this batch.
    batch_loss = loss(net(batch), batch_y)

    # Reset old gradients, back-propagate, then update the parameters.
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    return batch_loss

In [6]:
# Load the training data (path is relative to the working directory).
train = pd.read_csv(filepath_or_buffer="Project_train.csv")

In [7]:
# Keep the first two columns and pull the raw comment text out of them.
merge = train.iloc[:, :2]
corpus = merge["comment_text"]

# Features are the cleaned comments; the target is the `toxic` column.
X_train = clean_corpus = corpus.apply(clean)
y_train = train["toxic"]

In [8]:
# Build the label tensor from the Series' underlying NumPy array.
# NOTE(review): passing the Series itself makes torch.tensor treat it as a
# generic Python sequence, which is believed to depend on the index being
# 0..n-1; the array form is index-independent.
y_train3 = torch.tensor(y_train.to_numpy(), dtype=torch.long)

In [9]:
# Token counts followed by TF-IDF re-weighting; both are fitted on the
# cleaned training text.
vect = CountVectorizer()
tf_idf = TfidfTransformer()
X_train2 = tf_idf.fit_transform(vect.fit_transform(X_train))

In [10]:
# Densify every row of the training matrix: one dense tensor per row, all
# kept in a Python list.
X_train3 = vect_to_matrix(X_train2, start_index=0, n=X_train2.shape[0])

100%|██████████| 159571/159571 [01:29<00:00, 1787.72it/s]


In [12]:
# Aliases (no copy is made) used by the training loop.
X_train4, y_train4 = X_train3, y_train3

In [20]:
device = torch.device("cpu")

# The input width follows the TF-IDF feature matrix rather than a hard-coded
# vocabulary size, so a different vocabulary does not break the first layer.
n_features = X_train2.shape[1]

# The network returns raw logits. nn.CrossEntropyLoss applies log-softmax
# itself, so a trailing nn.Softmax would normalise twice.
# Apply torch.softmax(out, dim=1) to the output when probabilities are needed;
# the arg-max class is the same either way.
net = nn.Sequential(
    nn.Linear(n_features, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 2),
)

# Class 1 is weighted twice as much as class 0 in the loss.
loss = nn.CrossEntropyLoss(weight=torch.tensor([1, 2], dtype=torch.float))

optimizer = torch.optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-3)

net = net.to(device)
loss = loss.to(device)

In [21]:
# One pass over mini-batches of up to 16 rows starting at offsets 0, 16, ..., 96,
# i.e. only the first rows of the training data.
# Iterating over range(0, X_train2.shape[0], 16) instead covers every row.
for j in range(1):
    for i in tqdm(range(0, 100, 16)):
        rows = slice(i, i + 16)
        # Stack the per-row tensors into one batch and move it to the device.
        batch = torch.cat(X_train4[rows]).to(device)
        batch_y = y_train4[rows].to(device)
        L = train_loop(batch, batch_y, optimizer, loss, net)

100%|██████████| 7/7 [00:00<00:00, 21.69it/s]


In [15]:
# Bundle the network with the fitted text transformers and write them to a
# single checkpoint file.
saved_dict = {
    'model': net,
    'vectorizer': vect,
    'tf_idf': tf_idf,
}
torch.save(saved_dict, 'train_model.temp.pt')

## TEST

In [22]:
# Load a stored checkpoint dictionary.
# NOTE(review): torch.load unpickles arbitrary objects - only load trusted files.
# NOTE(review): this file name differs from the one written by the save cell
# ('train_model.temp.pt') - confirm this is the intended checkpoint.
load_dict1 = torch.load(f='train_model.03.pt')

In [23]:
# Unpack the checkpoint: network, count vectorizer and TF-IDF transformer.
net, vect, tf_idf = (load_dict1[key] for key in ('model', 'vectorizer', 'tf_idf'))

In [24]:
# Read the test comments and their labels, join them on `id`, drop the rows
# whose `toxic` label is -1 and renumber the remaining rows from 0.
test = pd.read_csv(filepath_or_buffer="Project_test.csv")
test_labels = pd.read_csv(filepath_or_buffer="Project_test_labels.csv")

test_new = test.merge(test_labels, on='id')
test_new = test_new.loc[test_new['toxic'] != -1].reset_index(drop=True)

In [25]:
# Same preparation as for the training data: first two columns -> raw
# comment text -> cleaned text; the target is the `toxic` column.
merge1 = test_new.iloc[:, :2]
corpus1 = merge1["comment_text"]
X_test = clean_corpus1 = corpus1.apply(clean)
y_test = test_new["toxic"]

In [26]:
# Build the label tensor from the Series' underlying NumPy array.
# NOTE(review): passing the Series itself makes torch.tensor treat it as a
# generic Python sequence, which is believed to depend on the index being
# 0..n-1; the array form is index-independent.
y_test3 = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [27]:
# Apply the already-fitted vectorizer and TF-IDF transformer to the test
# text (transform only, nothing is re-fitted here).
X_test2 = tf_idf.transform(vect.transform(X_test))

In [28]:
# Densify every row of the test matrix: one dense tensor per row.
X_test3 = vect_to_matrix(X_test2, start_index=0, n=X_test2.shape[0])

100%|██████████| 63978/63978 [00:35<00:00, 1800.27it/s]


In [29]:
def test_loop(batch,batch_y,net):
    net.eval()
    with torch.no_grad():
        out = net(batch)
        
    _,y_predicted = torch.topk(out,1,dim = 1) 
    correct = batch_y == y_predicted.squeeze(1)
   
    
    return y_predicted.squeeze(1)

In [30]:
# Predict in mini-batches of up to 4 rows and collect the predicted labels
# in row order.
y_all_pred = []
for i in tqdm(range(0, X_test2.shape[0], 4)):
    rows = slice(i, i + 4)
    batch, batch_y = torch.cat(X_test3[rows]), y_test3[rows]
    y_pred = test_loop(batch, batch_y, net)
    y_all_pred += y_pred.tolist()

100%|██████████| 15995/15995 [00:58<00:00, 272.54it/s]


In [31]:
# Per-class precision, recall and F1 of the predictions against the true labels.
print(classification_report(y_test3.tolist(), y_all_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     57888
           1       0.60      0.72      0.65      6090

    accuracy                           0.93     63978
   macro avg       0.78      0.83      0.81     63978
weighted avg       0.93      0.93      0.93     63978

