In [30]:
from transformers import BertModel,BertTokenizer,BertConfig,get_linear_schedule_with_warmup

#### Creating BERT Neural Network

In [2]:
import torch 
import torch.nn as nn

In [42]:
class ToxicityModel(nn.Module):
    def __init__(self, bert_model):
        super(ToxicityModel,self).__init__()
        
        self.bert_model = bert_model
                
        self.l1 = nn.Linear(768,256)  ## Reducing the Vector Dimension
        self.dropout = nn.Dropout(0.2)
        
        ## ['target','severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
        self.toxicity = nn.Linear(256,6)  ## 6 classes
        
        self.bert_model.train() ## Setting up bert model on training mode by default
        
    def forward(self,**kwargs):
        
        hc,_ = self.bert_model(**kwargs,return_dict = False)
        x = hc[:,0,:]
        x = self.dropout(self.l1(x))
        x = self.toxicity(x)
        
        return torch.sigmoid(x)
    
    def training_step(self,input,label,loss_fn):
        
        out = self(**input)
        loss = loss_fn(out,label)
        
        return loss
        
        

In [12]:
bert_model = BertModel.from_pretrained("../bert_model")

In [13]:
# bert_config = BertConfig.from_pretrained("../bert_model/")
# bert_model = BertModel(config=bert_config)

In [43]:
model = ToxicityModel(bert_model=bert_model)

In [15]:
tokenizer = BertTokenizer.from_pretrained("../bert_model",do_lower = True)

In [16]:
text = tokenizer(
                    "Hello! How are you!",padding='max_length',
                    max_length = 128,return_tensors = "pt"

)

In [17]:
out = model(**text)

In [18]:
assert out.shape == torch.Size([1,6])

### DataLoading 

In [19]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [48]:
class ToxicityDataset(Dataset):
    def __init__(self,data_path,tokenizer,max_length = 128):
        ## Initializing some variables in the constructor
        self.data = pd.read_csv(data_path)
        self.tokenizer = tokenizer
        self.max_length = 128
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        
        ## Accessing the single item
        item = self.data.iloc[idx]
        
        ## The input comment text
        comment_text = item['comment_text']
        
        ## The output labels
        toxicity = item['target_label']
        severe_toxicity = item['severe_toxicity']
        obscene = item['obscene']
        identity_attack = item['identity_attack']
        insult = item['insult']
        threat = item['threat']
        
        ## tokenizing the text
        input_tensors = tokenizer(comment_text,padding="max_length",\
                                    max_length=self.max_length,truncation=True,\
                                         return_tensors = "pt")
        
        ## Reducing a dimension for each key
        input_tensors = {k:v.squeeze(0) for k,v in input_tensors.items()}
        
        ## Processing the output labels
        labels = [toxicity,severe_toxicity,obscene,identity_attack,insult,threat]
        labels = torch.tensor(labels,dtype=torch.float32)
        
        ## returning the result
        return {"input":input_tensors,"labels":labels}

In [36]:
unittest_dataset = ToxicityDataset("../data/train_split.csv",tokenizer=tokenizer,max_length=128)

In [37]:
out = unittest_dataset[0]

In [41]:
assert out['input']['input_ids'].shape == torch.Size([128]), "Incorrect Max length generated from Dataloader"

In [44]:
assert out['labels'].shape == torch.Size([6]), "Incorrect Number of labels generated from the Dataloader"

In [46]:
# model = BertModel.from_pretrained("../bert_model/",)

OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ../bert_model/.

### Model Training Loop

In [22]:
from torch.utils.data import DataLoader

In [35]:
batch_size = 16
epochs = 100
device  = "cpu"

In [49]:
### Dataset
train_dataset = ToxicityDataset("../data/train_split.csv",tokenizer=tokenizer)
train_dataloader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True)

### Validation
valid_dataset = ToxicityDataset("../data/validate.csv",tokenizer=tokenizer)
valid_dataloader = DataLoader(valid_dataset,batch_size=batch_size)


In [32]:
lr = 0.001
total_steps = (len(train_dataloader) //batch_size) * epochs

num_warmup_steps = total_steps//5

In [27]:
optimizer = torch.optim.Adam(model.parameters(),lr = 0.0001 )

In [33]:
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=num_warmup_steps,
  num_training_steps=total_steps
)

In [34]:
loss_fn = nn.BCELoss()

In [50]:
log_train_loss = []
log_val_loss = []
best_loss = -999


train_losses = 0
valid_losses = 0 
model.train()
for batch in train_dataloader:
    
    batch['input'] = {k:v.to(device) for k,v in batch['input'].items()}
    batch['labels'] = batch['labels'].to(device)
    
    loss = model.training_step(batch['input'],batch['labels'],loss_fn)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()
    
    train_losses+=loss.detach().cpu().item()
    break
    
log_train_loss.append(train_losses/len(train_dataloader))

model.eval()
for batch in valid_dataloader:
    
    batch['input'] = {k:v.to(device) for k,v in batch['input'].items()}
    batch['labels'] = batch['labels'].to(device)        
    loss = model.training_step(batch['input'],batch['labels'],loss_fn)

    valid_losses+=loss.detach().cpu().item()
    
log_val_loss.append(valid_losses/len(valid_dataloader))


if log_val_loss[-1] < best_loss:
    best_loss = log_val_loss[-1]
    torch.save(model.parameters(),"../model/best.pt")

if (i % 5 == 0):
    print(f"Train loss : {log_train_loss[-1]}  Valid loss : {log_val_loss[-1]}")
    





tensor(0.6494, grad_fn=<BinaryCrossEntropyBackward0>)


In [51]:
train_data = pd.read_csv("../data/train_split.csv")

In [52]:
train_data.head()

Unnamed: 0,id,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,target_label
0,6258663,"Like you , Peter , I wish others not on the wa...",0.0,0.0,0.0,0.0,0.0,0
1,345645,`` I wont support anyone wayerhouser sells to ...,0.1,0.0,0.1,0.3,0.6,1
2,5283043,The number one thing a con con can do is de ce...,0.0,0.0,0.0,0.0,0.0,0
3,649896,"Yes , but have they been CONVINCINGLY answered...",0.0,0.0,0.0,0.0,0.0,0
4,6302894,See also Sinclair Lewis prescient 1935 novel ...,0.0,0.0,0.0,0.0,0.0,0


In [None]:
train_data['']

### Splitting the data

In [1]:
import torch
import pandas as pd
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("../data/train.csv")

In [6]:
size = int(len(data) * 0.10)

In [14]:
total_size = range(0,len(data))

In [10]:
import random

In [16]:
x = random.sample(total_size,k = size)

In [18]:
new_data = data.iloc[x]

In [19]:
new_data.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
1338812,5751593,0.552632,I just find phony conservatives amazing in not...,0.0,0.065789,0.065789,0.539474,0.0,0.0,0.0,...,365468,approved,1,0,0,1,4,0.0,4,76
1629736,6118151,0.0,IMDb.com tells me that the Hollywood lefties l...,0.0,0.0,0.0,0.0,0.0,,,...,387336,approved,2,0,1,1,4,0.0,0,4
276045,580412,0.2,"Pepe, don't forget Lisa Murkowski's wholesale ...",0.1,0.0,0.0,0.1,0.1,,,...,150726,approved,0,0,0,0,0,0.0,0,10
59941,315362,0.0,“We change our tax as much as we change our un...,0.0,0.0,0.0,0.0,0.0,,,...,97560,approved,0,0,0,7,0,0.0,0,4
837369,5145294,0.0,Just bought a Silverado with 212 k. Chevy is ...,0.0,0.0,0.0,0.0,0.0,,,...,327656,approved,1,0,1,3,0,0.0,0,4


In [20]:
new_data.to_csv("../data/sample.csv")

### Developing the evaluate function

In [42]:
from sklearn.metrics import accuracy_score,f1_score,classification_report

In [43]:
def evaluate(predictions,labels,threshold):
    
    norm = torch.where(predictions>=threshold,1,0)
    accuracy,f1 = accuracy_score(predictions,labels),f1_score(predictions,labels,average="micro")
    
    lb_name = ['toxicity','severe_toxicity','obscene','identity_attack','insult','threat']
    report = classification_report(labels,norm, target_names=lb_name)
    
    return accuracy,f1,report

In [39]:
x = torch.rand(5,6)

In [40]:
label = torch.where(x > 0.5,1,0)
predictions = torch.where(torch.rand(5,6) > 0.5,1,0)

In [45]:
_,_,report = evaluate(predictions,label,0.5)

In [47]:
print(report)

                 precision    recall  f1-score   support

       toxicity       0.67      0.67      0.67         3
severe_toxicity       1.00      1.00      1.00         3
        obscene       0.50      0.67      0.57         3
identity_attack       0.50      0.50      0.50         2
         insult       0.33      1.00      0.50         1
         threat       0.67      0.67      0.67         3

      micro avg       0.61      0.73      0.67        15
      macro avg       0.61      0.75      0.65        15
   weighted avg       0.66      0.73      0.68        15
    samples avg       0.60      0.77      0.65        15



In [22]:
import boto3
import os

In [27]:
ACCESS_KEY = 'AKIAW3CEOBGF6VFBIC7S'
SECRET_KEY = '/sVaOyRGZbDAcp2rWtT+h89/JQ61AR65mZJ7iLu1'
bucket_name = "toxic-comments19032023"
session = boto3.Session(
    aws_access_key_id=ACCESS_KEY, 
    aws_secret_access_key=SECRET_KEY
)

In [26]:
def upload_file(session,file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except:
        print(f"Couldn't upload the file")
        return "Failed"

    return "Upload success"

In [28]:
upload_file(session,"../model",bucket_name,"best_weights")

Couldn't upload the file


'Upload success'

In [5]:
bucket = s3.Bucket('toxic-comments19032023')

In [15]:
for object in bucket.objects.all():
    print(object.get()['Body'].read())



In [17]:
object = s3.ObjectSummary(bucket_name ='toxic-comments19032023',key = 'test.csv')

In [20]:
s3.download_file(bucket_name ='toxic-comments19032023',key = 'test.csv')

AttributeError: 's3.ServiceResource' object has no attribute 'download_file'