<a href="https://colab.research.google.com/github/cmartinezUCSC/Lyrics_Gender_Violence/blob/main/BETO_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.optim import Adam
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files

In [None]:
# Mount Google Drive; the dataset paths used below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Location of the training CSV on the mounted Drive
train_path = '/content/drive/MyDrive/tesis/TrainAugDown.csv'
# The file uses ';' as its field delimiter
train_df = pd.read_csv(train_path, delimiter=";")
# Preview the first rows as the cell output
train_df.head(5)

Unnamed: 0,Artista,Cancion,Lyrics,Etiqueta,Expresion
0,badbunny,MIA 708486,\n \nTodos están pendiente a ti (woo) \nPero ...,1,Dile que tú eres mía-mía\nTú sabes que eres mí...
1,antonioaguilar,Eres Zapato Pisado,\n \nAy ay ay! \nChancla de mula de rancho \n...,1,Eres zapato pisado\nPaseado por onde quiera\nY...
2,sodastereo,Coral,\n \nEl reloj \nmarcó la hora del final \nde ...,0,
3,polimawestcoast,Live Fast Die Young,"\n \nEl Ambidieztro \nAll my chain, all my ch...",1,I fuck it your bitch (Ay)\nDon't kiss my lips ...
4,monlaferte,Que S,\n \nQue Sí \nQue Sí son los mismos \nQue Sí ...,0,


In [None]:
# Location of the held-out test CSV on the mounted Drive
test_path = '/content/drive/MyDrive/tesis/TestFinal.csv'
# Same ';'-delimited layout as the training file
test_df = pd.read_csv(test_path, delimiter=";")
# Preview the first rows as the cell output
test_df.head(5)

Unnamed: 0,Artista,Cancion,Lyrics,Etiqueta,Expresion
0,anagabriel,Por T,\n \nUna noche mas de insomnio \nmi mente...,0,
1,jalvarez,Tentandome,\n \nTentándome \nElla no está hablando pe...,1,"Te robo a tu mujer si me antojo\nEn mi glock, ..."
2,alejandrosanz,Cuando Acabas Tu,"\n \nLa injusticia me mordio, \nen total ...",0,
3,camila,Me Dijiste Aquella Vez,\n \nTu capricho es que yo fuera lo que fui...,0,
4,alejandrofernandez,Nuestro Gran Secreto,\n \nSi supiera con los ojos que la miro \...,1,Si supiera cuánto me muero de ganas\nPor quita...


In [None]:
# Hub identifier of the pretrained checkpoint; reused later when building the classifier
Model_Name = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = AutoTokenizer.from_pretrained(Model_Name)

# Keep a local copy of the tokenizer files.
# NOTE(review): this is a relative path (no leading '/'), so the files land under the
# current working directory rather than '/content/model' -- confirm this is intended.
tokenizer_dir = 'content/model'
tokenizer.save_pretrained(tokenizer_dir)

Downloading (…)okenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

('content/model/tokenizer_config.json',
 'content/model/special_tokens_map.json',
 'content/model/vocab.txt',
 'content/model/added_tokens.json',
 'content/model/tokenizer.json')

In [None]:
class Dataset():
  """Map-style dataset that tokenizes lyrics for sequence classification.

  Each item is built from the 'Lyrics' and 'Etiqueta' columns of the wrapped
  DataFrame. The class only implements ``__len__`` and ``__getitem__``; note
  that its name shadows the ``Dataset`` imported from ``torch.utils.data``.

  Args:
    df: DataFrame with a text column 'Lyrics' and a label column 'Etiqueta'.
    max_len: Number of tokens every sample is padded/truncated to
      (default 400, the value previously hard-coded).
    tokenizer: Callable tokenizer to use. When ``None`` (default), the
      module-level ``tokenizer`` is looked up at item-access time, which is
      the original behaviour.
  """

  def __init__(self, df, max_len=400, tokenizer=None):
    self.df = df
    # Token budget per sample
    self.max_len = max_len
    self.tokenizer = tokenizer

  def __len__(self):
    """Return the number of rows (samples) in the wrapped DataFrame."""
    return len(self.df)

  def __getitem__(self, item):
    """Return the tokenized sample at positional index ``item``.

    Returns:
      dict with 1-D tensors 'input_ids' and 'attention_mask' (flattened
      tokenizer output) and a scalar ``torch.long`` tensor 'label'.
    """
    # Collapse every run of whitespace (including the embedded newlines) to one space
    lyric = ' '.join(self.df['Lyrics'].iloc[item].split())
    etiqueta = int(self.df['Etiqueta'].iloc[item])

    # Fall back to the notebook-global tokenizer when none was injected
    encode = self.tokenizer if self.tokenizer is not None else tokenizer

    # Calling the tokenizer directly replaces the older `encode_plus` entry point
    encoding = encode(
        lyric,
        max_length = self.max_len,
        add_special_tokens = True,
        padding = 'max_length',
        truncation = True,
        return_tensors = 'pt'
    )
    return {
        # return_tensors='pt' adds a leading batch dimension; flatten drops it
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(etiqueta, dtype = torch.long)
    }


In [None]:
#DIVISION DATASET
df_train, df_val = np.split(train_df.sample(frac=1, random_state=12), [int(.7*len(train_df))])
len(df_train), len(df_val)

(525, 225)

In [None]:
class BertClassifier(nn.Module):
  """Classifier head stacked on top of a pretrained masked-LM checkpoint.

  The masked-LM output at the first token position (one score per vocabulary
  entry) is fed through Linear -> ReLU -> Linear to produce class scores.

  Args:
    model_name: Hub identifier of the checkpoint. ``None`` (default) uses the
      notebook-level ``Model_Name``.
    hidden_dim: Width of the intermediate dense layer (default 512).
    num_classes: Number of output classes (default 2).

  NOTE(review): the features come from the masked-LM head rather than from
  the encoder's hidden state; this is kept as-is so previously saved
  ``state_dict`` files still load, but it may be worth revisiting.
  """

  def __init__(self, model_name=None, hidden_dim=512, num_classes=2):
    super(BertClassifier, self).__init__()
    self.Bert = AutoModelForMaskedLM.from_pretrained(model_name or Model_Name)
    # Size of the MLM output, read from the checkpoint config instead of hard-coding 31002
    vocab_size = self.Bert.config.vocab_size
    self.densa_inter = nn.Linear(vocab_size, hidden_dim)
    self.densa_final = nn.Linear(hidden_dim, num_classes)
    self.relu = nn.ReLU()

  def forward(self, input_id, attention_mask):
    """Return raw class logits of shape (batch, num_classes).

    No sigmoid/softmax is applied: ``nn.CrossEntropyLoss`` (used by the
    training loop) expects unnormalized logits and applies log-softmax itself.
    ``argmax`` over the logits gives the same predictions as before.
    """
    # [0] is the first model output; [:, 0, :] keeps only the first token position
    outputs = self.Bert(input_ids= input_id, attention_mask=attention_mask)[0][:,0,:]
    inter_layer = self.relu(self.densa_inter(outputs))
    return self.densa_final(inter_layer)


In [None]:
def train(model, train_data, test_data, learning_rate, epochs):
  """Fine-tune ``model`` and report train/validation metrics after each epoch.

  Args:
    model: Module called as ``model(input_ids, attention_mask)`` that returns
      per-class scores of shape (batch, n_classes).
    train_data: DataFrame used for optimisation (wrapped in ``Dataset``).
    test_data: DataFrame used for per-epoch validation (wrapped in ``Dataset``).
    learning_rate: Learning rate passed to Adam.
    epochs: Number of full passes over ``train_data``.

  Returns:
    None. The model is updated in place and one summary line is printed per epoch.
  """
  train_set, val_set = Dataset(train_data), Dataset(test_data)

  train_dataloader = DataLoader(train_set, batch_size=15, shuffle = True)
  val_dataloader = DataLoader(val_set, batch_size=15)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # Move the model before building the optimizer so both refer to the same device
  model = model.to(device)
  criterion = nn.CrossEntropyLoss().to(device)
  optimizer = Adam(model.parameters(), lr=learning_rate)

  for epoch_num in range(epochs):
    # Training mode: enables dropout
    model.train()
    acc_train = 0
    loss_train = 0

    for batch in tqdm(train_dataloader):
      input_id = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)

      outputs = model(input_id , attention_mask = attention_mask)

      batch_loss = criterion(outputs, labels)
      # The criterion returns a batch mean; weight it by the batch size so that
      # dividing by the sample count below yields a true per-sample mean
      loss_train += batch_loss.item() * labels.size(0)

      acc_train += (outputs.argmax(dim = 1) == labels).sum().item()

      optimizer.zero_grad()
      batch_loss.backward()
      optimizer.step()

    # Evaluation mode: disables dropout so validation metrics are deterministic
    model.eval()
    acc_test = 0
    loss_test = 0

    with torch.no_grad():
      for batch_test in val_dataloader:
        test_labels = batch_test['label'].to(device)
        input_id = batch_test['input_ids'].to(device)
        attention_mask = batch_test['attention_mask'].to(device)

        outputs = model(input_id, attention_mask)

        # Compare against the validation labels (previously the stale labels of
        # the last training batch were used here)
        batch_loss = criterion(outputs, test_labels)
        loss_test += batch_loss.item() * test_labels.size(0)

        acc_test += (outputs.argmax(dim = 1) == test_labels).sum().item()

    print(
      f'Epoch: { epoch_num + 1 }| Train Loss:{loss_train/ len(train_data): .3f}\
      |Train Accuracy:{acc_train/ len(train_data): .3f}\
      |Val Loss: {loss_test/len(test_data): .3f}\
      |Val Accuaracy: {acc_test/len(test_data): .3f}'
      )


In [None]:
def evaluate(model, test_data, num, min_f1=0.7):
  """Score ``model`` on ``test_data``, optionally checkpoint it, and plot the confusion matrix.

  Args:
    model: Module called as ``model(input_ids, attention_mask)`` that returns
      per-class scores of shape (batch, n_classes).
    test_data: DataFrame to evaluate on (wrapped in ``Dataset``).
    num: Identifier appended to the checkpoint file name.
    min_f1: Minimum F1 required to save the weights (default 0.7, the value
      previously hard-coded).

  Returns:
    Tuple ``(accuracy, f1)``.
  """
  test_set = Dataset(test_data)
  test_dataloader = DataLoader(test_set, batch_size=30)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)
  # Evaluation mode: without it dropout stays active and the metrics vary between runs
  model.eval()

  acc_test = 0
  true_labels = []
  predicted_labels = []

  with torch.no_grad():
    for batch_test in test_dataloader:
      test_labels = batch_test['label'].to(device)
      input_id = batch_test['input_ids'].to(device)
      attention_mask = batch_test['attention_mask'].to(device)

      outputs = model(input_id, attention_mask)

      predicted = outputs.argmax(dim = 1)
      true_labels.extend(test_labels.cpu().numpy())
      predicted_labels.extend(predicted.cpu().numpy())

      acc_test += (predicted == test_labels).sum().item()

  Accuaracy = (acc_test / len(test_data))
  print(f'Test Accuracy: {acc_test / len(test_data): .3f}\n')
  print(Accuaracy)

  # Positive class first: rows are true labels [1, 0], columns are predicted labels [1, 0].
  # This is the same layout the previous manual cell swap produced, but it is requested
  # explicitly and also stays 2x2 when only one class is present.
  class_order = [1, 0]
  cm = confusion_matrix(true_labels, predicted_labels, labels=class_order)
  precision = precision_score(true_labels,predicted_labels)
  recall = recall_score(true_labels,predicted_labels)
  fSco = f1_score(true_labels,predicted_labels)
  print(f'Precision:{precision: .3f} Recall:{recall: .3f} F1:{fSco: .3f}')

  # Save the weights to Drive in .pt format when the F1 threshold is met
  if fSco >= min_f1:
    torch.save(model.state_dict(), '/content/drive/MyDrive/tesis/Beto{}.pt'.format(str(num)))
    print("Modelo: ", num )
  else:
    print("---")

  # Confusion-matrix heatmap; tick labels follow class_order so the axes match the cells
  # (previously the cells were reordered while the ticks still read '0', '1')
  classes = [str(label) for label in class_order]
  df_cm = pd.DataFrame(cm, index = classes, columns = classes)
  plt.figure(figsize = (6,4))
  sns.heatmap(df_cm, annot = True, cmap = "Blues", fmt='.2f')
  plt.title(f'Matriz de Confusion')
  plt.xlabel('Etiqueta predicha')
  plt.ylabel('Etiqueta Real')
  plt.show()
  return Accuaracy, fSco

In [None]:
def iteracion(LR, EPOCHS, num):
  """Run one train + evaluate round on the notebook-level ``model``.

  Trains on ``df_train``/``df_val`` and evaluates on ``test_df`` (all read
  from the notebook's global scope).

  Args:
    LR: Learning rate forwarded to ``train``.
    EPOCHS: Number of epochs forwarded to ``train``.
    num: Run identifier forwarded to ``evaluate``.

  Returns:
    ``num + 1``, the identifier for the next run. The increment used to be a
    local assignment that was discarded, so the caller's counter never moved.
  """
  train(model, df_train, df_val, LR, EPOCHS)
  evaluate(model, test_df, num)
  return num + 1

In [None]:
    #Hiperparametros
num = 0
EPOCHS = 10
LR = 1.519e-05
# Training and validation
model = BertClassifier()
# Print the run identifier; the previous `i` was never defined and raised NameError on a fresh kernel
print( num,".-LR: ",LR)
train(model, df_train, df_val, LR, EPOCHS)
acc, fsco = evaluate(model, test_df, num)