In [1]:
import sys

from google.colab import drive

# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Put the Drive root at the front of the module search path.
sys.path.insert(0, '/content/drive/MyDrive/')

Mounted at /content/drive


In [2]:
# Work from the project folder on Drive so the relative paths used later (e.g. './data/...') resolve.
%cd '/content/drive/MyDrive/NLPClassification_48'

/content/drive/MyDrive/NLPClassification_48


In [3]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 KB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting streamlit
  Downloading streamlit-1.19.0-py2.py3-none-any.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencep

In [15]:
from sklearn.metrics import classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
import torch
import transformers
import torch.nn as nn
from dont_patronize_me import DontPatronizeMe

# Device string: the first CUDA GPU when one is available, otherwise the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
if DEVICE == 'cpu':
  print('WARNING: You may want to change the runtime to GPU for faster training!')

# helper function to save predictions to an output file
def labels2file(p, outf_path):
    """Write each entry of `p` as one comma-separated line of `outf_path`.

    Args:
        p: iterable of iterables; every inner item is converted with `str`
            and the items of one entry are joined with commas.
        outf_path: path of the file to write. It is opened in write mode,
            so any existing content is replaced.
    """
    with open(outf_path, 'w') as sink:
        sink.writelines(','.join(map(str, row)) + '\n' for row in p)

# prepare logger: only WARNING and above from the "transformers" logger,
# while the root logger is configured at INFO
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

logging.basicConfig(level=logging.INFO)


# If this raises, check the paths inside dont_patronize_me.py to make sure it is
# reading the right file.
dpm = DontPatronizeMe('', '')
dpm.load_task1()

# Paragraph ids of the train split; par_id is cast to str because it is compared
# against data.par_id further down (presumably stored as strings there - verify
# against dont_patronize_me.py).
trids = pd.read_csv('./data/train_semeval_parids-labels.csv')
trids['par_id'] = trids['par_id'].astype(str)

# Paragraph ids of the dev split, handled the same way.
teids = pd.read_csv('./data/dev_semeval_parids-labels.csv')
teids['par_id'] = teids['par_id'].astype(str)

data=dpm.train_task1_df


def _select_paragraphs(par_ids, source):
  """Build a frame with one row per id in `par_ids`, in the same order.

  For every id the first row of `source` with that `par_id` is used, and its
  `keyword`, `text` and `label` values are copied (the keyword is stored
  under the column name `community`).

  Args:
      par_ids: iterable of paragraph ids to look up.
      source: DataFrame with `par_id`, `keyword`, `text` and `label` columns.

  Returns:
      DataFrame with columns `par_id`, `community`, `text`, `label` and a
      fresh 0..n-1 index.

  Raises:
      KeyError: if an id in `par_ids` does not occur in `source.par_id`.
  """
  ordered_ids = list(par_ids)
  # Index the source once (keeping the first row per par_id) instead of
  # re-scanning the whole frame with a boolean mask for every id and column.
  first_match = source.drop_duplicates(subset='par_id').set_index('par_id')
  picked = first_match.loc[ordered_ids]
  return pd.DataFrame({
      'par_id': ordered_ids,
      'community': picked['keyword'].to_numpy(),
      'text': picked['text'].to_numpy(),
      'label': picked['label'].to_numpy(),
  })


# Train and dev frames: par_id, community, text and binary label.
trdf1 = _select_paragraphs(trids.par_id, data)
tedf1 = _select_paragraphs(teids.par_id, data)

# Downsample the negatives: keep every row with label 1 and only the first
# 2 * npos rows with label 0.
is_positive = trdf1.label == 1
pcldf = trdf1[is_positive]
npos = len(pcldf)

negatives_kept = trdf1[trdf1.label == 0][:2 * npos]
training_set1 = pd.concat([pcldf, negatives_kept])


# Training configuration passed to simpletransformers.
task1_model_args = ClassificationArgs(num_train_epochs=5,
                                      eval_batch_size=32,
                                      learning_rate= 4e-5,
                                      train_batch_size=16,
                                      gradient_accumulation_steps= 2,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)

# Binary RoBERTa classifier. use_cuda is set explicitly: simpletransformers
# defaults it to True and raises when no GPU is present, so a CPU-only runtime
# would fail here instead of just training slowly.
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=torch.cuda.is_available())


# Train on the downsampled frame; only the text and label columns are passed.
train_frame = training_set1[['text', 'label']]
task1_model.train_model(train_frame)

# Predict on the dev texts; the second return value is discarded.
dev_texts = tedf1.text.tolist()
preds_task1, _ = task1_model.predict(dev_texts)

# Compare the predictions with the dev labels; the report is returned as a dict.
dev_correct_labels = tedf1.label.tolist()
report = classification_report(
    dev_correct_labels,
    preds_task1,
    target_names=["negative", "positive"],
    output_dict=True,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

  0%|          | 0/2382 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

In [16]:
# Display the dev-set classification report computed in the training cell.
report

{'negative': {'precision': 0.9704690214244355,
  'recall': 0.8844327176781003,
  'f1-score': 0.9254555494202099,
  'support': 1895},
 'positive': {'precision': 0.4032697547683924,
  'recall': 0.7437185929648241,
  'f1-score': 0.5229681978798587,
  'support': 199},
 'accuracy': 0.8710601719197708,
 'macro avg': {'precision': 0.686869388096414,
  'recall': 0.8140756553214622,
  'f1-score': 0.7242118736500343,
  'support': 2094},
 'weighted avg': {'precision': 0.9165661302761297,
  'recall': 0.8710601719197708,
  'f1-score': 0.8872057963368623,
  'support': 2094}}

In [17]:
# Identify misclassified rows of the dev set for analysis.
# Boolean mask aligned to tedf1's index: True where the prediction differs from
# the gold label. Selecting with one mask replaces the row-by-row
# DataFrame.append loop (removed in pandas 2.0, quadratic, and it upcast the
# integer label column to float).
wrong_prediction = pd.Series(
    [gold != pred for gold, pred in zip(dev_correct_labels, preds_task1)],
    index=tedf1.index,
    dtype=bool,
)
misclassified = tedf1[wrong_prediction]



In [19]:
# Display the misclassified dev rows collected in the previous cell.
misclassified # use this dataframe to analyze misclassifications

Unnamed: 0,par_id,community,text,label
0,4046,hopeless,We also know that they can benefit by receivin...,1.0
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1.0
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1.0
5,432,refugee,He depicts demonstrations by refugees at the b...,1.0
8,2001,poor-families,t is remiss not to mention here that not all s...,1.0
...,...,...,...,...
2047,10417,disabled,According to an inside source within the South...,0.0
2064,10435,homeless,They lamented that they have been rendered hom...,0.0
2071,10442,hopeless,"If we use depression as an example , this is a...",0.0
2073,10444,poor-families,"Desertification which affects Yunusari , Yusuf...",0.0


In [22]:
# Save the dev predictions to dev.txt; each prediction is wrapped in a
# one-element list so labels2file writes one label per line.
labels2file([[k] for k in preds_task1], 'dev.txt')

In [21]:
# Run predictions on the test data and save them to a file.
dpm_test = DontPatronizeMe('', 'task4_test.tsv')
dpm_test.load_test()

test_texts = dpm_test.test_set_df.text.to_list()
preds_test, _ = task1_model.predict(test_texts)

# One predicted label per line.
labels2file([[label] for label in preds_test], 'test.txt')
   

  0%|          | 0/3832 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]