In [None]:
!pip install transformers
!pip install simpletransformers



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
import re
import sys
from simpletransformers.classification import ClassificationModel, ClassificationArgs
warnings.filterwarnings('ignore')

## Import the dataset

Here we load the dataset and reorganize the dataframes to match the input format expected by the transformer models.

*Important: 0 represents SARCASM and 1 is NOT_SARCASM*

In [None]:
# Load the train and test splits (JSON Lines: one record per line).
url = 'https://raw.githubusercontent.com/CS410Fall2020/ClassificationCompetition/main/data/train.jsonl'
df = pd.read_json(url, lines=True)

url_test = 'https://raw.githubusercontent.com/CS410Fall2020/ClassificationCompetition/main/data/test.jsonl'
df_test = pd.read_json(url_test, lines=True)


# We don't use the context
del df["context"]
del df_test["context"]

# TEST: keep the ids aside and reduce the test data to the response texts only
df_test_id = df_test["id"]
df_test = df_test["response"]
print(df_test)

# Move the response column to the front, so the text comes before the label
response = df['response']
df.drop(labels=['response'], axis=1, inplace=True)
df.insert(0, 'response', response)

# Labels to numbers.
# An explicit mapping keeps the encoding fixed (0 = SARCASM, 1 = NOT_SARCASM)
# instead of depending on which label happens to appear first in the file.
print(df[0:3])
print(df[3000:3003])
label_to_id = {'SARCASM': 0, 'NOT_SARCASM': 1}
unexpected_labels = set(df['label'].unique()) - set(label_to_id)
if unexpected_labels:
    # Fail loudly rather than silently training on NaN labels.
    raise ValueError("Unexpected labels in training data: {}".format(unexpected_labels))
df['label'] = df['label'].map(label_to_id)
print(df[0:3])
print(df[3000:3003])

0       @USER @USER @USER My 3 year old , that just fi...
1       @USER @USER How many verifiable lies has he to...
2       @USER @USER @USER Maybe Docs just a scrub of a...
3       @USER @USER is just a cover up for the real ha...
4       @USER @USER @USER The irony being that he even...
                              ...                        
1795    @USER @USER @USER is definitely the best out t...
1796    @USER @USER Ye let her out run wild and infect...
1797    @USER @USER @USER Thanks for that , I would ha...
1798    @USER @USER @USER Yes also #found this on #new...
1799    @USER @USER @USER you still need to send the l...
Name: response, Length: 1800, dtype: object
                                            response    label
0  @USER @USER @USER I don't get this .. obviousl...  SARCASM
1  @USER @USER trying to protest about . Talking ...  SARCASM
2  @USER @USER @USER He makes an insane about of ...  SARCASM
                                               response        label
3

### Data cleaning
Here we clean the dataset. We apply the same cleaning to both the train and test dataframes.

In [None]:
def clean_response(text):
    """Remove Twitter placeholders and markup characters from one response.

    The ``@USER`` and ``<URL>`` placeholders and every ``#`` character are
    removed first; afterwards each run of two or more spaces is collapsed into
    a single space and leading/trailing whitespace is trimmed.

    Args:
        text: The raw response string.

    Returns:
        The cleaned response string.
    """
    for token in ('@USER', '<URL>', '#'):
        text = text.replace(token, '')
    # Collapse and trim only after all removals, so that deleting a token can
    # never leave a double space or a leading/trailing space behind.
    return re.sub(r' {2,}', ' ', text).strip()


# TRAIN
df['response'] = df['response'].apply(clean_response)

# TEST (same cleaning as the training data)
df_test = df_test.apply(clean_response)



### Split training data to evaluate our models

In [None]:
# SPLIT TRAIN

# Each row goes to the training split with probability 0.8 and to the
# evaluation split otherwise. The generator is seeded so that the split, and
# therefore the evaluation numbers, are reproducible across runs.
SPLIT_SEED = 42
split = np.random.default_rng(SPLIT_SEED).random(len(df)) < 0.8

df_train = df[split]
df_eval = df[~split]

print(df_train)
print(df_eval)

                                               response  label
0     I don't get this .. obviously you do care or y...      0
1     trying to protest about . Talking about him an...      0
2     He makes an insane about of money from the MOV...      0
4     Pretty Sure the Anti-Lincoln Crowd Claimed Tha...      0
5     -> per your tag line : never judge a book by i...      0
...                                                 ...    ...
4995  You don't . I have purchased a lot on Amazon (...      1
4996  Emotions you say 🤔 never knew that I think I ’...      1
4997  You are so right ... " Yes ! Silence is not Pr...      1
4998  Another lazy delusional voter who takes the wo...      1
4999  I hope you know no news outlet from Nigeria ha...      1

[4046 rows x 2 columns]
                                               response  label
3     Meanwhile Trump won't even release his SAT sco...      0
12    Hey , but what do they have to lose ? Asking f...      0
16    I remember a few months 

### THE MODEL

To try a different model, change the first two arguments of the `ClassificationModel` constructor (the model type and the pretrained model name), choosing from the list at https://huggingface.co/transformers/pretrained_models.html.

In [None]:
from simpletransformers.classification import ClassificationModel

# Create a ClassificationModel: model type, pretrained model name, two output labels.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not require a CUDA device just to construct the model.
model = ClassificationModel('roberta', 'roberta-base', num_labels=2, use_cuda=torch.cuda.is_available())

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




### Train

In [None]:
# Fine-tune the model on the training split only (df_eval is held out).
# NOTE(review): the displayed return value looks like (training steps, training loss) — confirm.
model.train_model(df_train)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4046.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=506.0), HTML(value='')))





(506, 0.5398496986171709)

### Evaluate the model.

In [None]:
# Score the fine-tuned model on the held-out evaluation split.
# NOTE(review): model_outputs / wrong_predictions presumably hold the raw per-example
# outputs and the misclassified examples — confirm against the library docs.
result, model_outputs, wrong_predictions = model.eval_model(df_eval)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=954.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=120.0), HTML(value='')))




In [None]:
# Show the metrics returned by eval_model.
print(result)
# To inspect the raw model outputs as well, uncomment:
#print(model_outputs)

{'mcc': 0.5666282923825283, 'tp': 347, 'tn': 399, 'fp': 80, 'fn': 128, 'eval_loss': 0.45409497395157816}


In [None]:
from sklearn.metrics import f1_score, accuracy_score


def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of ``preds`` measured against ``labels``.

    Args:
        labels: The true label ids.
        preds: The predicted label ids.

    Returns:
        The value computed by ``f1_score`` with ``average='micro'``.
    """
    return f1_score(y_true=labels, y_pred=preds, average='micro')


# Evaluate again, this time passing two extra metrics by keyword; they are
# reported in `result` under the names 'f1' and 'acc'.
result, model_outputs, wrong_predictions = model.eval_model(
    df_eval,
    f1=f1_multiclass,
    acc=accuracy_score,
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=954.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=120.0), HTML(value='')))




In [None]:
# Show the metrics again; the result now also carries the 'f1' and 'acc' entries.
print(result)

{'mcc': 0.5666282923825283, 'tp': 347, 'tn': 399, 'fp': 80, 'fn': 128, 'f1': 0.7819706498951783, 'acc': 0.7819706498951782, 'eval_loss': 0.45409497395157816}


### PREDICT AND GET ANSWER.TXT

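Here we can either use the model trained in the previous step or train again on the entire training dataset. Run only one of the two cells below: both assign `predictions`, so whichever cell runs last determines the contents of answer.txt.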

In [None]:
### (IF YOU WANT TO RETRAIN WITH THE ENTIRE TRAIN DATA)

# Train for three epochs and allow the output directory of a previous run to be overwritten.
model_args = ClassificationArgs()
model_args.num_train_epochs = 3
model_args.overwrite_output_dir = True

# Use the GPU when one is available and fall back to the CPU otherwise.
model_final = ClassificationModel('roberta', 'roberta-base', num_labels=2, use_cuda=torch.cuda.is_available(), args=model_args)
# Train on the complete labelled dataset (no held-out evaluation split).
model_final.train_model(df)
# predict() is given a plain list of texts, so convert the test responses first.
predictions, raw_outputs = model_final.predict(list(df_test))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 3'), FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 1 of 3'), FloatProgress(value=0.0, max=625.0), HTML(value='')))




In [None]:
### (IF YOU WANT TO USE THE TRAINED MODEL WITH TRAIN DATA SPLIT)
# predict() is given a plain list of texts, so convert the test responses first.
# Note: this rebinds `predictions`, replacing the output of the retraining
# cell if that cell was run before this one.
predictions, raw_outputs = model.predict(list(df_test))

print(predictions)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1800.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=225.0), HTML(value='')))


[1 0 0 ... 0 1 1]


In [None]:
# CREATE FILE
from google.colab import files

# Label ids follow the training encoding: 0 = SARCASM, 1 = NOT_SARCASM.
n = 0
sarcasm = 0
not_sarcasm = 0
with open('answer.txt', 'w') as writefile:
    # One "twitter_<n>,<LABEL>" line per prediction, numbered from 1 in prediction order.
    # NOTE(review): assumes the test ids are twitter_1..twitter_N in file order —
    # confirm against df_test_id.
    for n, x in enumerate(predictions, start=1):
        if x == 1:
            writefile.write("twitter_{},NOT_SARCASM\n".format(n))
            not_sarcasm += 1
        else:
            writefile.write("twitter_{},SARCASM\n".format(n))
            sarcasm += 1

# Each counter now matches the label that was actually written to the file.
print("SARCASM: {}".format(sarcasm))
print("NOT_SARCASM: {}".format(not_sarcasm))

719
1081


In [None]:
# DOWNLOAD FILE
# Hand answer.txt to the browser via the google.colab `files` helper, which is
# imported in the cell that writes the file (so that cell must run first).
files.download('answer.txt') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>