In [None]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [137]:
import torch
from transformers import BertTokenizer
import os
import time
import pandas as pd
from datasets import load_metric
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
import csv

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

## Load dataset

In [None]:
# Read the dataset CSV from the mounted Drive folder and show its row count.
dataset_path = "/content/drive/MyDrive/Colbert_dataset/dataset.csv"
df = pd.read_csv(dataset_path)
len(df)

200000

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [None]:
# Encode the boolean `humor` flag as an integer `label` column (False -> 0, True -> 1).
humor_as_int = df["humor"].astype(int)
df["label"] = humor_as_int
df.head()

Unnamed: 0,text,humor,label
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False,0
1,Watch: darvish gave hitter whiplash with slow ...,False,0
2,What do you call a turtle without its shell? d...,True,1
3,5 reasons the 2016 election feels so personal,False,0
4,"Pasco police shot mexican migrant from behind,...",False,0


In [40]:
# Second name for the full frame. This is an alias, not a copy, so in-place
# changes to `df` are visible through `df_train_200` as well.
df_train_200 = df

In [41]:
# Hold out 5% of the rows as the test split. The fraction now goes through
# `test_size` (the variable used to say 0.2 while the call hard-coded 0.05),
# and the shuffle is seeded so the split is the same after a kernel restart.
test_size = 0.05
df_train, df_test = train_test_split(df, test_size=test_size, random_state=42)
print(len(df_train), len(df_test))

190000 10000


In [111]:
# Keep the first 10 rows of the training split as a small sample.
df_train_sample = df_train.head(10)
len(df_train_sample)

10

In [112]:
# Keep the first 10 rows of the test split as a small sample.
df_test_sample = df_test.head(10)
len(df_test_sample)

10

In [44]:
# Remove the target columns from the test split. `errors='ignore'` keeps the
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
df_test = df_test.drop(columns=['label', 'humor'], errors='ignore')
df_test.head()

Unnamed: 0,text
187168,U.s. hunter tied to killing of cecil the lion ...
52304,Ron jeremy banned from porn awards after admit...
89378,What did the zero say to the eight? nice belt!
194235,Gabby giffords is ready to campaign for hillar...
1916,A recent study shows that 51.9% of the uk are ...


In [45]:
# Remove the target columns from the test sample. `errors='ignore'` keeps the
# cell idempotent when it is run more than once.
df_test_sample = df_test_sample.drop(columns=['label', 'humor'], errors='ignore')

In [47]:
# Name of the pretrained checkpoint whose tokenizer is loaded on the next line.
bert_model = 'bert-base-uncased'
# Module-level tokenizer; `return_id` reads this global when encoding text.
tokenizer = BertTokenizer.from_pretrained(bert_model)

In [48]:
# Fetch the Punkt sentence-tokenizer data and import `sent_tokenize`.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed NLTK version.
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
# Show the value in the first row, first column of the training split.
df_train.iloc[0,0]

'Lawmakers are again fighting over drilling in the fragile arctic wildlife refuge'

In [50]:
# Split the first training row's text into sentences with sent_tokenize.
sent_tokenize(df_train.iloc[0,0])

['Lawmakers are again fighting over drilling in the fragile arctic wildlife refuge']

In [51]:
# Token length each individual sentence is encoded to (passed to return_id as `length`).
MAX_SENTENCE_LENGTH = 20
# Number of sentence slots encoded per text; missing sentences are encoded as ''.
MAX_SENTENCES = 5
# Token length the full, unsplit text is encoded to.
MAX_LENGTH = 100

In [52]:
# List the column names of the training split.
df_train.columns.tolist()

['text', 'humor', 'label']

In [53]:
# Pick the input and target columns by position: column 0 feeds the model,
# column 2 is the target.
input_categories = [df_train.columns[0]]
output_categories = [df_train.columns[2]]
TARGET_COUNT = len(output_categories)
print(input_categories, output_categories, TARGET_COUNT)

['text'] ['label'] 1


In [90]:
def return_id(str1, str2, truncation_strategy, length):
    """Encode one text (or text pair) into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``.

    Args:
        str1: First text segment.
        str2: Optional second segment; ``None`` for a single-segment input.
        truncation_strategy: Truncation mode forwarded to the tokenizer,
            e.g. ``'longest_first'``.
        length: Target sequence length. It is used as ``max_length`` for
            truncation and as the length the result is padded to.

    Returns:
        ``[input_ids, input_masks, input_segments]``. Each is a list of ints;
        the mask is 1 for real tokens and 0 for padding. If the chosen
        strategy leaves the encoding longer than ``length``, no padding is
        added and the lists keep their longer size.
    """
    # Forward the caller's strategy through `truncation`. The previous call
    # passed `truncation=True` together with the deprecated
    # `truncation_strategy` keyword, so the argument was not what drove
    # truncation. For 'longest_first' the result is unchanged.
    inputs = tokenizer.encode_plus(
        str1,
        str2,
        add_special_tokens=True,
        max_length=length,
        truncation=truncation_strategy,
    )
    input_ids = inputs["input_ids"]
    input_segments = inputs["token_type_ids"]

    # Right-pad all three sequences to `length`; padded positions get mask 0
    # and segment id 0.
    padding_length = length - len(input_ids)
    input_masks = [1] * len(input_ids) + [0] * padding_length
    input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
    input_segments = input_segments + [0] * padding_length

    return [input_ids, input_masks, input_segments]

In [106]:
def compute_input_arrays(df, columns, tokenizer):
    """Encode every text in ``df`` into fixed-length BERT input arrays.

    Each text is split into sentences with ``sent_tokenize``. The first
    ``MAX_SENTENCES`` sentences (missing ones are replaced by ``''``) are each
    encoded to ``MAX_SENTENCE_LENGTH`` tokens, and the full text is encoded
    once more to ``MAX_LENGTH`` tokens.

    Args:
        df: DataFrame holding the text to encode.
        columns: Column label or list of labels. The ``'text'`` column is used
            when it is listed; otherwise the first listed column is used.
            (The previous body always read ``row.text`` regardless of this
            argument.)
        tokenizer: Kept for interface compatibility. It is not used here;
            encoding is delegated to ``return_id``.

    Returns:
        List of ``MAX_SENTENCES * 3 + 3`` ``int32`` arrays ordered as
        ``ids, masks, segments`` for each sentence slot, followed by
        ``ids, masks, segments`` for the full text. Each array has one row
        per row of ``df``.
    """
    column_names = [columns] if isinstance(columns, str) else list(columns)
    text_column = 'text' if 'text' in column_names else column_names[0]
    texts = df[text_column]

    array_count = (MAX_SENTENCES * 3) + 3
    model_input = [[] for _ in range(array_count)]

    # Iterating the column directly avoids the per-row Series that iterrows()
    # builds; passing the total lets tqdm render a real progress bar.
    for text in tqdm(texts, total=len(texts)):
        sentences = sent_tokenize(text)

        # Collect this row's encodings in output order:
        # (ids, masks, segments) per sentence slot, then for the full text.
        encoded = []
        for slot in range(MAX_SENTENCES):
            sentence = sentences[slot] if slot < len(sentences) else ''
            encoded.extend(return_id(sentence, None, 'longest_first', MAX_SENTENCE_LENGTH))
        encoded.extend(return_id(text, None, 'longest_first', MAX_LENGTH))

        for bucket, values in zip(model_input, encoded):
            bucket.append(values)

    return [np.asarray(bucket, dtype=np.int32) for bucket in model_input]

In [113]:
# Encode the small train and test samples.
sample_inputs = compute_input_arrays(df_train_sample, input_categories, tokenizer)
sample_test_inputs = compute_input_arrays(df_test_sample, input_categories, tokenizer)

10it [00:00, 643.50it/s]
10it [00:00, 714.63it/s]


In [114]:
# Number of encoded arrays, rows in the first array, and tokens per row.
print(len(sample_inputs), len(sample_inputs[0]), len(sample_inputs[0][0]))

18 10 20


In [123]:
# Print the shape of every encoded array in order.
for encoded_array in sample_inputs:
  print(encoded_array.shape)

(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 20)
(10, 100)
(10, 100)
(10, 100)


In [139]:
# Save the sample encodings as a numeric CSV: one line per example, with all
# encoded arrays laid side by side in their list order. Passing the list of
# 2-D arrays straight to csv.writer.writerows stored each token row as the
# string repr of a NumPy array inside a single cell, which cannot be read
# back as numbers.
np.savetxt('sample_input.csv', np.hstack(sample_inputs), fmt='%d', delimiter=',')

In [140]:
# Check the dimensions of the full frame before encoding it.
df_train_200.shape

(200000, 3)

In [141]:
# Encode every row of the full frame (the recorded run took about six minutes).
inputs = compute_input_arrays(df_train_200, input_categories, tokenizer)

200000it [06:01, 553.75it/s]


In [67]:
# Number of encoded arrays, rows in the first array, and tokens per row.
print(len(inputs), len(inputs[0]), len(inputs[0][0]))

18 200000 20


In [142]:
# Save the full encodings as a numeric CSV: one line per example, with all
# encoded arrays laid side by side in their list order. Passing the list of
# 2-D arrays straight to csv.writer.writerows stored each token row as the
# string repr of a NumPy array inside a single cell, which cannot be read
# back as numbers. np.hstack makes one temporary copy of the data.
np.savetxt('input_200k.csv', np.hstack(inputs), fmt='%d', delimiter=',')

In [143]:
# Shape of the first encoded array (token ids of the first sentence slot).
inputs[0].shape

(200000, 20)

In [144]:
# Token ids of the first sentence slot for the first row.
inputs[0][0]

array([  101,  3533,  7226,  2368,  3513,  2041, 12609,  7226,  1024,
        1005,  4364,  1010,  1045,  1005,  1049,  2025,  2770,  1005,
         102,     0], dtype=int32)

In [145]:
# Token ids of the second sentence slot for the first row. That row has no
# second sentence, so the slot encodes '' and holds only special tokens plus padding.
inputs[3][0]

array([101, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [148]:
def compute_output_arrays(df, columns):
    """Return the selected columns of ``df`` as a NumPy array.

    Args:
        df: Source DataFrame.
        columns: Column label(s) to extract. A list of labels yields a 2-D
            array with one column per label.

    Returns:
        ``numpy.ndarray`` holding the values of ``df[columns]``.
    """
    selected = df[columns]
    return np.asarray(selected)

In [149]:
# Extract the target column(s) of the full frame as an array.
outputs = compute_output_arrays(df_train_200, output_categories)

In [151]:
# One row per example, one column per target.
outputs.shape

(200000, 1)

In [152]:
# Bind the validation names to the full-dataset arrays (aliases, not copies).
# NOTE(review): these cover every row of the dataset, so they overlap with any
# training subset drawn from the same frame — confirm this is intended.
valid_inputs = inputs
valid_outputs = outputs