In [2]:
import json
import os
import os.path as osp
import numpy as np
from google.colab import drive
import torch
from torchvision.datasets import Flowers102
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
pip install evaluate

In [4]:
#from Huggingface
! pip install -q datasets 
! pip install -q evaluate 

In [None]:
pip install --no-cache-dir transformers sentencepiece

In [31]:
import evaluate
import pandas as pd
import re
import unicodedata
import transformers
import tensorflow as tf
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments,Trainer
from transformers import TFAutoModel, AutoTokenizer
from transformers import DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [7]:
drive.mount('/content/drive')
datadir = "/content/drive/My Drive/HW3/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import warnings, logging
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

# **1. distilbert-base-uncased model**
 

In [9]:
os.chdir(datadir)
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
X_train = train.drop(columns=["keyword", "location", "target"])
y_train = train["target"]

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size=0.25, random_state = 42)

In [33]:
model_name="distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [13]:
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
train_data.head()

Unnamed: 0,id,text,target
5151,7345,@dicehateme @PuppyShogun This makes sense. Pap...,0
6351,9081,'@CatoInstitute: The causes of federal failure...,0
3443,4920,Well as I was chaning an iPad screen it fuckin...,0
7164,10265,the war on drugs has turned the U.S. into a WA...,1
7037,10083,Obama Declares Disaster for Typhoon-Devastated...,1


In [14]:
train_dataset = Dataset.from_pandas(train_data)
val_dataset=Dataset.from_pandas(val_data)
train_dataset = train_dataset.remove_columns(['__index_level_0__'])
val_dataset = val_dataset.remove_columns(['__index_level_0__'])

In [15]:
dd = DatasetDict({"train": train_dataset, "val":val_dataset})

In [16]:
dd.set_format("pandas")

In [18]:
def tokenize_batch(batch):
    encodings = tokenizer(
    batch["text"].tolist(),
    padding=True,
    return_tensors="tf",
    )
    batch_dict = {key: value.tolist() for key, value in batch.items() if key in ['id','target']}
    encodings_dict = {key:value.numpy().tolist() for key, value in encodings.items()}
    return {**batch_dict, **encodings_dict}

In [21]:
data_encoded = dd.map(tokenize_batch, batched=True, batch_size =None)
data_encoded.set_format("tf")

Map:   0%|          | 0/5709 [00:00<?, ? examples/s]

Map:   0%|          | 0/1904 [00:00<?, ? examples/s]

In [34]:
class DistilBertForTweetClassification(tf.keras.Model):
    def __init__(self, model_name: str , num_classes: int):
        super().__init__()
        self.transformer = TFDistilBertModel.from_pretrained(model_name)
        self.dense = tf.keras.layers.Dense(512, activation="relu")
        self.dense2 = tf.keras.layers.Dense(64, activation="relu")
        self.dropout = tf.keras.layers.Dropout(0.4)
        self.output_layer = tf.keras.layers.Dense(num_classes, activation="sigmoid")
    
    def call(self,x):
        outputs = self.transformer(x)
        cls_last_hidden = outputs.last_hidden_state[:,0,:]
        x = self.dropout(cls_last_hidden)
        x = self.dense(x)
        x = self.dense2(x)
        x = self.output_layer(x)
        
        return x

In [23]:
def convert_to_tf_dataset(dataset):
    input_ids = dataset["input_ids"]
    attention_mask = dataset["attention_mask"]
    target = dataset["target"]
    return {"input_ids":input_ids, "attention_mask":attention_mask}, target

In [24]:
train_data_inputs = tf.data.Dataset.from_tensor_slices(convert_to_tf_dataset(data_encoded["train"]))
val_data_inputs = tf.data.Dataset.from_tensor_slices(convert_to_tf_dataset(data_encoded["val"]))

In [25]:
batch_size =32
buffer_size=500
train_data_inputs = train_data_inputs.shuffle(buffer_size).batch(batch_size)
val_data_inputs = val_data_inputs.batch(batch_size)

In [35]:
model = DistilBertForTweetClassification(model_name, 1)

In [36]:
early_stopping = EarlyStopping(
  monitor="val_loss",
  patience=2,
  verbose=1,
  restore_best_weights=True
)

class LearningRatePrinter(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        lr = self.model.optimizer.learning_rate.numpy()
        print(f"Learning rate for epoch {epoch + 1}: {lr}")
        
lr_printer = LearningRatePrinter()

In [37]:
lr_schedule = ExponentialDecay(1e-5, len(train_dataset), 1e-4)

In [38]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

In [None]:
history = model.fit(train_data_inputs,
          epochs=10,
          validation_data=val_data_inputs,
          callbacks=[early_stopping, lr_printer])

Learning rate for epoch 1: 9.999999747378752e-06
Epoch 1/10
Learning rate for epoch 2: 7.5038578870589845e-06
Epoch 2/10

In [None]:
os.chdir(datadir)
test = pd.read_csv("test.csv")
test_data = test[["id","text"]]
encoded_texts = tokenizer(
list(test_data["text"].values),
padding=True,
return_tensors="tf",
)

In [None]:
inputs = (
encoded_texts["input_ids"],
encoded_texts["attention_mask"],
)

In [None]:
pred = model.predict(inputs)



In [None]:
test_data["target"] = np.round(pred).astype(int)
test_data.head()

Unnamed: 0,id,text,target
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1


In [None]:
final_output = test_data[["id","target"]]
final_output.to_csv("submission.csv", index= False)

In [None]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Accuracy: 83.29

# **2. Microsoft DeBERTa model**

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_name = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast =False)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
os.chdir(datadir)
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
train['target'] = train['target'].astype(float)
train = train.fillna('[N]')
train['input'] = "[CLS]" + train.text

In [None]:
ds = Dataset.from_pandas(train)
ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target'],
    num_rows: 7613
})

In [None]:
def tok_func(x): 
  return tokenizer(x["input"])

In [None]:
#Parallel processing
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

In [None]:
row = tok_ds[0]
row['input'], row['input_ids']

('[CLS]Our Deeds are the Reason of this [HB]earthquake[HE] May ALLAH Forgive us all',
 [1,
  1,
  581,
  65453,
  281,
  262,
  18037,
  265,
  291,
  128004,
  10612,
  128005,
  903,
  4924,
  17018,
  43632,
  381,
  305,
  2])

In [None]:
tok_ds = tok_ds.rename_columns({'target':'labels'})

In [None]:
#Creating training and validation set
#dds = DictionaryDataset
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5709
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1904
    })
})

Model training

In [None]:
preds_cb = np.arange(1)
labels_cb = np.arange(1)

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("f1")
    preds, labels = eval_preds
    if len(preds) > 0:
        global preds_cb
        global labels_cb
        preds_cb = np.copy(preds)
        labels_cb = np.copy(labels)   
    return metric.compute(predictions=preds, references=labels)

In [None]:
#Parameters for training
lr = 5e-6
bs = 4 
gradient_accumulation_steps = 1
epochs = 2
bs2 = 4
gradient_accumulation_steps2 = 1
epochs2 = 1

In [None]:
def train_model(dataset, tokenizer, seed=42, lr=5e-6, epochs=2, bs=4,
                model_name='microsoft/deberta-v3-large', gradient_accumulation_steps=1,
                reshuffle=0, epochs2=1, bs2=4, gradient_accumulation_steps2=1, fp16=True):
    
    dds = dataset.train_test_split(0.25, seed=seed)
    
    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1,
                             lr_scheduler_type='cosine', fp16=True,
                             evaluation_strategy="epoch", per_device_train_batch_size=bs,
                             per_device_eval_batch_size=bs*2, num_train_epochs=epochs,
                             weight_decay=0.01, report_to='none',
                             gradient_accumulation_steps=gradient_accumulation_steps,
                             save_steps=-1)
    
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
    
    trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)
    
    trainer.train()
    model = trainer.model 
    dds = tok_ds.train_test_split(0.25, seed=42)
    epochs=epochs2
    args = TrainingArguments('outputs', learning_rate=lr/2, warmup_ratio=0.1,
                                 lr_scheduler_type='cosine', fp16=True,
                                 evaluation_strategy="epoch", per_device_train_batch_size=bs2,
                                 per_device_eval_batch_size=bs2*2, num_train_epochs=epochs2,
                                 weight_decay=0.01, report_to='none',
                                 gradient_accumulation_steps=gradient_accumulation_steps2,
                                 save_steps=-1)
    trainer = Trainer(model, args, train_dataset=dataset, eval_dataset=dds['test'],
                      tokenizer=tokenizer, compute_metrics=compute_metrics)
    trainer.train()
    return trainer

In [None]:
trainer = train_model(dataset=tok_ds, tokenizer=tokenizer, lr=lr, seed=42,
                          epochs=epochs, bs=bs, model_name=model_name,
                          gradient_accumulation_steps=gradient_accumulation_steps,
                          epochs2=epochs2, bs2=bs2,
                          gradient_accumulation_steps2=gradient_accumulation_steps2, fp16=True)

Epoch,Training Loss,Validation Loss,F1
1,0.1585,0.152358,0.715373
2,0.1206,0.150801,0.750363


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1
1,0.1157,0.125995,0.761062


In [None]:
trainer.evaluate()

{'eval_loss': 0.12599541246891022,
 'eval_f1': 0.7610619469026549,
 'eval_runtime': 22.1327,
 'eval_samples_per_second': 86.027,
 'eval_steps_per_second': 10.753,
 'epoch': 1.0}

In [None]:
os.chdir(datadir)
eval_df = pd.read_csv('test.csv')
eval_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
eval_df = clean_df(eval_df)

In [None]:
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [None]:
predictions = torch.empty((0,len(eval_df)))
predictions

tensor([], size=(0, 3263))

In [None]:
preds = trainer.predict(eval_ds).predictions.astype(float)
prediction = torch.tensor(preds)
predictions = torch.cat([predictions,prediction.T],dim=0)

In [None]:
preds = predictions.numpy()

In [None]:
preds2 = [ 1 if element > 0.6 else 0 for element in preds.squeeze()]
preds = preds2

In [None]:
import datasets

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'target': preds
})
submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

22746

In [None]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Accuracy: 84.55

# **3. Microsoft DeBERTa model on cleaned data**

In [None]:
#Predefined special tokens
tokenizer.all_special_tokens

['[CLS]',
 '[SEP]',
 '[UNK]',
 '[PAD]',
 '[MASK]',
 '[MB]',
 '[ME]',
 '[U]',
 '[HB]',
 '[HE]']

In [None]:
new_token_list = [
    "[MB]", #mention beginning
    "[ME]", #mention end
    "[U]",  #URL
    "[HB]", #hashtag beginning
    "[HE]"  #hashtag end 
    ]

tokenizer.add_special_tokens({'additional_special_tokens': new_token_list})

0

In [None]:
os.chdir(datadir)
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
train['target'] = train['target'].astype(float)

Data Cleaning

In [None]:
def replace_url(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Replaces URLs in the specified column of a pandas DataFrame with the '[U]' placeholder.
    
    Args:
        df (pandas.DataFrame): The DataFrame containing the column to modify.
        column (str): The name of the column to modify.
        
    Returns:
        pandas.DataFrame: The modified DataFrame with URLs replaced with '[U]'.
    """
    df[column] = df[column].replace(r"(http|https)://[^\s]+", "[U]", regex=True)
    return df

In [None]:
def wrap_urls(df, col_name):
    def wrap_url(match):
        return "[UB]{}[UE]".format(match.group())
    df[col_name] = df[col_name].apply(lambda x: re.sub(r"(https?://\S+)", wrap_url, x))
    return df

In [None]:
def replace_mentions(df: pd.DataFrame, column: str) -> pd.DataFrame:
    # Use a regular expression to match mentions, which start with "@" followed by letters, numbers, or underscores
    df[column] = df[column].str.replace(r"@\w+", "[M]")
    return df

In [None]:
def wrap_mentions(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    def replace_mention(match):
        return f"[MB]{match.group()}[ME]"
    df[col_name] = df[col_name].apply(lambda x: re.sub(r"@\w+", replace_mention, x))
    return df

In [None]:
def wrap_hashtags(df: pd.DataFrame, column: str) -> pd.DataFrame:
    def replace_hashtag(match):
        return "[HB]{}[HE]".format(match.group())
    df[column] = df[column].apply(lambda x: re.sub(r"#\w+", replace_hashtag, x))
    return df

In [None]:
def remove_non_ascii(df, column):
    df[column] = df[column].apply(lambda x: "".join(char for char in x if ord(char) < 128))
    return df

In [None]:
def remove_nan(df, column):
    """Remove all occurrences of the string "nan" from a column in a DataFrame."""
    df[column] = df[column].apply(lambda x: x.replace("nan", "") if isinstance(x, str) else x)
    return df

In [None]:
def remove_special_cases(df, column):
    """Remove all occurrences of certain strings from a column in a DataFrame."""
    df[column] = df[column].apply(lambda x: x.replace("%20", " "))
    df[column] = df[column].apply(lambda x: x.replace("&amp;", " "))
    df[column] = df[column].apply(lambda x: x.replace("\n", " "))
    df[column] = df[column].apply(lambda x: x.replace("@", ""))
    df[column] = df[column].apply(lambda x: x.replace("#", ""))
    df[column] = df[column].apply(lambda x: x.replace("Ûª", "'"))
    df[column] = df[column].apply(lambda x: x.replace("Û÷", "'"))
    df[column] = df[column].apply(lambda x: x.replace("&lt;", ""))
    df[column] = df[column].apply(lambda x: x.replace("&gt", ""))
    df[column] = df[column].apply(lambda x: x.replace("[KB][N][KE]", ""))
    df[column] = df[column].apply(lambda x: x.replace("[LB][N][LE]", ""))
    return df

In [None]:
def clean_df(df):
    df = df.fillna('[N]')
    df['input'] = "[CLS]" + df.text
    df = wrap_mentions(df, 'text')
    df = wrap_mentions(df, 'input')
    df = wrap_hashtags(df, 'text')
    df = wrap_hashtags(df, 'input')
    df = remove_special_cases(df, 'input')
    df = remove_special_cases(df, 'text')
    df = remove_non_ascii(df, 'input')
    df = replace_url(df, 'text')
    df = replace_url(df, 'input')
    return df

In [None]:
train = clean_df(train)

In [None]:
ds = Dataset.from_pandas(train)
ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input'],
    num_rows: 7613
})

In [None]:
def tok_func(x): return tokenizer(x["input"])

In [None]:
#Parallel processing
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

In [None]:
row = tok_ds[0]
row['input'], row['input_ids']

('[CLS]Our Deeds are the Reason of this [HB]earthquake[HE] May ALLAH Forgive us all',
 [1,
  1,
  581,
  65453,
  281,
  262,
  18037,
  265,
  291,
  128004,
  10612,
  128005,
  903,
  4924,
  17018,
  43632,
  381,
  305,
  2])

In [None]:
tok_ds = tok_ds.rename_columns({'target':'labels'})

In [None]:
#Creating training and validation set
#dds = DictionaryDataset
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5709
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1904
    })
})

Model training

In [None]:
preds_cb = np.arange(1)
labels_cb = np.arange(1)

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("f1")
    preds, labels = eval_preds
    if len(preds) > 0:
        global preds_cb
        global labels_cb
        preds_cb = np.copy(preds)
        labels_cb = np.copy(labels)   
    return metric.compute(predictions=preds, references=labels)

In [None]:
#Parameters for training
lr = 5e-6
bs = 4 
gradient_accumulation_steps = 1
epochs = 2
bs2 = 4
gradient_accumulation_steps2 = 1
epochs2 = 1

In [None]:
def train_model(dataset, tokenizer, seed=42, lr=5e-6, epochs=2, bs=4,
                model_name='microsoft/deberta-v3-large', gradient_accumulation_steps=1,
                reshuffle=0, epochs2=1, bs2=4, gradient_accumulation_steps2=1, fp16=True):
    
    dds = dataset.train_test_split(0.25, seed=seed)
    
    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1,
                             lr_scheduler_type='cosine', fp16=True,
                             evaluation_strategy="epoch", per_device_train_batch_size=bs,
                             per_device_eval_batch_size=bs*2, num_train_epochs=epochs,
                             weight_decay=0.01, report_to='none',
                             gradient_accumulation_steps=gradient_accumulation_steps,
                             save_steps=-1)
    
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
    
    trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)
    
    trainer.train()
    model = trainer.model 
    dds = tok_ds.train_test_split(0.25, seed=42)
    epochs=epochs2
    args = TrainingArguments('outputs', learning_rate=lr/2, warmup_ratio=0.1,
                                 lr_scheduler_type='cosine', fp16=True,
                                 evaluation_strategy="epoch", per_device_train_batch_size=bs2,
                                 per_device_eval_batch_size=bs2*2, num_train_epochs=epochs2,
                                 weight_decay=0.01, report_to='none',
                                 gradient_accumulation_steps=gradient_accumulation_steps2,
                                 save_steps=-1)
    trainer = Trainer(model, args, train_dataset=dataset, eval_dataset=dds['test'],
                      tokenizer=tokenizer, compute_metrics=compute_metrics)
    trainer.train()
    return trainer

In [None]:
trainer = train_model(dataset=tok_ds, tokenizer=tokenizer, lr=lr, seed=42,
                          epochs=epochs, bs=bs, model_name=model_name,
                          gradient_accumulation_steps=gradient_accumulation_steps,
                          reshuffle=reshuffle, epochs2=epochs2, bs2=bs2,
                          gradient_accumulation_steps2=gradient_accumulation_steps2, fp16=True)

Epoch,Training Loss,Validation Loss,F1
1,0.1554,0.141534,0.58383
2,0.1226,0.146558,0.749638


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,F1
1,0.1184,0.123141,0.770563


In [None]:
trainer.evaluate()

{'eval_loss': 0.12314070016145706,
 'eval_f1': 0.7705627705627704,
 'eval_runtime': 17.9725,
 'eval_samples_per_second': 105.94,
 'eval_steps_per_second': 13.242,
 'epoch': 1.0}

In [None]:
os.chdir(datadir)
eval_df = pd.read_csv('test.csv')
eval_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
eval_df = clean_df(eval_df)

In [None]:
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [None]:
predictions = torch.empty((0,len(eval_df)))
predictions

tensor([], size=(0, 3263))

In [None]:
preds = trainer.predict(eval_ds).predictions.astype(float)
prediction = torch.tensor(preds)
predictions = torch.cat([predictions,prediction.T],dim=0)

In [None]:
preds = predictions.numpy()

In [None]:
preds2 = [ 1 if element > 0.6 else 0 for element in preds.squeeze()]
preds = preds2

In [None]:
import datasets

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'target': preds
})
submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

22746

In [None]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Accuracy: 84.49%