In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv("train.tsv",delimiter="\t")
dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0


In [3]:
dataset.isnull().sum()

Unnamed: 0    0
title         0
text          0
subject       0
date          0
label         0
dtype: int64

In [4]:
dataset.drop("Unnamed: 0", axis=1,inplace=False)

Unnamed: 0,title,text,subject,date,label
0,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1
1,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0
2,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1
3,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0
4,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0
...,...,...,...,...,...
29995,U.S. aerospace industry urges Trump to help Ex...,The chief executive of the U.S. Aerospace Indu...,politicsNews,"December 6, 2016",1
29996,Highlights: Hong Kong leader Carrie Lam delive...,The following are highlights of the maiden pol...,worldnews,"October 11, 2017",1
29997,Obama Literally LAUGHS At Claims That Brexit M...,If there s one thing President Barack Obama is...,News,"June 28, 2016",0
29998,Syrian army takes full control of Deir al-Zor ...,The Syrian army and its allies have taken full...,worldnews,"November 2, 2017",1


In [5]:
dataset["article"] = dataset["title"] + " " + dataset["text"]
dataset = dataset[["article","subject","date","label"]]
dataset.head()

Unnamed: 0,article,subject,date,label
0,Ex-CIA head says Trump remarks on Russia inter...,politicsNews,"July 22, 2017",1
1,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,Government News,"Jun 19, 2017",0
2,Federal Reserve governor Powell's policy views...,politicsNews,"November 2, 2017",1
3,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,left-news,"Sep 17, 2016",0
4,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,politics,"May 26, 2017",0


In [6]:
dataset.label.value_counts()

label
0    15478
1    14522
Name: count, dtype: int64

In [7]:
from transformers import AutoTokenizer
from transformers import BertTokenizer, BertModel
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [8]:
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [9]:
def tokenize(X):  
    X = bert_tokenizer(
        text=list(X["article"]),  
        add_special_tokens=True,
        max_length=100,
        truncation=True,
        padding="max_length",  
        return_tensors="pt",
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True
    )
    return X

In [10]:
from datasets import Dataset  
hf_dataset = Dataset.from_pandas(dataset)  
hf_dataset = hf_dataset.map(lambda x: tokenize(x), batched=True) 

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [None]:
hf_dataset = hf_dataset.rename_columns({"label": "labels"})  
hf_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])  

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",  
    num_labels=len(dataset["label"].unique()),  
    torch_dtype="auto"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./test_trainer",  
    evaluation_strategy="epoch", 
    per_device_train_batch_size=8,  
    num_train_epochs=3, 
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",  
)



In [49]:
!pip install accelerate>=0.26.0

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get the predicted class
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    accuracy = accuracy_score(labels, predictions)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [20]:
eval_dataset = pd.read_csv("test.tsv",delimiter="\t")
eval_dataset.drop("Unnamed: 0", axis=1,inplace=False)
eval_dataset["article"] = eval_dataset["title"] + " " + eval_dataset["text"]
eval_dataset = eval_dataset[["article","subject","date","label"]]
eval_dataset.head()

Unnamed: 0,article,subject,date,label
0,Conservatives Will HATE What Donald Trump Just...,News,"February 14, 2016",0
1,Trump victory may create new tension between U...,politicsNews,"November 9, 2016",1
2,WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...,politics,"Nov 9, 2017",0
3,"Democratic Senator Franken to resign: CNN, cit...",politicsNews,"December 7, 2017",1
4,GANG OF DOMESTIC TERRORISTS Violently Attack L...,left-news,"Jan 21, 2017",0


In [21]:
from datasets import Dataset  
hf_evdataset = Dataset.from_pandas(eval_dataset)  
hf_evdataset = hf_evdataset.map(lambda x: tokenize(x), batched=True) 
hf_evdataset = hf_evdataset.rename_columns({"label": "labels"})  
hf_evdataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])  

Map:   0%|          | 0/8267 [00:00<?, ? examples/s]

In [22]:
from transformers import Trainer, TrainingArguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    eval_dataset=hf_evdataset,
    compute_metrics=compute_metrics, 
    ) 

In [23]:
trainer.train()

  0%|          | 0/11250 [00:00<?, ?it/s]

{'loss': 0.1656, 'grad_norm': 0.026309076696634293, 'learning_rate': 4.7777777777777784e-05, 'epoch': 0.13}
{'loss': 0.0706, 'grad_norm': 0.05631306394934654, 'learning_rate': 4.555555555555556e-05, 'epoch': 0.27}
{'loss': 0.0652, 'grad_norm': 9.112504959106445, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 0.0384, 'grad_norm': 0.0035010678693652153, 'learning_rate': 4.111111111111111e-05, 'epoch': 0.53}
{'loss': 0.0399, 'grad_norm': 0.10961001366376877, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.67}
{'loss': 0.0389, 'grad_norm': 0.2363947182893753, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}
{'loss': 0.0263, 'grad_norm': 0.0013656800147145987, 'learning_rate': 3.444444444444445e-05, 'epoch': 0.93}


  0%|          | 0/1034 [00:00<?, ?it/s]

{'eval_loss': 0.017517559230327606, 'eval_accuracy': 0.9968549655255836, 'eval_precision': 0.9968576004644056, 'eval_recall': 0.9968549655255836, 'eval_f1': 0.9968548222061829, 'eval_runtime': 97.7534, 'eval_samples_per_second': 84.57, 'eval_steps_per_second': 10.578, 'epoch': 1.0}
{'loss': 0.0173, 'grad_norm': 0.0010265439050272107, 'learning_rate': 3.222222222222223e-05, 'epoch': 1.07}
{'loss': 0.002, 'grad_norm': 0.00013034077710472047, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 0.0129, 'grad_norm': 0.0018639473710209131, 'learning_rate': 2.777777777777778e-05, 'epoch': 1.33}
{'loss': 0.0121, 'grad_norm': 0.000131679029436782, 'learning_rate': 2.5555555555555554e-05, 'epoch': 1.47}
{'loss': 0.0084, 'grad_norm': 0.00021303861285559833, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}
{'loss': 0.0218, 'grad_norm': 0.0015556636499240994, 'learning_rate': 2.111111111111111e-05, 'epoch': 1.73}
{'loss': 0.0075, 'grad_norm': 0.00014337858010549098, 'learning_rate': 1.888888888888

  0%|          | 0/1034 [00:00<?, ?it/s]

{'eval_loss': 0.014223740436136723, 'eval_accuracy': 0.997701705576388, 'eval_precision': 0.9977042502492396, 'eval_recall': 0.997701705576388, 'eval_f1': 0.997701794062947, 'eval_runtime': 65.4515, 'eval_samples_per_second': 126.307, 'eval_steps_per_second': 15.798, 'epoch': 2.0}
{'loss': 0.0031, 'grad_norm': 0.0005904629360884428, 'learning_rate': 1.4444444444444444e-05, 'epoch': 2.13}
{'loss': 0.0054, 'grad_norm': 0.0015232727164402604, 'learning_rate': 1.2222222222222222e-05, 'epoch': 2.27}
{'loss': 0.0035, 'grad_norm': 0.00015362004342023283, 'learning_rate': 1e-05, 'epoch': 2.4}
{'loss': 0.0002, 'grad_norm': 6.982347986195236e-05, 'learning_rate': 7.777777777777777e-06, 'epoch': 2.53}
{'loss': 0.0, 'grad_norm': 6.561769259860739e-05, 'learning_rate': 5.555555555555556e-06, 'epoch': 2.67}
{'loss': 0.0018, 'grad_norm': 0.00011300406185910106, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}
{'loss': 0.0, 'grad_norm': 5.039016105001792e-05, 'learning_rate': 1.1111111111111112e

  0%|          | 0/1034 [00:00<?, ?it/s]

{'eval_loss': 0.013733165338635445, 'eval_accuracy': 0.9983065198983911, 'eval_precision': 0.9983076607312901, 'eval_recall': 0.9983065198983911, 'eval_f1': 0.9983065638131854, 'eval_runtime': 49.5176, 'eval_samples_per_second': 166.951, 'eval_steps_per_second': 20.881, 'epoch': 3.0}
{'train_runtime': 3222.4872, 'train_samples_per_second': 27.929, 'train_steps_per_second': 3.491, 'train_loss': 0.02428512336804221, 'epoch': 3.0}


TrainOutput(global_step=11250, training_loss=0.02428512336804221, metrics={'train_runtime': 3222.4872, 'train_samples_per_second': 27.929, 'train_steps_per_second': 3.491, 'total_flos': 4624999020000000.0, 'train_loss': 0.02428512336804221, 'epoch': 3.0})

In [34]:
from datasets import Dataset

row_index = 4 
row_text = eval_dataset.iloc[row_index]["article"]


row_data = {"article": [row_text]}
row_dataset = Dataset.from_dict(row_data)


tokenized_row = row_dataset.map(lambda x: tokenizer(x["article"], truncation=True, padding="max_length"), batched=True)


predictions = trainer.predict(tokenized_row)
predicted_logits = predictions.predictions
predicted_labels = predicted_logits.argmax(axis=-1)


label_map = {0: "0", 1: "1"}  
predicted_label = label_map[predicted_labels[0]]

print(f"Text: {row_text}")
print(f"Predicted Label: {predicted_label}")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Predicted Label: 0


In [30]:
eval_dataset

Unnamed: 0,article,subject,date,label
0,Conservatives Will HATE What Donald Trump Just...,News,"February 14, 2016",0
1,Trump victory may create new tension between U...,politicsNews,"November 9, 2016",1
2,WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...,politics,"Nov 9, 2017",0
3,"Democratic Senator Franken to resign: CNN, cit...",politicsNews,"December 7, 2017",1
4,GANG OF DOMESTIC TERRORISTS Violently Attack L...,left-news,"Jan 21, 2017",0
...,...,...,...,...
8262,Russian MP says Flynn was forced to resign to ...,politicsNews,"February 14, 2017",1
8263,Highlights: The Trump presidency on March 7 at...,politicsNews,"March 7, 2017",1
8264,SHOCKER! WAS MUSLIM TERRORIST GAY? Used Gay Da...,left-news,"Jun 13, 2016",0
8265,John McCain and The Cancer of Conflict Patrick...,US_News,"July 21, 2017",0
