In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the SMS spam CSV from the Kaggle input directory, decoding it as Latin-1.
data = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)

In [None]:
# Keep only the 'v1' and 'v2' columns; every other column in the CSV is dropped.
data = data.loc[:, ['v1', 'v2']]

In [None]:
# Give the two columns descriptive names. The result is reassigned instead of
# using inplace=True, so the cell produces a new frame rather than mutating
# the one returned by the previous cell.
data = data.rename(columns={"v1": "label", "v2": "msg"})

In [None]:
# Preview the first rows of the frame as the cell's output.
data.head()

In [None]:
# Pull the message bodies and their labels out of the frame as plain Python lists.
texts, labels = (data[column].tolist() for column in ('msg', 'label'))

In [None]:
# Encode the string labels as integers: "ham" -> 0, anything else -> 1.
# The values are read from data['label'] rather than from `labels` itself, so
# re-running this cell is idempotent (re-encoding an already-encoded list
# would turn every entry into 1).
labels = [0 if label == "ham" else 1 for label in data['label']]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages; the held-out part is split again into
# validation and test sets in a later cell. Stratifying on the labels keeps
# the class ratio the same in both parts, and the fixed random_state makes
# the split identical on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.3, stratify=labels, random_state=42
)

In [None]:
# Split the held-out messages in half: one half becomes the test set, the
# other stays as the validation set. The fixed random_state makes the split
# reproducible.
# NOTE: this cell rebinds val_texts / val_labels, which are also its inputs,
# so re-running it without first re-running the previous split halves the
# validation set again.
test_texts, val_texts, test_labels, val_labels = train_test_split(
    val_texts, val_labels, test_size=0.5, stratify=val_labels, random_state=42
)

In [None]:
# Number of messages in each split; used later as accuracy denominators.
train_size, val_size, test_size = (
    len(split_texts) for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
!pip install transformers

In [None]:
from transformers import DistilBertTokenizerFast
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenize each split separately, with truncation and padding enabled.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
import torch

class SMSDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-sample encodings with their labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name to a
            sequence that can be indexed by sample position.
        labels: Sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus the sample's label
        under the ``'labels'`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an SMSDataset.
train_dataset, val_dataset, test_dataset = (
    SMSDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the 'distilbert-base-uncased' checkpoint into a sequence-classification
# model and move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

In [None]:
# Training batches are reshuffled every epoch. Evaluation metrics do not
# depend on batch order, so the validation loader is left unshuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are set explicitly to the defaults of the transformers
# implementation so the optimisation behaviour stays the same.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

for epoch in tqdm(range(5)):
    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    correct = torch.zeros((), dtype=torch.long, device=device)
    seen = 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` list is not overwritten.
        batch_labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        # Weight the batch-mean loss by the batch size so that the epoch
        # figure is a true per-sample average even when the last batch is short.
        batch_size = batch_labels.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size
        predictions = outputs.logits.argmax(-1)
        correct += (predictions == batch_labels).sum()

    # max(..., 1) only guards against an empty loader.
    print("Loss:", running_loss / max(seen, 1))
    accuracy = 100 * correct / max(seen, 1)
    print("Training accuracy:", accuracy.item())

    # ---- Validation pass ----
    model.eval()

    # Reset the accumulators so the validation figures do not include the
    # training loss / hits from the pass above.
    running_loss = 0.0
    correct = torch.zeros((), dtype=torch.long, device=device)
    seen = 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs.loss

            batch_size = batch_labels.size(0)
            running_loss += loss.item() * batch_size
            seen += batch_size
            predictions = outputs.logits.argmax(-1)
            correct += (predictions == batch_labels).sum()

    print("Loss:", running_loss / max(seen, 1))
    accuracy = 100 * correct / max(seen, 1)
    print("validation accuracy:", accuracy.item())
    
    

In [None]:
# Evaluate the fine-tuned model on the held-out test set.
# Batch order does not affect the metrics, so the loader is not shuffled.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Start from fresh accumulators so nothing leaks in from the training cell.
running_loss = 0.0
correct = torch.zeros((), dtype=torch.long, device=device)
seen = 0
model.eval()

# No gradients are needed for evaluation; skipping them saves memory and time.
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` list is not overwritten.
        batch_labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Weight the batch-mean loss by the batch size so the reported figure
        # is a per-sample average even when the last batch is short.
        batch_size = batch_labels.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size
        predictions = outputs.logits.argmax(-1)
        correct += (predictions == batch_labels).sum()

# Divide by the number of test samples actually evaluated (not by the
# validation set size); max(..., 1) only guards against an empty loader.
print("Loss:", running_loss / max(seen, 1))
accuracy = 100 * correct / max(seen, 1)
print("test accuracy:", accuracy.item())