In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# IT MEET 2022: Nepali News Classifier using HuggingFace

## Import required datasets

In [None]:
# Read train.csv from the text-it-meet-22 input folder into a DataFrame.
initial_dataset = pd.read_csv("../input/text-it-meet-22/train.csv")
# Bare last expression: the notebook renders the loaded frame as a preview.
initial_dataset

## Preprocessing dataset

### De-duplication

In [None]:
# Remember the shape before removing duplicates so the effect can be reported.
before_de_duplicate_shape = initial_dataset.shape
print(f"The dataset shape before de-duplication was {before_de_duplicate_shape}")

# Keep only the first occurrence of every fully identical row.
initial_dataset = initial_dataset.drop_duplicates()

# Shape afterwards; the row-count difference is the number of duplicates removed.
after_de_duplicate_shape = initial_dataset.shape
print(f"The dataset shape after de-duplication was {after_de_duplicate_shape}")
print(f"The total number of rows that were removed were {before_de_duplicate_shape[0] - after_de_duplicate_shape[0]}")


### Drop NaN values

In [None]:
# Record the shape before dropping rows so the number of removed rows can be reported.
before_drop_na_shape = initial_dataset.shape
print(f"The dataset shape before dropping NaN values was {before_drop_na_shape}")

# Remove every row that has a missing value in any column.
initial_dataset.dropna(inplace=True)

# Shape afterwards; the row-count difference is the number of rows that contained NaN.
after_drop_na_shapee = initial_dataset.shape
print(f"The dataset shape after dropping NaN values was {after_drop_na_shapee}")
print(f"The total number of rows that were removed were {before_drop_na_shape[0] - after_drop_na_shapee[0]}")


### Label Encoding

In [None]:
# Number of distinct values in the "label" column (missing values are not counted).
initial_dataset["label"].nunique()

In [None]:
# Label Encoding

# Import label encoder
from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
initial_dataset['label']= label_encoder.fit_transform(initial_dataset['label'])

In [None]:
# Build an {original label: integer code} lookup from the fitted encoder and display it.
le_name_mapping = {
    original: code
    for original, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
le_name_mapping

### Train-validation split

In [None]:
# import modules
# NOTE(review): LinearRegression is not used anywhere in the visible notebook.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
 
# split the dataset
# Hold out 10% of the rows for validation; random_state=42 fixes the split across runs.
train_dataset, validation_dataset = train_test_split(
    initial_dataset, test_size=0.1, random_state=42)

In [None]:
# Number of rows in the training split.
len(train_dataset)

### Saving it to CSV

In [None]:
import csv

# Kaggle's memory limit prevents training on the full split, so the training set is capped.
TRAIN_SAMPLE_SIZE = 33000
# Fixed seed so the same subset is drawn on every run (same value as the split's random_state).
SAMPLE_SEED = 42

# Never request more rows than exist: DataFrame.sample raises ValueError when
# n exceeds the number of rows and sampling is done without replacement.
train_subset = train_dataset.sample(
    n=min(TRAIN_SAMPLE_SIZE, len(train_dataset)),
    random_state=SAMPLE_SEED,
)

# QUOTE_ALL wraps every field in quotes in the written files.
train_subset.to_csv("./preprocessed_train.csv", index=False, quoting=csv.QUOTE_ALL)
validation_dataset.to_csv("./preprocessed_val.csv", index=False, quoting=csv.QUOTE_ALL)

## Prepare a dataset for HuggingFace DistilBERT model


In [None]:
from datasets import load_dataset

# Map each split name to the CSV file written by the preprocessing step.
data_files = {
    "train": "./preprocessed_train.csv",
    "eval": "./preprocessed_val.csv",
}

# Load both CSV files into a single dataset object keyed by split name.
dataset = load_dataset('csv', data_files=data_files)

In [None]:
# Display the loaded dataset object, which holds the "train" and "eval" splits.
dataset

In [None]:
# Convert the "eval" split to a pandas DataFrame so the notebook renders it as a table.
preview_dataset = pd.DataFrame(dataset["eval"])
preview_dataset

### Importing and downloading Tokenizer and Models from HuggingFace

I have used Suyogyart/nepali-16-newsgroups-classification from huggingface.co which can be found here:
- Suyogyart/nepali-16-newsgroups-classification -> https://huggingface.co/Suyogyart/nepali-16-newsgroups-classification

In [None]:
from transformers import AutoTokenizer

# Fetch the tokenizer published alongside the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Suyogyart/nepali-16-newsgroups-classification")


def tokenize_function(examples):
    """Tokenize the "data" column of a batch of examples.

    Each text is truncated and padded to a fixed maximum length.
    """
    texts = examples["data"]
    return tokenizer(texts, padding="max_length", truncation=True)


# batched=True passes many rows to the function per call instead of one at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# preparing training and validation dataset
# Shuffle both tokenized splits with a fixed seed so the order is the same on every run.
# NOTE: this rebinds `train_dataset`, which until here held the pandas training split.

train_dataset = tokenized_datasets["train"].shuffle(seed=0)
eval_dataset = tokenized_datasets["eval"].shuffle(seed=0)

## Train

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a classification head sized to the number
# of distinct labels in our data. ignore_mismatched_sizes=True lets the load
# proceed when the checkpoint's head has a different number of outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    "Suyogyart/nepali-16-newsgroups-classification",
    ignore_mismatched_sizes=True,
    num_labels=len(initial_dataset["label"].value_counts()),
)

### Training hyperparameters

In [None]:
from transformers import TrainingArguments, Trainer

# Library defaults for every hyperparameter, except that evaluation runs once per epoch.
# NOTE(review): output_dir=None leaves the checkpoint/log directory unspecified; confirm the
# installed transformers version accepts this, otherwise pass an explicit path.
training_args = TrainingArguments(output_dir=None, evaluation_strategy="epoch")

### Metrics

In [None]:
import numpy as np
from datasets import load_metric

# Accuracy metric object; its compute() method is called by compute_metrics.
# NOTE(review): load_metric is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package; confirm against the installed version.
metric = load_metric("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy metric.

    `eval_pred` unpacks into (logits, labels). The predicted class for each
    example is the index of its largest logit along the last axis.
    """
    raw_scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold_labels,
    )

### Trainer

In [None]:
# Wire the model, hyperparameters, tokenized splits and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # shuffled tokenized "train" split
    eval_dataset=eval_dataset,  # shuffled tokenized "eval" split
    compute_metrics=compute_metrics,  # reports accuracy at each evaluation
)

In [None]:
# Run fine-tuning; evaluation_strategy="epoch" in training_args triggers an evaluation after each epoch.
trainer.train()

## Saving our model

In [None]:
# Write the model and the tokenizer into the same folder so they can be reloaded together.
save_directory = "./model-aug13-news-classifier-iter-4-33K"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


## Zipping our model

In [None]:
import shutil

# Build the archive with the standard library instead of the `zip` shell command:
# this does not depend on a `zip` binary being installed, and a re-run replaces the
# archive rather than updating an existing one in place.
# base_dir keeps the model folder as the top-level entry inside the archive.
archive_name = "model-aug13-news-classifier-iter-4-33K"
shutil.make_archive(archive_name, "zip", root_dir=".", base_dir=archive_name)

## Pushing it to an AWS S3 bucket

In [None]:
import os

import boto3

# Credentials are read from the environment so they are never stored in the notebook.
# An unset or empty variable becomes None, which lets boto3 fall back to its default
# credential lookup instead of sending an empty key.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID") or None
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY") or None

s3 = boto3.resource(
    service_name='s3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# Upload the zipped model to the registry bucket under the archive's own file name.
s3.meta.client.upload_file(
    Filename='./model-aug13-news-classifier-iter-4-33K.zip',
    Bucket="realpha-models-registry",
    Key='model-aug13-news-classifier-iter-4-33K.zip',
)