# Data processing and training 

##### Installing libraries and importing packages

In [6]:
!pip install accelerate==0.17.0
!pip install transformers==4.28.0
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate==0.17.0
  Downloading accelerate-0.17.0-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.8/212.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.17.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (f

In [7]:
# install sentencepiece
!pip install "sentencepiece==0.1.96"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece==0.1.96
  Downloading sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [9]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import pandas as pd
import tensorflow as tf
import sentencepiece

In [11]:
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from sklearn.metrics import mean_squared_error

In [12]:
import evaluate

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split


In [2]:
# Colab-only: mount Google Drive so the notebook can reach files under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/Colab Notebooks/yt_analysis

/content/drive/MyDrive/Colab Notebooks/yt_analysis


In [None]:
!pwd

/content/drive/MyDrive/Colab Notebooks/yt_analysis


In [None]:
# Load the raw dataset; the relative path is resolved against the current
# working directory (changed with %cd in an earlier cell).
data = pd.read_csv('data_views.csv')

In [None]:
data.head()

Unnamed: 0,title,views,tags,description,view_group
0,youtube channel 202004202304,96,0,,"(79.0, 186.0]"
1,100 youtube channel 202304,66,mongolian youtube channel,100 youtube channel,"(-0.1, 79.0]"
2,mongolian youtube channel top5,318,mongolian youtube channel,youtube,"(306.0, 455.0]"
3,2x2,2248,,2x2,"(2224.3, 3524.0]"
4,top 25,281,mongol tik tok mongolia tik tok,top list mongolia 3 tik tok mongolia,"(186.0, 306.0]"


In [None]:
# Rename to the column names the rest of the notebook uses: the tokenizer
# function reads "text"; "labels" holds the view count to predict
# (presumably the key the Trainer expects for targets -- verify).
data = data.rename(columns={'title':'text', 'views':'labels'})

In [None]:
# Keep only the model input and the target. The explicit .copy() makes this an
# independent frame, so the later column assignments / row drops operate on it
# directly and cannot trigger pandas' SettingWithCopyWarning.
data = data[['text', 'labels']].copy()

In [None]:
# Drop rows with a missing title or view count. Rebinding (instead of
# inplace=True) avoids mutating a frame derived from a column selection, which
# can raise SettingWithCopyWarning.
data = data.dropna()

In [None]:
data.head()

Unnamed: 0,text,labels
0,youtube channel 202004202304,96
1,100 youtube channel 202304,66
2,mongolian youtube channel top5,318
3,2x2,2248
4,top 25,281


In [None]:
# Cast the target to float (data.info() below reports float64).
# NOTE(review): targets stay on the raw view-count scale; the recorded training
# loss is ~9.5e12, so a transformed target (e.g. log) may be worth evaluating.
data['labels'] = data['labels'].astype(float)

# Splitting dataset

In [None]:
# Convert the DataFrame to a Hugging Face Dataset; preserve_index=False keeps
# the pandas index out of the features (only 'text' and 'labels' remain).
dataset = Dataset.from_pandas(data, preserve_index=False)

In [None]:
# 70/30 train/test split. A fixed seed makes the shuffle -- and therefore every
# metric reported further down -- reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 85772
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 36760
    })
})

## Loading the pretrained 'distilbert-base-uncased' model and tokenizer

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122532 entries, 0 to 127225
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   text    122532 non-null  object 
 1   labels  122532 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.8+ MB


In [None]:
# Tokenizer for the same checkpoint that is loaded as the model further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of the given examples.

    Every sequence is truncated and padded with ``padding="max_length"``, so
    all encoded examples come out the same length.
    NOTE(review): with no explicit ``max_length`` this presumably pads each
    title to the tokenizer's model maximum -- confirm this cost is intended.
    """
    titles = examples["text"]
    return tokenizer(titles, truncation=True, padding="max_length")


# batched=True passes groups of rows to tokenize_function in a single call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/85772 [00:00<?, ? examples/s]

Map:   0%|          | 0/36760 [00:00<?, ? examples/s]

In [None]:
# Load DistilBERT with a freshly initialised sequence-classification head.
# num_labels=1 gives a single output, used in this notebook as a regressor for
# the view count (compute_metrics reports RMSE).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [None]:
# Align the embedding matrix with the tokenizer's vocabulary size.
# NOTE(review): no tokens are added to the tokenizer in this notebook and the
# recorded output shows the stock 30522-row embedding, so this is presumably a
# no-op -- confirm before removing.
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [None]:
# NOTE(review): `metric` is not referenced by any other cell visible in this
# notebook; compute_metrics reports RMSE via scikit-learn instead. "accuracy"
# is a classification metric and does not match the single-output regression
# setup -- confirm whether this cell is still needed.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Return the root-mean-squared error for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair as supplied by the
            ``Trainer``. Both are flattened to 1-D before comparison, so a
            single-output prediction array of shape ``(N, 1)`` lines up with
            labels of shape ``(N,)``.

    Returns:
        dict: ``{"rmse": <float>}``.
    """
    # Local import keeps this cell independent of import-cell ordering.
    import numpy as np

    predictions, labels = eval_pred
    # Flatten explicitly: subtracting (N, 1) from (N,) would otherwise
    # broadcast to an (N, N) matrix and silently give the wrong error.
    preds = np.asarray(predictions, dtype=np.float64).reshape(-1)
    targets = np.asarray(labels, dtype=np.float64).reshape(-1)
    # Computed directly rather than via mean_squared_error(..., squared=False),
    # whose `squared` argument is deprecated/removed in newer scikit-learn.
    rmse = float(np.sqrt(np.mean((preds - targets) ** 2)))
    return {"rmse": rmse}

In [None]:
# Training configuration: three epochs, batches of 16 for both training and
# evaluation, with logging and evaluation once per epoch. Checkpoint saving is
# switched off (save_strategy="no"); the model is persisted manually later.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    save_total_limit=2,
    load_best_model_at_end=False,
)

In [None]:
# Wire the model, the training configuration, both tokenized splits and the
# RMSE metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
trainer.train()




Epoch,Training Loss,Validation Loss,Rmse
1,9497593932266.129,22405336530944.0,4733427.5
2,9492860735701.166,22399516934144.0,4732813.0


Epoch,Training Loss,Validation Loss,Rmse
1,9497593932266.129,22405336530944.0,4733427.5
2,9492860735701.166,22399516934144.0,4732813.0
3,9489265973855.027,22396754984960.0,4732521.0


TrainOutput(global_step=16083, training_loss=9493240213940.773, metrics={'train_runtime': 13122.5112, 'train_samples_per_second': 19.609, 'train_steps_per_second': 1.226, 'total_flos': 3.408537327748301e+16, 'train_loss': 9493240213940.773, 'epoch': 3.0})

### Save model

In [26]:
from transformers import AutoModelForTokenClassification

# Persist the fine-tuned tokenizer and model to the local "tokenizer" and
# "model" directories.
tokenizer.save_pretrained("tokenizer")
model.save_pretrained("model")

# Read both artefacts straight back from disk, rebinding `tokenizer` and
# `model` to the reloaded objects.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
model = AutoModelForSequenceClassification.from_pretrained("model")

In [15]:
from transformers import AutoModelForTokenClassification

# Restore the saved tokenizer and model from the "tokenizer" and "model"
# directories written by save_pretrained.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
model = AutoModelForSequenceClassification.from_pretrained("model")

In [22]:
from transformers import Trainer

# Inference uses the `model` / `tokenizer` restored with from_pretrained in the
# preceding cell. This cell previously referenced `pickled_model` and
# `tokenizer_pkl`, which are only created by cells further down the notebook,
# so it raised NameError on a fresh top-to-bottom run.
trainer = Trainer(model=model)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field, truncating and padding to max length.

    Redefines the helper of the same name from the training section so that
    this cell is self-contained for inference.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


def pipeline_prediction(text):
    """Predict the label (view count) for a single title.

    Args:
        text: The title string to score.

    Returns:
        The model's single raw output for ``text`` (first row, first column of
        the prediction array returned by ``trainer.predict``).
    """
    single_row = pd.DataFrame({'text': [text]})
    single_dataset = Dataset.from_pandas(single_row, preserve_index=False)
    tokenized = single_dataset.map(tokenize_function)
    raw_pred, _, _ = trainer.predict(tokenized)
    return raw_pred[0][0]

In [23]:
pipeline_prediction("How to make cheeseburger")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

16844.059

In [None]:
# Pickle the model. A context manager guarantees the file is flushed and closed
# (the bare open() used before left the handle to the garbage collector).
with open('model.pkl', 'wb') as handle:
    pickle.dump(model, handle)
# saving
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore the pickled tokenizer.
# NOTE: pickle.load can execute arbitrary code -- only load files this notebook
# wrote itself.
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer_pkl = pickle.load(handle)

In [None]:
# Restore the pickled model; the context manager closes the file handle that
# the previous bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code -- only load files this notebook
# wrote itself.
with open('model.pkl', 'rb') as handle:
    pickled_model = pickle.load(handle)