# A Data-Driven YouTube Title Ranking System for Content Creators

# Importing essential libraries and data

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np
import torch

# Read the YouTube dataset (CSV) into a DataFrame; later cells use its
# 'Title' and 'Views' columns.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Data for stats and DM/Youtube_Data_New.csv"
)

# Text Embedding

In [None]:
import numpy as np
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Code to encode text using BERT model
# Tokenizer/model pairs keyed by checkpoint name. Loading BERT weights is by
# far the most expensive step, so it is done once per checkpoint instead of
# once per encoded title.
_BERT_CACHE = {}


def encode_text_with_bert(text, model='bert-base-uncased'):
    """Encode ``text`` with a pretrained BERT model.

    Args:
        text: The text to encode (passed straight to the tokenizer).
        model: Name or path of the pretrained checkpoint used for both the
            tokenizer and the model.

    Returns:
        A NumPy array holding the model's ``last_hidden_state`` for ``text``.
    """
    if model not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model)
        model_1 = BertModel.from_pretrained(model)
        # Pure feature extraction: make sure dropout stays disabled.
        model_1.eval()
        _BERT_CACHE[model] = (tokenizer, model_1)
    tokenizer, model_1 = _BERT_CACHE[model]

    # truncation=True keeps over-long input within the model's maximum
    # sequence length instead of failing inside the forward pass.
    t = tokenizer(text, return_tensors='pt', truncation=True)

    # No gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        val = model_1(**t)
    embeddings = val.last_hidden_state.numpy()
    return embeddings


# Encode every title, collecting one embedding array per title.
Embeddings = []

for title in tqdm(data['Title'], desc='Processing Titles', unit='title'):
    Embeddings.append(encode_text_with_bert(title))

# Titles are tokenized without padding, so the per-title arrays generally have
# different token counts. np.array() cannot stack such ragged input and raises
# ValueError -- after the whole dataset has already been encoded. Stack when
# the shapes happen to agree, otherwise keep the arrays in a 1-D object array
# (one embedding array per slot).
try:
    Embeddings = np.array(Embeddings)
except ValueError:
    ragged_embeddings = np.empty(len(Embeddings), dtype=object)
    # Element-wise assignment: a bulk assignment would make NumPy attempt the
    # same failing conversion again.
    for position, title_embedding in enumerate(Embeddings):
        ragged_embeddings[position] = title_embedding
    Embeddings = ragged_embeddings

In [None]:
from tqdm import tqdm
# Re-encode all titles, this time keeping the per-title arrays in a plain list.
Embeddings = []
title_progress = tqdm(data['Title'], desc='Processing Titles', unit='title')
for title in title_progress:
    Embeddings.append(encode_text_with_bert(title))

# Regression model

## Normalization

In [None]:
# Disabled reference code: everything between the triple quotes is a plain
# string literal, so none of it runs. It shows the two target-normalisation
# variants for the 'Views' column (z-score via StandardScaler and min-max
# scaling); the regression experiment cells further down define and use the
# live versions.
'''
# Z scaled
scaler = StandardScaler()
k=np.array(data['Views'])
views_new = k.reshape(-1, 1)
views_z_score = scaler.fit_transform(views_new).flatten()

# Min-Max normalization
def min_max_scaling(val):
    min_val = np.min(val)
    max_val = np.max(val)
    scaled_data = (val-min_val)/(max_val-min_val)
    return scaled_data

views_minmax_score = min_max_scaling(data['Views'])
'''

## Regression Experiment with Z-score scaling

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer and the model must come from the same checkpoint: token ids
# produced with a different vocabulary would look up unrelated rows of the
# model's embedding table.
checkpoint = "bert-base-uncased"

data = pd.read_csv("Youtube_Data_New.csv")
tokenizer = BertTokenizer.from_pretrained(checkpoint)
titles = list(data['Title'])

# Z-score scaling of the regression target (view counts).
# NOTE(review): the scaler is fitted on all rows, including those that end up
# in the validation split below.
scaler = StandardScaler()
k = np.array(data['Views'])
views_new = k.reshape(-1, 1)
views_z_score = scaler.fit_transform(views_new).flatten()

# Tokenize all titles in one call, padded to a common length
encoding = tokenizer(titles, return_tensors="pt", padding=True, truncation=True)
# Create the targets as float32 up front so that they match the model's
# float32 outputs in both the training and the validation loop.
labels = torch.tensor(views_z_score, dtype=torch.float32, device=device).view(-1, 1)

# The whole dataset is placed on `device`, so batches need no further transfer
data = TensorDataset(encoding["input_ids"].to(device), encoding["attention_mask"].to(device), labels)

train_size = int(0.8 * len(data))
val_size = len(data) - train_size
train_data, val_data = random_split(data, [train_size, val_size])

# DataLoader to form train and validation datasets
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=32, shuffle=False)

# With num_labels=1 the sequence-classification head acts as a regressor and
# the loss returned by the model is a mean-squared error.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
model = model.to(device)  # Move the model to GPU

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(10):  # Number of training epochs
    print("EPOCH: ",epoch)
    # from_pretrained() returns the model in evaluation mode; switch to
    # training mode so dropout is active while fine-tuning.
    model.train()
    for batch in train_dataloader:
        inputs, masks, labels = batch
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Validation: average of the per-batch losses on the held-out split
model.eval()
val_los = []

with torch.no_grad():
    for batch in val_dataloader:
        inputs, masks, labels = batch
        outputs = model(inputs, attention_mask=masks, labels=labels)
        val_los.append(outputs.loss.item())

mean_val_loss = sum(val_los) / len(val_los)
print(f'Mean Validation Loss: {mean_val_loss}')


model.eval()
# Example: Predictions for the first batch in the validation set
with torch.no_grad():
    inputs, masks, _ = next(iter(val_dataloader))
    predictions = model(inputs, attention_mask=masks).logits

print("Predictions:", predictions)

## Regression Experiment with Min-Max scaling

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data = pd.read_csv("Youtube_Data_New.csv")
# Use the vocabulary of the checkpoint that this experiment fine-tunes
# ("bert-base-uncased"). Token ids produced with the "bert-large-cased"
# vocabulary would look up unrelated rows of that model's embedding table.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
titles = list(data['Title'])

# Min-max scaled
def min_max_scaling(val):
    """Rescale ``val`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        val: Array-like of numbers (e.g. a NumPy array or pandas Series).

    Returns:
        The scaled values, in the same container type that the arithmetic on
        ``val`` produces. When every value is identical the result is all
        zeros instead of the NaNs a 0/0 division would give.
    """
    min_val = np.min(val)
    max_val = np.max(val)
    value_range = max_val - min_val
    if value_range == 0:
        # Constant input: (val - min_val) is already all zeros, so divide by 1
        # to avoid 0/0 while still returning floating-point values.
        value_range = 1
    scaled_data = (val - min_val) / value_range
    return scaled_data

views_minmax_score = min_max_scaling(data['Views'])

# Tokenize all titles in one call, padded to a common length
encoding = tokenizer(titles, return_tensors="pt", padding=True, truncation=True)
# Hand torch a plain float32 NumPy array instead of the pandas Series, so the
# targets match the model's float32 outputs in both the training and the
# validation loop.
labels = torch.tensor(np.asarray(views_minmax_score, dtype=np.float32), device=device).view(-1, 1)

# The whole dataset is placed on `device`, so batches need no further transfer
data = TensorDataset(encoding["input_ids"].to(device), encoding["attention_mask"].to(device), labels)

train_size = int(0.8 * len(data))
val_size = len(data) - train_size
train_data, val_data = random_split(data, [train_size, val_size])

# DataLoader to form train and validation datasets
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=32, shuffle=False)

# With num_labels=1 the sequence-classification head acts as a regressor and
# the loss returned by the model is a mean-squared error.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
model = model.to(device)  # Move the model to GPU

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(10):  # Number of training epochs
    print("EPOCH: ",epoch)
    # from_pretrained() returns the model in evaluation mode; switch to
    # training mode so dropout is active while fine-tuning.
    model.train()
    for batch in train_dataloader:
        inputs, masks, labels = batch
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Validation: average of the per-batch losses on the held-out split
model.eval()
val_los = []

with torch.no_grad():
    for batch in val_dataloader:
        inputs, masks, labels = batch
        outputs = model(inputs, attention_mask=masks, labels=labels)
        val_los.append(outputs.loss.item())

mean_val_loss = sum(val_los) / len(val_los)
print(f'Mean Validation Loss: {mean_val_loss}')


model.eval()
# Example: Predictions for the first batch in the validation set
with torch.no_grad():
    inputs, masks, _ = next(iter(val_dataloader))
    predictions = model(inputs, attention_mask=masks).logits

print("Predictions:", predictions)

# BERT for classification

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.optim.lr_scheduler import StepLR

# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Pre-processing data for classification
data = pd.read_csv("/content/drive/MyDrive/Data for stats and DM/Youtube_Data_New.csv")
bins = [0, 1000, 10000, 100000, 500000]  # Adjusted bins
category_names = ['Week title (0-1000 views)', 'Could be better (1000-10000 views)', 'Good title (10000-100000 views)', 'Very strong title (100000-500000 views)']
# NOTE(review): view counts above the last bin edge (500000) fall outside
# every bin and receive a missing category -- confirm the dataset has none.
data['views_category'] = pd.cut(data['Views'], bins=bins, labels=category_names, include_lowest=True)


# NOTE(review): the tokenizer vocabulary must match the checkpoint loaded from
# Drive below -- confirm that checkpoint is based on "bert-large-cased".
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

# Inputs
texts = list(data['Title'])
encoding = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

# Outputs: integer class ids derived from the view-count categories
views_categories = data['views_category']
label_encoder = LabelEncoder()
encoded_data = label_encoder.fit_transform(views_categories)
labels = torch.tensor(encoded_data, device=device).view(-1)

# The whole dataset is placed on `device`, so batches need no further transfer
data = TensorDataset(encoding["input_ids"].to(device), encoding["attention_mask"].to(device), labels)

train_size = int(0.9*len(data))
val_size = len(data)-train_size
train_dataset, val_dataset = random_split(data, [train_size, val_size])

# DataLoader without pin_memory when working with GPU tensors
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/Data_mining_model", num_labels=len(labels.unique()))
model = model.to(device)  # Move the model to GPU

optimizer = AdamW(model.parameters(), lr=5e-5)

cross_entropy = torch.nn.CrossEntropyLoss()

# Lists to store training and validation scores for each epoch
train_accuracies = []
train_precisions = []
train_recalls = []
train_f1_scores = []
val_accuracies = []
val_precisions = []
val_recalls = []
val_f1_scores = []

for epoch in range(5):  # Number of training epochs
    print("EPOCH: ", epoch)
    model.train()
    train_losses = []

    true_train = []
    predicted_train = []

    for batch in train_dataloader:
        inputs, masks, targets = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        loss = cross_entropy(logits, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        true_train.extend(targets.cpu().numpy())

        # Predicted class = index of the largest logit (softmax is monotonic,
        # so it would not change which index wins).
        predicted_classes_train = logits.argmax(dim=1)
        predicted_train.extend(predicted_classes_train.cpu().numpy())

    train_accuracy = accuracy_score(true_train, predicted_train)
    # zero_division=0 keeps the score of a never-predicted class at 0 (the
    # library default) without emitting a warning every epoch.
    train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(true_train, predicted_train, average='weighted', zero_division=0)
    train_accuracies.append(train_accuracy)
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1_scores.append(train_f1)

    model.eval()
    val_losses = []
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            # `targets` (not `labels`) so the full label tensor built above is
            # not overwritten by a single validation batch.
            inputs, masks, targets = batch
            outputs = model(inputs, attention_mask=masks)
            logits = outputs.logits
            # Record the validation loss so it can be compared with the
            # training loss reported below.
            val_losses.append(cross_entropy(logits, targets).item())
            predicted_classes = logits.argmax(dim=1)
            true_labels.extend(targets.cpu().numpy())
            predicted_labels.extend(predicted_classes.cpu().numpy())

    val_accuracy = accuracy_score(true_labels, predicted_labels)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted', zero_division=0)
    val_accuracies.append(val_accuracy)
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1_scores.append(val_f1)

    print(f'Training Loss for epoch {epoch + 1} is: {sum(train_losses) / len(train_losses):.4f}')
    print(f'Training Accuracy for epoch {epoch + 1} is: {train_accuracy:.4f}')
    print(f'Training Precision for epoch {epoch + 1} is: {train_precision:.4f}')
    print(f'Training Recall for epoch {epoch + 1} is: {train_recall:.4f}')
    print(f'Training F1 Score for epoch {epoch + 1} is: {train_f1:.4f}')

    print(f'Validation Loss for epoch {epoch + 1} is: {sum(val_losses) / len(val_losses):.4f}')
    print(f'Validation Accuracy for epoch {epoch + 1} is: {val_accuracy:.4f}')
    print(f'Validation Precision for epoch {epoch + 1} is: {val_precision:.4f}')
    print(f'Validation Recall for epoch {epoch + 1} is: {val_recall:.4f}')
    print(f'Validation F1 Score for epoch {epoch + 1} is: {val_f1:.4f}')
