<a href="https://colab.research.google.com/github/shivendrra/SmallLanguageModel-project/blob/main/FullScaleSLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Full Scale Small Language Model
by @shivendrra

In [2]:
# Colab-only cell: google.colab is only importable inside a Colab runtime.
# files.upload() is called and whatever it returns is kept in `channelData`.
# NOTE(review): `channelData` is never referenced again in the visible notebook —
# presumably this upload exists to bring the .env file (holding `yt_secret_key`)
# into the runtime's working directory; confirm before removing.
from google.colab import files
channelData = files.upload()

In [3]:
# YouTube channel IDs to harvest captions from. Each entry is passed as the
# `id` argument of youtube.channels().list(...) by the fetch cell further down.
# NOTE(review): despite the `_Json` suffix this is a plain Python list of strings.
channel_Id_Json  = [
  "UCA19mAJURyYHbJzhfpqhpCA",
  "UCsXVk37bltHxD1rDPwtNM8Q",
  "UCRcgy6GzDeccI7dkbbBna3Q",
  "UCmGSJVG3mCRXVOP4yZrU1Dw",
  "UC415bOPUcGSamy543abLmRA",
  "UCb_MAhL8Thb3HJ_wPkH3gcw",
  "UC9RM-iSvTu1uPJb8X5yp3EQ",
  "UCR1IuLEqb6UEA_zQ81kwXfg",
  "UCYO_jab_esuFRV4b17AJtAw",
  "UCA295QVkf9O1RQ8_-s3FVXg",
  "UCqVEHtQoXHmUCfJ-9smpTSg",
  "UC4QZ_LsYcvcq7qOsOhpAX4A",
  "UCLXo7UDZvByw2ixzpQCufnA"
]

In [4]:
!pip install python-dotenv
!pip install youtube-transcript-api



In [5]:
import json
from dotenv import load_dotenv
import os

# Merge variables from a .env file (when one is present) into the process
# environment, then look up the YouTube Data API key.
# `api_key` is None when `yt_secret_key` is not set; it is later handed to
# build(..., developerKey=api_key).
load_dotenv()
api_key = os.environ.get('yt_secret_key')

In [6]:
from googleapiclient.discovery import build
from youtube_transcript_api import TranscriptsDisabled, YouTubeTranscriptApi
import logging

# Send ERROR-and-above log records to youtube_fetch.log so that per-video
# failures logged by the fetch cell do not flood the cell output.
logging.basicConfig(level=logging.ERROR, filename='youtube_fetch.log')

# YouTube Data API v3 client, authenticated with the key read from the environment.
youtube = build(serviceName='youtube', version='v3', developerKey=api_key)

In [None]:
import timeit

# playlistItems.list documents 0-50 as the acceptable range for maxResults.
PLAYLIST_PAGE_SIZE = 50
# Captions from every video are appended to this file, separated by spaces.
TRAINING_DATA_PATH = 'training_data.txt'


def fetch_upload_video_ids(channel_id):
  """Return the video IDs found in the uploads playlist of ``channel_id``.

  Uses the module-level ``youtube`` client. The channel lookup is done once,
  then the uploads playlist is paged through until no ``nextPageToken`` is
  returned.

  Args:
    channel_id: YouTube channel ID string.

  Returns:
    A list of video ID strings; empty when the API returns no data for the
    channel (the condition is logged instead of looping forever).
  """
  channelRes = youtube.channels().list(
    part='contentDetails', id=channel_id
  ).execute()

  items = channelRes.get('items')
  if not items:
    logging.error(f"No channel data returned for channel id: {channel_id}")
    return []

  playlistId = items[0]['contentDetails']['relatedPlaylists']['uploads']

  video_ids = []
  next_page_token = None
  while True:
    playlistResult = youtube.playlistItems().list(
      part='contentDetails', playlistId=playlistId,
      maxResults=PLAYLIST_PAGE_SIZE, pageToken=next_page_token
    ).execute()

    video_ids.extend(
      item['contentDetails']['videoId'] for item in playlistResult.get('items', [])
    )

    next_page_token = playlistResult.get('nextPageToken')
    if not next_page_token:
      return video_ids


def fetch_caption_lines(video_id):
  """Return the English caption texts of one video, or [] when unavailable.

  Failures are deliberately non-fatal so that one bad video does not abort a
  long crawl: disabled transcripts are printed, every other error is written
  to the log configured earlier in the notebook.

  Args:
    video_id: YouTube video ID string.

  Returns:
    A list with the ``'text'`` field of every caption entry, in order.
  """
  try:
    captions = YouTubeTranscriptApi.get_transcript(
      video_id, languages=['en'], preserve_formatting=True
    )
  except TranscriptsDisabled as e:
    print(F"There was an error while getting the captions: {e}")
    return []
  except Exception as e:
    # Best-effort crawl: record the failure and move on to the next video.
    logging.error(f"There was some error while fetching the video: {str(e)}")
    return []

  return [caption['text'] for caption in captions or []]


start_time = timeit.default_timer()

videoNo = 0
# The file is opened once in append mode; re-running the cell adds to
# whatever is already in TRAINING_DATA_PATH.
with open(TRAINING_DATA_PATH, 'a', encoding='utf-8') as file:
  for links in channel_Id_Json:
    for ids in fetch_upload_video_ids(links):
      caption_lines = fetch_caption_lines(ids)
      if not caption_lines:
        continue

      videoNo += 1
      print(f"Number of videos with valid captions are: {videoNo}")

      # Same layout as before: every caption text followed by a single space.
      file.write(' '.join(caption_lines) + ' ')
      # Flush per video so an interrupted run keeps what was already fetched.
      file.flush()

# default_timer() measures seconds; convert so the "mins" label is accurate.
elapsed_minutes = (timeit.default_timer() - start_time) / 60
print(f"time taken to execute the code is {elapsed_minutes} mins")

Number of videos with valid captions are: 1653
Number of videos with valid captions are: 1654
Number of videos with valid captions are: 1655


In [None]:
# Load the whole scraped caption corpus back into memory as a single string.
with open('training_data.txt', mode='r', encoding='utf-8') as corpus_file:
  captions = corpus_file.read()

In [None]:
# NLTK setup: download the 'punkt' and 'wordnet' resources, then create the
# lemmatizer and stemmer instances.
# NOTE(review): `ps` (PorterStemmer) is not used by any visible cell.
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer

for resource_name in ('punkt', 'wordnet'):
  nltk.download(resource_name)

lm = WordNetLemmatizer()
ps = PorterStemmer()

In [None]:
# Tokenize the corpus, keep purely alphabetic tokens, lower-case and lemmatize them.
tokens = nltk.word_tokenize(captions)
lm = WordNetLemmatizer()

lemmatized_tokens = []
for token in tokens:
  if not token.isalpha():
    # Drops punctuation, numbers and mixed tokens.
    continue
  lemmatized_tokens.append(lm.lemmatize(token.lower()))

In [None]:
# Convert the lemmatized tokens back into one space-separated string; the
# TF-IDF cell below passes this string to the vectorizer as its only document.
lemmatized_text = ' '.join(lemmatized_tokens)

In [None]:
# Apply TF-IDF to the lemmatized corpus and densify the result with .toarray().
# NOTE(review): fit_transform is given a one-element list, i.e. the entire
# corpus is treated as a single document, so the matrix has a single row.
# With only one document the IDF term cannot distinguish between words, and
# the CSV written from this matrix holds just one sample for training —
# confirm this is intended, or split the corpus into several documents first.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([lemmatized_text]).toarray()

In [None]:
# Convert the TF-IDF vectors to a DataFrame and save them as .csv
import pandas as pd

vector_array = pd.DataFrame(tfidf_matrix)
# index=False keeps the row index out of the file. The training cell reads the
# file with a plain pd.read_csv and treats every column but the last as an
# input feature, so a written index would be picked up as a bogus feature.
vector_array.to_csv('vector_data.csv', index=False)

print('data written to .csv file successfully!!')

# `timeit` and `start_time` come from the caption-fetch cell, which must have
# run in this kernel. default_timer() measures seconds; convert so the "mins"
# label is accurate.
elapsed_minutes = (timeit.default_timer() - start_time) / 60
print(f"Data vectorized in : {elapsed_minutes} mins")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import time

# Wall-clock reference for the elapsed-time report printed after training.
# This rebinds `start_time`, which the caption-fetch cell set from
# timeit.default_timer(); the two clocks are not comparable.
start_time = time.time()
# Define CustomDataset
class CustomDataset(Dataset):
    """Dataset over a numeric CSV: all columns but the last are the input, the last is the target.

    The file is read once and converted to a single float32 tensor up front,
    so indexing is a cheap tensor slice instead of a pandas row extraction on
    every access.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Raises:
        ValueError: If the file contains values that cannot be converted to
            float32 (raised when the dataset is constructed).
    """

    def __init__(self, csv_file):
        # The DataFrame is kept as a public attribute for inspection.
        self.data = pd.read_csv(csv_file)
        # One-off conversion; holds a float32 copy of the whole table.
        self._values = torch.tensor(self.data.to_numpy(dtype='float32'))

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'input': FloatTensor[n_cols - 1], 'target': FloatTensor[1]}`` for row ``idx``."""
        row = self._values[idx]
        # clone() hands out independent tensors so callers cannot modify the cached table.
        sample = {'input': row[:-1].clone(),
                  'target': row[-1:].clone()}
        return sample

# Define the Transformer model
class TransformerModel(nn.Module):
    """Linear embedding -> nn.Transformer -> linear head, one prediction per sample.

    Each input vector is embedded and treated as a sequence of length 1; the
    same embedded tensor is used as both source and target of the transformer.

    Args:
        input_size: Number of features in each input vector.
        output_size: Number of values predicted per sample.
        d_model: Width of the embedding / transformer.
        nhead: Number of attention heads (``d_model`` must be divisible by it).
        num_layers: Number of layers used for BOTH the encoder and the decoder.
    """

    def __init__(self, input_size, output_size, d_model=64, nhead=2, num_layers=2):
        super().__init__()
        self.embedding = nn.Linear(input_size, d_model)
        # Keywords are required here: the third positional parameter of
        # nn.Transformer is num_encoder_layers only, which would leave the
        # decoder at its default depth regardless of `num_layers`.
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
        )
        self.fc = nn.Linear(d_model, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, input_size) to predictions of shape (batch, output_size)."""
        x = self.embedding(x)                 # (batch, d_model)

        # nn.Transformer defaults to (sequence, batch, d_model); add a length-1 sequence axis.
        x = x.unsqueeze(0)                    # (1, batch, d_model)

        x = self.transformer(x, x)            # (1, batch, d_model)

        # Average over the sequence axis while it is still axis 0. Squeezing it
        # first and then averaging over axis 0 would average over the batch and
        # collapse all samples into a single prediction.
        x = x.mean(dim=0)                     # (batch, d_model)

        return self.fc(x)                     # (batch, output_size)

# Initialize dataset and DataLoader
dataset = CustomDataset('vector_data.csv')
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, sizing it from the first sample so it always matches the CSV
first_sample = dataset[0]
input_size = first_sample['input'].shape[0]
output_size = first_sample['target'].shape[0]
model = TransformerModel(input_size, output_size)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 11
losses = []  # Mean loss of each epoch; plotted by the next cell

for epoch in range(num_epochs):
    epoch_losses = []  # Per-batch losses of the current epoch

    for batch in dataloader:
        inputs, targets = batch['input'], batch['target']

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.item())

    average_epoch_loss = np.mean(epoch_losses)
    losses.append(average_epoch_loss)

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_epoch_loss}')

# Elapsed wall-clock seconds since `start_time` was set at the top of this cell.
# The operands were previously swapped, which reported a negative duration.
# The variable name is kept as-is even though it holds a duration.
end_time = time.time() - start_time
print('\n', f"Code executed in {end_time / 60} mins")
# Save the trained model
torch.save(model.state_dict(), 'transformer_model.pth')

In [None]:
import matplotlib.pyplot as plt

# Visualize training loss: one point per epoch, using the `losses` list and
# `num_epochs` defined by the training cell.
epoch_numbers = range(1, num_epochs + 1)

fig, ax = plt.subplots()
ax.plot(epoch_numbers, losses)
ax.set_title('Training Loss Over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(False)
plt.show()