# **Get data**

In [None]:
# Create the Kaggle config directory; -p makes the cell safe to re-run when it already exists.
! mkdir -p ~/.kaggle

In [None]:
# Restrict the Kaggle API token to owner read/write.
# NOTE(review): the recorded output shows ~/.kaggle/kaggle.json did not exist when this
# cell ran, so the command is guarded and reports the missing file instead of failing.
# TODO confirm how credentials are supplied in this environment.
! test -f ~/.kaggle/kaggle.json && chmod 600 ~/.kaggle/kaggle.json || echo "kaggle.json not found in ~/.kaggle - upload your Kaggle API token there first"

chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [None]:
# Download the dataset archive with the Kaggle CLI (the recorded run saved it to /content).
!kaggle datasets download narayan63/netflix-popular-movies-dataset

Dataset URL: https://www.kaggle.com/datasets/narayan63/netflix-popular-movies-dataset
License(s): CC0-1.0
Downloading netflix-popular-movies-dataset.zip to /content
  0% 0.00/1.17M [00:00<?, ?B/s]
100% 1.17M/1.17M [00:00<00:00, 82.5MB/s]


In [None]:
# Extract n_movies.csv from the downloaded archive.
# -o overwrites existing files without asking, so re-running the cell does not block on a prompt.
! unzip -o netflix-popular-movies-dataset.zip

Archive:  netflix-popular-movies-dataset.zip
  inflating: n_movies.csv            


# *Imports*

In [None]:
import numpy as np
import pandas as pd
import os
import re
import random
import json
import string
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from pathlib import Path

# Data preparation

In [None]:
# Load the table extracted from the archive (n_movies.csv) into a DataFrame.
df = pd.read_csv('n_movies.csv')

In [None]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413


In [None]:
# (rows, columns) of the raw table.
df.shape

(9957, 9)

In [None]:
# Column dtypes and non-null counts of the raw table (shows which columns have missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  6504 non-null   object 
 3   duration     7921 non-null   object 
 4   genre        9884 non-null   object 
 5   rating       8784 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        8784 non-null   object 
dtypes: float64(1), object(8)
memory usage: 700.2+ KB


In [None]:
# Keep only the columns used in the rest of the notebook.
# .copy() yields an independent frame, so the column assignments in the following
# cells do not trigger pandas' SettingWithCopyWarning.
df = df[['title', 'year', 'genre', 'description', 'stars']].copy()

In [None]:
# Re-check dtypes and non-null counts after the column selection.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        9957 non-null   object
 1   year         9430 non-null   object
 2   genre        9884 non-null   object
 3   description  9957 non-null   object
 4   stars        9957 non-null   object
dtypes: object(5)
memory usage: 389.1+ KB


In [None]:
# Replace missing 'year' and 'genre' values with empty strings so that the
# string-based preprocessing below can be applied to every row.
df = df.fillna({'year': '', 'genre': ''})

In [None]:
# Confirm that no column has missing values after the fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        9957 non-null   object
 1   year         9957 non-null   object
 2   genre        9957 non-null   object
 3   description  9957 non-null   object
 4   stars        9957 non-null   object
dtypes: object(5)
memory usage: 389.1+ KB


# Preprocessing

## Data harmonization

In [None]:
def preproc_stars(stars):
    """Turn the stringified list of actors into a single readable line.

    Args:
        stars: Raw cell value, e.g. "['Ralph Macchio, ', 'William Zabka, ']".

    Returns:
        A string of the form 'Stars: Name One, Name Two'. When no names are
        found the result is 'Stars: '.

    Note:
        Every single and double quote is removed, so apostrophes inside names
        are dropped as well.
    """
    # Remove the list brackets and single quotes left by the stringified list.
    stars = re.sub(r"[\[\]']", '', stars)
    # Clean each comma-separated piece; strip after removing quotes so that no
    # whitespace that was enclosed by the quotes survives.
    names = (re.sub(r'''["',]''', '', act).strip() for act in stars.split(','))
    # Empty pieces come from the trailing ", " inside each original element.
    return 'Stars: ' + ', '.join(name for name in names if name)


def preproc_year(year):
    """Extract the first four-digit year from a raw year label.

    Args:
        year: Raw label such as '(2018– )' or '(2015–2022)'.

    Returns:
        The first run of four digits as a string, or ``np.nan`` when the label
        contains no such run or is not a string.
    """
    # Non-string input (e.g. a missing value that was not filled) has no year.
    if not isinstance(year, str):
        return np.nan
    match = re.search(r'\d{4}', year)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return match.group(0) if match else np.nan


# Parse the year; rows without a recognisable four-digit year are dropped.
df['clean_year'] = df['year'].apply(preproc_year)
# .copy() so the assignments below write to an independent frame instead of a
# filtered view of the previous one (avoids SettingWithCopyWarning).
df = df[df['clean_year'].notna()].copy()
df['clean_stars'] = df['stars'].apply(preproc_stars)
df['clean_title'] = df['title'].str.lower()

NameError: name 'df' is not defined

In [None]:
# NOTE: the 'text' column is created by the "Clean texts" cell further down
# (df['text'] = ...), so that cell has to be executed before this selection.
# .copy() makes prep_df independent of df for the in-place edits that follow.
prep_df = df[['clean_title', 'clean_stars', 'clean_year', 'genre', 'text']].copy()

NameError: name 'df' is not defined

## One-hot encodings for "genre"

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split the comma-separated genre string into a list of names. Blank pieces are
# skipped so that rows whose genre was missing (filled with '') yield an empty
# list instead of producing a column with an empty name.
prep_df['genre'] = prep_df['genre'].apply(
    lambda elem: [gen.strip() for gen in elem.split(',') if gen.strip()]
)

mlb = MultiLabelBinarizer()

# One 0/1 column per genre; indexed by prep_df itself so the join below lines up
# even if prep_df and df ever stop sharing the same index.
df_genres = pd.DataFrame(
    mlb.fit_transform(prep_df['genre']),
    columns=mlb.classes_,
    index=prep_df.index,
)
prep_df = prep_df.drop('genre', axis=1).join(df_genres)

## Clean texts

In [None]:
def preproc_desc(text):
    """Normalise a description: mask URL-like tokens, drop markup, lowercase.

    Steps, in order:
      1. If the text contains 'www.', 'http:', 'https:' or '.com', every run of
         non-space characters that ends in a dot plus three lowercase letters
         is replaced by the word 'url' (anything after that ending stays).
      2. Anything between '<' and '>' is removed.
      3. Anything between '(' and ')' is removed.
      4. Each run of whitespace is collapsed to one space.

    Returns:
        The cleaned text in lower case (leading/trailing spaces are kept).
    """
    url_markers = ('www.', 'http:', 'https:', '.com')
    if any(marker in text for marker in url_markers):
        text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "url", text)

    # Remaining substitutions are order-sensitive: tags, parentheses, whitespace.
    for pattern, replacement in ((r'<.*?>', ''), (r'\(.*?\)', ''), (r'\s+', ' ')):
        text = re.sub(pattern, replacement, text)

    return text.lower()

# Cleaned description for every row. The 'text' column is what the earlier
# prep_df selection (Data harmonization section) reads from df.
df['text'] = df['description'].map(preproc_desc)

In [None]:
# Keep the first row for each distinct cleaned description and renumber the index from 0.
prep_df = prep_df.drop_duplicates(subset='text', ignore_index=True)

NameError: name 'prep_df' is not defined

In [None]:
# Target text: the cleaned description followed by the cast line, "<text>. Stars: ...".
# Concatenating the two columns keeps rows aligned by index label, so the result
# does not depend on prep_df having a 0..n-1 index.
prep_df['full_text'] = prep_df['text'] + '. ' + prep_df['clean_stars']

In [None]:
# 'text' and 'clean_stars' are now contained in 'full_text'; remove the originals.
prep_df = prep_df.drop(columns=['text', 'clean_stars'])

In [None]:
# Inspect the columns and dtypes of the prepared frame.
prep_df.info()

## Make input and output columns

In [None]:
# The EOS token is needed to terminate both parts of every training line. The
# tokenizer is loaded here because the cell that otherwise creates `tokenizer`
# ("Get model") comes later in the notebook.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
eos_token = tokenizer.eos_token

# Every column except the target is a conditioning field. 'input' and 'output'
# are excluded as well so that re-running this cell does not fold the previous
# result back into the prompt.
input_columns = [name for name in prep_df.columns if name not in ('full_text', 'input', 'output')]

# Conditioning part: " <column> value" for each field, then the EOS token.
prep_df['input'] = prep_df.apply(
    lambda row: ' '.join([f" <{name}> {row[name]}" for name in input_columns]) + ' ' + f"{eos_token}",
    axis=1)
# Target part: the description and cast line, tagged and terminated by EOS.
prep_df['output'] = prep_df.apply(lambda row: f" <full_text> {row['full_text']} {eos_token}", axis=1)

train_data = prep_df[['input', 'output']].copy()

## Write training data to a text file

In [None]:
data_path = Path("/content/")
dataset_path = data_path / 'data.txt'

# Number of rows written to the training file; None writes every row.
# NOTE(review): 16 reproduces the original hard-coded cut-off - confirm whether
# training on such a small sample is intended.
MAX_SAMPLES = 16

rows_to_write = train_data if MAX_SAMPLES is None else train_data.iloc[:MAX_SAMPLES]

# Mode 'w' (not 'a') so re-running the cell replaces the file instead of
# appending the same lines again.
with dataset_path.open('w', encoding='utf-8') as file:
    # One training example per line: conditioning part, a space, target part.
    for input_text, target_text in zip(rows_to_write['input'], rows_to_write['output']):
        file.write(input_text + ' ' + target_text + '\n')

In [None]:
# Check that the training frame holds exactly the two text columns.
print(train_data.columns)

Index(['input', 'output'], dtype='object')


# Create generation model

# Generation model

## Get model

In [None]:
# Training configuration.
# NOTE(review): only output_name, learning_rate and save_steps are read by the
# training cell; model name, epochs and batch size are hard-coded there.
model_name = 'gpt2'  # no trailing comma: a comma here would make this a 1-tuple
cache_dir = 'model_cache'
output_name = 'fine_tuned_model'
num_train_epochs = 4
per_device_train_batch_size = 4
learning_rate = 5e-5
save_steps = 10_000

In [None]:
# Load the pretrained GPT-2 tokenizer and language-model head.
# The model's pad token id is set to the tokenizer's EOS token id.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## Model training

In [None]:
# Tokenise data.txt and cut it into blocks of 256 token ids.
# overwrite_cache=True re-tokenises the file on every run, so a changed
# data.txt is never masked by a previously cached feature file.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=str(dataset_path),
    block_size=256,
    overwrite_cache=True,
)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# label_names is left at its default: the collator provides the targets under
# 'labels', and 'full_text' is not a key of the batches.
# NOTE(review): epochs and batch size are literals here although the config cell
# defines num_train_epochs (4) and per_device_train_batch_size (4) - confirm
# which epoch count is intended.
training_args = TrainingArguments(
    output_dir=str(data_path / output_name),
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    save_steps=save_steps,
    learning_rate=learning_rate,
    save_total_limit=2,
    logging_dir=str(data_path / 'logs'),
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so both can be
# reloaded from the same directory.
model.save_pretrained(str(data_path / output_name))
tokenizer.save_pretrained(str(data_path / output_name))




Step,Training Loss


('/content/fine_tuned_model/tokenizer_config.json',
 '/content/fine_tuned_model/special_tokens_map.json',
 '/content/fine_tuned_model/vocab.json',
 '/content/fine_tuned_model/merges.txt',
 '/content/fine_tuned_model/added_tokens.json')

## Load model

In [None]:
# Reload the fine-tuned tokenizer and model from the directory written by the training cell.
model_path = Path(data_path) / output_name
tokenizer = GPT2Tokenizer.from_pretrained(str(model_path))
model = GPT2LMHeadModel.from_pretrained(str(model_path))
# Switch to evaluation mode; as the last expression this also displays the model structure.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# Generate text

In [None]:
def generate_desc(title: str, year: int, genre: list,
                    model=model,
                    max_length=100,
                    num_return_sequences=1,
                    temperature=1.0,
                    top_k=0,
                    top_p=1.0,
                    do_sample=False):
    """Generate description text for a title with the fine-tuned model.

    The prompt is built in the same layout as the training lines written to
    data.txt: " <clean_title> ...  <clean_year> ...  <Genre> 0/1 ..." followed
    by the EOS token and the " <full_text> " tag.

    Args:
        title: Title of the movie/show; lower-cased like the training titles.
        year: Release year (the training data stores it as a four-digit string).
        genre: Genre names to switch on. A list of names, or a single
            comma-separated string such as 'Action, Comedy, Drama'.
        model: Model used for generation. The default is bound to the global
            ``model`` that exists when this function is defined.
        max_length: Maximum number of newly generated tokens (prompt excluded).
        num_return_sequences: Number of texts to generate.
        temperature, top_k, top_p, do_sample: Passed through to ``model.generate``.

    Returns:
        A list with ``num_return_sequences`` strings holding only the generated
        continuation (the prompt is removed).
    """
    genres_names = ['Action', 'Adventure', 'Animation',
                    'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
                    'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror', 'Music',
                    'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi',
                    'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']

    # A plain string would be matched by substring ('Music' in 'Musical'), so
    # split it into exact names first.
    if isinstance(genre, str):
        genre = [name.strip() for name in genre.split(',')]
    selected = set(genre)

    # Same field order as the training input: title, year, then the genre flags.
    fields = {'clean_title': str(title).lower(), 'clean_year': year}
    fields.update({name: int(name in selected) for name in genres_names})

    # Build the prompt exactly like a training line up to the start of the target.
    prompt_text = (' '.join(f" <{key}> {value}" for key, value in fields.items())
                   + f" {tokenizer.eos_token}  <full_text> ")

    # Encode the prompt. The attention mask is passed explicitly because the
    # prompt contains the EOS token, which is also used as the pad token.
    encoded = tokenizer(prompt_text, return_tensors='pt').to(model.device)
    input_ids = encoded['input_ids']
    prompt_tokens = input_ids.shape[1]

    # Generate the continuations.
    outputs = model.generate(
        input_ids,
        attention_mask=encoded['attention_mask'],
        max_new_tokens=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Each output starts with the prompt tokens; decode only what follows them.
    return [
        tokenizer.decode(output[prompt_tokens:], skip_special_tokens=True)
        for output in outputs
    ]

In [None]:
# Distinct titles, years and raw genre strings, in order of first appearance.
unique_title = prep_df['clean_title'].drop_duplicates().tolist()
unique_year = prep_df['clean_year'].drop_duplicates().tolist()
unique_genre = df['genre'].drop_duplicates().tolist()

In [None]:
# Inspect one raw genre entry: it is a single comma-separated string, not a list.
unique_genre[0]

'Action, Comedy, Drama'

## Test the generation model on inputs taken from the dataset

In [None]:
# Title and year are each taken by position from their own list of distinct
# values, so they do not necessarily come from the same dataset row.
title = unique_title[1]
year = unique_year[1]
genre = ['Action', 'Comedy', 'Drama']


# Sample three candidate descriptions; only the first one is printed below.
generated_texts = generate_desc(
    title=title,
    year=year,
    genre=genre,
    max_length=100,
    num_return_sequences=3,
    do_sample=True,
    temperature=0.95,  # temperature set slightly below 1.0
    top_k=10,         # limit the number of top-k candidate words considered
    top_p=0.95        # limit the "nucleus" of the distribution
)
print(generated_texts[0])

  This book is written in the style of a story.  A short, but powerful tale of love and tragedy that follows the life of one single mother.     http://www.bibliothèque-historique.fr/p/1d10/  Miele de Cunha-Mieu  (1927)  *  ~ 
<link rel="short" type="


## The generation model produces quite good text. To make the text better suited to a particular task, you can try changing these parameters:
## - max_length (maximum length of the generated text)
## - temperature (randomness of the sampling)
## - top_k, top_p (sampling strategies)

# Thank you for your attention! ✊

---
