In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/flight-price-prediction/business.csv
/kaggle/input/flight-price-prediction/economy.csv
/kaggle/input/flight-price-prediction/Clean_Dataset.csv


# Installing dependencies

In [9]:
!pip install datasets



In [10]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install bitsandbytes

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-5eg2f_gj
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-5eg2f_gj
  Resolved https://github.com/huggingface/transformers.git to commit 7f112caac2ff365c3d6e0020fefe8c1300311e07
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-req-build-wyqm9hz0
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-req-build-wyqm9hz0
  Resolved https://github.com/huggingface/accelerate.git to commit 4b4c036933f7c50fe3a7027b0380fcec53c6975e
  Installing build dependencies ... [?25ldone
[?25h  Ge

In [11]:
import torch
from transformers import AutoModel, AutoTokenizer,AutoModelForCausalLM,GenerationConfig
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from datasets import load_dataset
import numpy as np

In [12]:
from huggingface_hub import login

login("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",add_to_git_credential=True)

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading Model and Tokenizer

In [13]:
model_name = "meta-llama/Meta-Llama-3-8B" # Replace with correct model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,# loading the model with 4 bit precision, so as to show the example on Colab GPU
    device_map='auto'
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
tokenizer.pad_token = tokenizer.eos_token

# Loading Dataset

In [21]:
from datasets import load_dataset

ds = load_dataset("jquigl/imdb-genres")

In [22]:
train_data = ds['train']
val_data = ds['validation']

In [17]:
ds

DatasetDict({
    train: Dataset({
        features: ['movie title - year', 'genre', 'expanded-genres', 'rating', 'description'],
        num_rows: 238256
    })
    validation: Dataset({
        features: ['movie title - year', 'genre', 'expanded-genres', 'rating', 'description'],
        num_rows: 29809
    })
    test: Dataset({
        features: ['movie title - year', 'genre', 'expanded-genres', 'rating', 'description'],
        num_rows: 29756
    })
})

In [10]:
def create_prompt_classification(movie):
    return f"What is the genre of the movie with title: {movie['movie title - year']}',movie description: '{movie['description']}' and movie rating: {movie['rating']}?"

In [11]:
create_prompt_classification(train_data[0])

"What is the genre of the movie with title: Flaming Ears - 1992',movie description: 'Flaming Ears is a pop sci-fi lesbian fantasy feature set in the year 2700 in the fictive burned-out city of Asche. It follows the tangled lives of three women - Volley, Nun and Spy.' and movie rating: 6.0?"

## Preprocessing data

In [23]:
def remove_null_ratings(example):
    return example['rating'] is not None

In [24]:
filtered_train = train_data.filter(remove_null_ratings)
filtered_validation = val_data.filter(remove_null_ratings)

Filter:   0%|          | 0/238256 [00:00<?, ? examples/s]

Filter:   0%|          | 0/29809 [00:00<?, ? examples/s]

In [25]:
len(filtered_train),len(filtered_validation)

(168535, 21125)

In [26]:
train=filtered_train.shuffle().select(range(1000))
validation=filtered_validation.shuffle().select(range(100))

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [31]:
def extract_embeddings(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
    return outputs.hidden_states


# Classification

In [19]:
from tqdm import tqdm
train_embeddings_firstLayer = []
train_embeddings_middlelayer= []
train_embeddings_lastLayer=[]
train_ratings = []
train_genres = []
print("Training start....")
for movie in tqdm(train,desc="Extracting",leave=True):
    prompt = create_prompt_classification(movie)
    embeddings = extract_embeddings(prompt, model, tokenizer)
    
    first_layer_embedding = embeddings[1][:,-1,:].squeeze().numpy()
    middle_layer_embedding = embeddings[16][:,-1,:].squeeze().numpy()
    final_layer_embedding = embeddings[-1][:,-1,:].squeeze().numpy()
    
    train_embeddings_firstLayer.append(first_layer_embedding)
    train_embeddings_middlelayer.append(middle_layer_embedding)
    train_embeddings_lastLayer.append(final_layer_embedding)
    train_ratings.append(movie['rating'])
    train_genres.append(movie['genre'])
print("Validation Start")

val_embeddings_firstLayer = []
val_embeddings_middlelayer = []
val_embeddings_lastLayer = []
val_ratings = []
val_genres=[]
for movie in tqdm(validation,desc="Extracting",leave=True):
    prompt = create_prompt_classification(movie)
    embeddings = extract_embeddings(prompt, model, tokenizer)
    
    first_layer_embedding = embeddings[1][:,-1,:].squeeze().numpy()
    middle_layer_embedding = embeddings[16][:,-1,:].squeeze().numpy()
    final_layer_embedding = embeddings[-1][:,-1,:].squeeze().numpy()
    
    val_embeddings_firstLayer.append(first_layer_embedding)
    val_embeddings_middlelayer.append(middle_layer_embedding)
    val_embeddings_lastLayer.append(final_layer_embedding)
    val_ratings.append(movie['rating']) 
    val_genres.append(movie['genre'])


Training start....


Extracting:   0%|          | 0/1000 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Extracting: 100%|██████████| 1000/1000 [12:55<00:00,  1.29it/s]


Validation Start


Extracting: 100%|██████████| 100/100 [01:14<00:00,  1.34it/s]


## Last Layer

In [20]:
label_encoder = LabelEncoder()
train_genres_encoded = label_encoder.fit_transform(train_genres)
val_genres_encoded = label_encoder.transform(val_genres)
X_train_genre=np.array(train_embeddings_lastLayer)
y_train_genre=np.array(train_genres_encoded)
X_val_genre=np.array(val_embeddings_lastLayer)
y_val_genre=np.array(val_genres_encoded)
classifier1 = RandomForestClassifier(n_estimators=100, random_state=42)
classifier1.fit(X_train_genre, y_train_genre)
genre_pred = classifier1.predict(X_val_genre)
accuracy = accuracy_score(y_val_genre, genre_pred)
print(f"Accuracy for genre prediction using Random Forest: {accuracy}")

Accuracy for genre prediction using Random Forest: 0.26


## Middle Layer

In [21]:
label_encoder = LabelEncoder()
train_genres_encoded = label_encoder.fit_transform(train_genres)
val_genres_encoded = label_encoder.transform(val_genres)
X_train_genre=np.array(train_embeddings_middlelayer)
y_train_genre=np.array(train_genres_encoded)
X_val_genre=np.array(val_embeddings_middlelayer)
y_val_genre=np.array(val_genres_encoded)
classifier1 = RandomForestClassifier(n_estimators=100, random_state=42)
classifier1.fit(X_train_genre, y_train_genre)
genre_pred = classifier1.predict(X_val_genre)
accuracy = accuracy_score(y_val_genre, genre_pred)
print(f"Accuracy for genre prediction using Random Forest: {accuracy}")

Accuracy for genre prediction using Random Forest: 0.24


## First layer

In [23]:
label_encoder = LabelEncoder()
train_genres_encoded = label_encoder.fit_transform(train_genres)
val_genres_encoded = label_encoder.transform(val_genres)
X_train_genre=np.array(train_embeddings_firstLayer)
y_train_genre=np.array(train_genres_encoded)
X_val_genre=np.array(val_embeddings_firstLayer)
y_val_genre=np.array(val_genres_encoded)
classifier1 = RandomForestClassifier(n_estimators=100, random_state=42)
classifier1.fit(X_train_genre, y_train_genre)
genre_pred = classifier1.predict(X_val_genre)
accuracy = accuracy_score(y_val_genre, genre_pred)
print(f"Accuracy for genre prediction using Random Forest: {accuracy}")

Accuracy for genre prediction using Random Forest: 0.12


# Regression

In [27]:
ds

DatasetDict({
    train: Dataset({
        features: ['movie title - year', 'genre', 'expanded-genres', 'rating', 'description'],
        num_rows: 238256
    })
    validation: Dataset({
        features: ['movie title - year', 'genre', 'expanded-genres', 'rating', 'description'],
        num_rows: 29809
    })
    test: Dataset({
        features: ['movie title - year', 'genre', 'expanded-genres', 'rating', 'description'],
        num_rows: 29756
    })
})

In [28]:
def create_prompt_regression (movie):
    return f"Given the movie title: '{movie['movie title - year']}',movie genre: '{movie['genre']}' and movie description: '{movie['description']}',what is the rating of the movie out of 10?"

In [32]:
from tqdm import tqdm
train_embeddings_firstLayer = []
train_embeddings_middlelayer= []
train_embeddings_lastLayer=[]
train_ratings = []
train_genres = []
print("Training start....")
for movie in tqdm(train,desc="Extracting",leave=True):
    prompt = create_prompt_regression(movie)
    embeddings = extract_embeddings(prompt, model, tokenizer)
    
    first_layer_embedding = embeddings[1][:,-1,:].squeeze().numpy()
    middle_layer_embedding = embeddings[16][:,-1,:].squeeze().numpy()
    final_layer_embedding = embeddings[-1][:,-1,:].squeeze().numpy()
    
    train_embeddings_firstLayer.append(first_layer_embedding)
    train_embeddings_middlelayer.append(middle_layer_embedding)
    train_embeddings_lastLayer.append(final_layer_embedding)
    train_ratings.append(movie['rating'])
    train_genres.append(movie['genre'])
print("Validation Start")

val_embeddings_firstLayer = []
val_embeddings_middlelayer = []
val_embeddings_lastLayer = []
val_ratings = []
val_genres=[]
for movie in tqdm(validation,desc="Extracting",leave=True):
    prompt = create_prompt_regression(movie)
    embeddings = extract_embeddings(prompt, model, tokenizer)
    
    first_layer_embedding = embeddings[1][:,-1,:].squeeze().numpy()
    middle_layer_embedding = embeddings[16][:,-1,:].squeeze().numpy()
    final_layer_embedding = embeddings[-1][:,-1,:].squeeze().numpy()
    
    val_embeddings_firstLayer.append(first_layer_embedding)
    val_embeddings_middlelayer.append(middle_layer_embedding)
    val_embeddings_lastLayer.append(final_layer_embedding)
    val_ratings.append(movie['rating']) 
    val_genres.append(movie['genre'])


Training start....


Extracting:   0%|          | 0/1000 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Extracting: 100%|██████████| 1000/1000 [13:00<00:00,  1.28it/s]


Validation Start


Extracting: 100%|██████████| 100/100 [01:17<00:00,  1.30it/s]


## Last Layer

In [33]:
X_train=np.array(train_embeddings_lastLayer)
y_train=np.array(train_ratings)
X_val=np.array(val_embeddings_lastLayer)
y_val=np.array(val_ratings)

regressor1= LinearRegression()
regressor1.fit(X_train, y_train)

# Predict and evaluate the regression model
y_pred = regressor1.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error (MSE) for rating prediction: {mse}")

Mean Squared Error (MSE) for rating prediction: 3.042576080322265


## Middle Layer

In [34]:
X_train=np.array(train_embeddings_middlelayer)
y_train=np.array(train_ratings)
X_val=np.array(val_embeddings_middlelayer)
y_val=np.array(val_ratings)

regressor1= LinearRegression()
regressor1.fit(X_train, y_train)

# Predict and evaluate the regression model
y_pred = regressor1.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error (MSE) for rating prediction: {mse}")

Mean Squared Error (MSE) for rating prediction: 3.7705216369628904


## First Layer

In [35]:
X_train=np.array(train_embeddings_firstLayer)
y_train=np.array(train_ratings)
X_val=np.array(val_embeddings_firstLayer)
y_val=np.array(val_ratings)

regressor1= LinearRegression()
regressor1.fit(X_train, y_train)

# Predict and evaluate the regression model
y_pred = regressor1.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error (MSE) for rating prediction: {mse}")

Mean Squared Error (MSE) for rating prediction: 5.48046484375
