In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.vectorstores import FAISS


In [None]:
import pandas as pd
# Load the cleaned movie table; later cells read its 'description' and
# 'categories' columns.
movies = pd.read_csv(filepath_or_buffer='cleand_movie.csv')

# One-off export of the descriptions to a text file (kept disabled):
# movies['description'].to_csv('description.txt', index=False,header=False,sep='\n')

In [None]:
# Read the exported descriptions file.
# NOTE(review): 'description.txt' must already exist on disk; presumably it holds
# one movie description per line -- confirm against the export step.
raw_document = TextLoader('description.txt', encoding='utf-8').load()

# Split on newlines so that every line becomes its own chunk.
# chunk_size must be positive: recent langchain_text_splitters releases reject
# chunk_size=0 with a ValueError. A size of 1 keeps the intended behaviour,
# because any two lines joined together exceed it and are never merged
# (the splitter may log "chunk longer than specified" warnings; that is expected).
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator='\n')
document = text_splitter.split_documents(raw_document)

In [None]:
# Import from langchain_community (already used for TextLoader in this notebook):
# the `langchain.embeddings` re-export of HuggingFaceEmbeddings is deprecated.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-embedding model used to vectorise every description chunk.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the FAISS vector index over the split description chunks.
db_movies = FAISS.from_documents(document, embeddings)

In [None]:
# Persisting the index is left disabled:
# db_movies.save_local("faiss_movies_index")

# Quick smoke test: show the five chunks closest to a free-text query.
query = "an ethiopian guy go to prison"
db_movies.similarity_search(query=query, k=5)

In [None]:
import pandas as pd
from typing import List

def retrive_result(query: str, top_k: int = 16, db=None) -> List[str]:
    """Return the texts of the chunks most similar to ``query``.

    Args:
        query: Free-text search string.
        top_k: Number of nearest chunks to request from the vector store.
        db: Optional vector store exposing ``similarity_search(query, k=...)``.
            When omitted, the notebook-level ``db_movies`` index is used.

    Returns:
        A list of strings (not a DataFrame): the ``page_content`` of each hit,
        in the order returned by the store, with leading/trailing double
        quotes stripped.
    """
    store = db_movies if db is None else db

    # search similar docs
    recs = store.similarity_search(query, k=top_k)

    # Strip surrounding double quotes from each hit's text.
    # NOTE(review): presumably these quotes come from CSV quoting in the
    # exported descriptions file -- confirm.
    return [rec.page_content.strip('"') for rec in recs]


In [None]:
# Retrieve the chunks closest to a sample query (default top_k).
df = retrive_result(query='world war 2')

In [None]:
# Collapse the fine-grained genre labels into two coarse classes.
category_mapping = {
    # Light & Entertaining
    **dict.fromkeys(
        [
            'Comedy',
            'Animation',
            'Family',
            'Musical',
            'Music',
            'Romance',
            'Action',
            'Adventure',
            'Fantasy',
            'Sport',
            'TV Movie',
            'Reality-TV',
            'Talk-Show',
        ],
        'Light & Entertaining',
    ),
    # Dark / Serious / Realistic
    **dict.fromkeys(
        [
            'Drama',
            'Biography',
            'History',
            'War',
            'Thriller',
            'Crime',
            'Mystery',
            'Horror',
            'Documentary',
            'Sci-Fi',
            'Science Fiction',
            'Western',
            'Short',
            'Adult',
        ],
        'Dark / Serious / Realistic',
    ),
}

# Values of 'categories' that are not keys of the mapping become NaN.
movies['simple_category'] = movies['categories'].map(category_mapping)

In [None]:
# Class balance of the coarse labels. dropna=False also counts rows whose
# 'categories' value had no entry in category_mapping (NaN), which the default
# value_counts() would silently leave out.
movies['simple_category'].value_counts(dropna=False)


In [None]:
from transformers import pipeline
# Candidate labels offered to the zero-shot classifier.
fiction_catigories = [
    'Light & Entertaining',
    'Dark / Serious / Realistic',
]

# Zero-shot classification pipeline backed by a publicly available model.
pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")


In [None]:
# Take the description at position 5 among the 'Light & Entertaining' rows
# and show the classifier's scores for both candidate labels.
sequence = (
    movies
    .loc[movies['simple_category'] == 'Light & Entertaining', 'description']
    .reset_index(drop=True)
    .loc[5]
)
pipe(sequence, candidate_labels=fiction_catigories)

In [None]:
import numpy as np
def define_catigories(sequence,category):
    """Return the candidate label that the classifier scores highest.

    Args:
        sequence: Text passed to the notebook-level ``pipe`` classifier.
        category: Candidate labels passed to ``pipe`` alongside the text.

    Returns:
        The entry of the result's ``'labels'`` found at the position of the
        largest value in its ``'scores'``.
    """
    result = pipe(sequence,category)
    scores, labels = result['scores'], result['labels']
    return labels[np.argmax(scores)]

In [None]:
from tqdm import tqdm
actaul_cat = []
predicted_cat = []
for i in tqdm(range(0,300)):
    sequence = movies.loc[movies['simple_category'] == 'Light & Entertaining','description'].reset_index(drop=True)[i]
    predicted_cat+= [define_catigories(sequence,fiction_catigories)]
    actaul_cat += ['Light & Entertaining']



In [None]:
actaul_cat = []
predicted_cat = []
for i in tqdm(range(0,300)):
    sequence = movies.loc[movies['simple_category'] == 'Dark / Serious / Realistic','description'].reset_index(drop=True)[i]
    predicted_cat+= [define_catigories(sequence,fiction_catigories)]
    actaul_cat += ['Dark / Serious / Realistic']


In [None]:
# Pair every true label with the label the classifier predicted for it.
predicted_df = pd.DataFrame(
    {
        'actaul_cat': actaul_cat,
        'predicted_cat': predicted_cat,
    }
)

In [None]:
# 1 where the prediction matches the true label, 0 otherwise.
# Compare the frame's own two columns rather than the notebook-level list, so
# the result cannot drift if that list is changed after the frame was built.
predicted_df['correct_prediction'] = (
    predicted_df['actaul_cat'] == predicted_df['predicted_cat']
).astype(int)
predicted_df['correct_prediction']

In [None]:
# Accuracy: the share of rows flagged as correct (mean of the 0/1 column).
predicted_df['correct_prediction'].mean()