In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Location of the movie metadata CSV. Set the DATASET_PATH environment
# variable to point the notebook at another copy of the file; when it is
# unset the original author's local path is used, so existing runs behave
# exactly as before.
DATA_PATH = os.environ.get(
    "DATASET_PATH",
    "/home/relationskatie/Downloads/dataset_23k_v2.csv",
)
df = pd.read_csv(DATA_PATH)
# Peek at the first two rows to confirm the columns loaded as expected.
df.head(2)

Unnamed: 0,id,title,genres,overview,production_companies,production_countries,release_date,runtime,vote_average,vote_count,cast,keywords,director,weight_rating
0,862,Toy Story,"[""Animation"", ""Comedy"", ""Family""]","Led by Woody, Andy's toys live happily in his ...","[""Pixar Animation Studios""]","[""United States of America""]",1995-10-30,81,7.7,5415,"[""Tom Hanks"", ""Tim Allen"", ""Don Rickles""]","[""jealousy"", ""toy"", ""boy""]",John Lasseter,7.401
1,8844,Jumanji,"[""Adventure"", ""Fantasy"", ""Family""]",When siblings Judy and Peter discover an encha...,"[""TriStar Pictures"", ""Teitler Film"", ""Intersco...","[""United States of America""]",1995-12-15,104,6.9,2413,"[""Robin Williams"", ""Jonathan Hyde"", ""Kirsten D...","[""board game"", ""disappearance"", ""based on chil...",Joe Johnston,6.592


# KNN: content-based nearest neighbours over TF-IDF features

In [None]:
def get_str(x):
  """Flatten a stringified list such as '["a", "b"]' into 'a b'.

  The first and last characters (the surrounding brackets) are dropped,
  every double quote is removed, and the remainder is split on ', ' and
  re-joined with single spaces.

  Args:
    x: The cell value. Normally a string; a missing value (None or a float
      NaN, which is what pandas yields for an empty CSV field) is also
      accepted.

  Returns:
    The space-separated items, or '' for a missing value or an empty list.
  """
  # `x != x` is true only for NaN; treat missing cells as "no items" instead
  # of failing on the slice below.
  if x is None or (isinstance(x, float) and x != x):
    return ''
  inner = x[1:-1]
  items = inner.replace('"', '').split(', ')
  return ' '.join(items)

In [None]:
def clean_data(s: str):
  """Return `s` lower-cased with every comma and full stop deleted."""
  # A translation table whose third argument lists characters to drop.
  without_punctuation = s.translate(str.maketrans('', '', ',.'))
  return without_punctuation.lower()

In [None]:
# Build one free-text "features" document per movie by concatenating, in
# this order: title, overview, genres, director, cast and keywords.
# genres/cast/keywords are stringified lists and are flattened by get_str.
#
# Each column is filled with '' first: string concatenation with a NaN
# yields NaN for the whole row, which clean_data cannot process. Rows with
# no missing values produce exactly the same text as before.
df['features'] = (
    df['title'].fillna('') + ' '
    + df['overview'].fillna('') + ' '
    + df['genres'].fillna('').apply(get_str) + ' '
    + df['director'].fillna('') + ' '
    + df['cast'].fillna('').apply(get_str) + ' '
    + df['keywords'].fillna('').apply(get_str)
).apply(clean_data)  # lower-case and strip commas / full stops

In [None]:
# Word-level TF-IDF over unigrams, bigrams and trigrams of the combined
# "features" text; the result has one row per movie.
tf = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
tfidf_matrix = tf.fit_transform(df.loc[:, 'features'])

In [None]:
# Pairwise dot products between all rows of the TF-IDF matrix; when the
# second argument is omitted linear_kernel compares the matrix with itself.
# With TfidfVectorizer's default L2 row normalisation this dot product is
# the cosine similarity.
# The result is a dense (n_movies x n_movies) array, so memory use grows
# quadratically with the number of movies.
tf_cos = linear_kernel(tfidf_matrix)

In [None]:
def get_recommendations(id: int, cosine_sim=tf_cos, popularity_threshold=0.65, top_n=10):
    """Return titles of the movies most similar to the movie with the given id.

    Candidates are ranked by descending similarity to the query movie, the
    query movie itself is removed, movies whose ``weight_rating`` is below
    ``popularity_threshold`` are discarded, and the first ``top_n`` remaining
    titles are returned.

    Args:
        id: Value to look up in ``df['id']``. If several rows share the id,
            the first one is used.
        cosine_sim: Square similarity matrix whose rows/columns follow the row
            order of ``df``. The default is bound when this function is
            defined, so re-run this cell after recomputing ``tf_cos``.
        popularity_threshold: Minimum ``weight_rating`` a candidate must have.
            NOTE(review): the sample rows shown after loading have
            weight_rating around 6.6-7.4, so the default 0.65 probably
            filters out very little - confirm the intended scale.
        top_n: Maximum number of titles to return (non-negative).

    Returns:
        A list of at most ``top_n`` titles, most similar first.

    Raises:
        IndexError: If no row of ``df`` has the requested id.
    """
    # Locate the movie by row *position*: both the similarity matrix and
    # df.iloc below are positional, while df.index labels need not be 0..n-1.
    matches = np.flatnonzero((df['id'] == id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie with id {id!r} found in df['id']")
    idx = int(matches[0])

    # Similarity of the query movie to every movie, as a flat array.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Stable descending sort: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly; it is not guaranteed to sort first
    # when another movie ties with it (e.g. a duplicate entry).
    order = order[order != idx]

    # Keep only sufficiently well-rated movies, preserving similarity order.
    candidates = df.iloc[order]
    popular_movies = candidates[candidates['weight_rating'] >= popularity_threshold]

    return popular_movies['title'].iloc[:top_n].tolist()

In [None]:
# Persist the TF-IDF matrix to the current working directory.
# NOTE(review): the fitted vectorizer `tf` is not saved in the cells shown
# here - confirm whether consumers of this file need it to vectorise new text.
joblib.dump(value=tfidf_matrix, filename='tfidf_matrix.pkl')

['tfidf_matrix.pkl']

In [None]:
# Persist the full pairwise similarity matrix to the current working
# directory. It is a dense square array, so the file can be large.
joblib.dump(value=tf_cos, filename='tf_cos.pkl')

['tf_cos.pkl']