In [44]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Location of the movies CSV; a relative path, so it is resolved against the working directory.
path = "data/ml_movies_data.csv"

# Load the full movies table; the later cells derive their subsets from it.
movies = pd.read_csv(path)

In [10]:
def get_column_summary(dataset, column_name):
    """
    Count how many values of each Python type a column holds.

    Args:
        dataset: The DataFrame containing the column.
        column_name: The name of the column to inspect.

    Returns:
        A 3-tuple ``(type_counts, "total values:", n_rows)``: ``type_counts`` is
        a Series indexed by Python type holding the number of values of that
        type, and ``n_rows`` is the number of rows in ``dataset``.
    """
    value_types = dataset[column_name].apply(type)
    type_counts = value_types.value_counts()
    n_rows = dataset.shape[0]
    return (type_counts, "total values:", n_rows)

## Filter by collection

In [46]:
# Keep only the id, title and collection columns for the collection-based filter.
movies_by_collection = movies[["id", "title", "collection_name"]]

In [47]:
# Count the Python types stored in collection_name across the full movies table.
get_column_summary(movies, "collection_name")

(collection_name
 <class 'float'>    40861
 <class 'str'>       4485
 Name: count, dtype: int64,
 'total values:',
 45346)

In [49]:
# Keep only the movies that belong to a collection (drop rows whose collection_name is NaN).
collections = movies_by_collection.dropna(subset=["collection_name"])

In [51]:
# Re-run the type summary on the filtered frame to verify the NaN rows are gone.
get_column_summary(collections, "collection_name")

(collection_name
 <class 'str'>    4485
 Name: count, dtype: int64,
 'total values:',
 4485)

## Genre and rating

In [84]:
# Select the columns needed for the genre- and rating-based filtering below.
movies_by_gen_rat = movies[["id", "title", "movie_genres", "vote_average", "vote_count", "tagline"]]

In [85]:
# Check which Python types appear in vote_average before filtering on it.
get_column_summary(movies_by_gen_rat, "vote_average")

(vote_average
 <class 'float'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

In [86]:
# Flag movies that nobody has rated: zero votes together with a zero average.
# The flag is computed with a vectorised mask instead of a row loop, so rows are
# matched by alignment rather than by enumerate() position; a position only
# equals the index label when the frame has a default RangeIndex.
# `to_delete` is 1.0 for flagged rows and NaN everywhere else.
temp_dataset = movies_by_gen_rat.assign(
    to_delete=lambda frame: np.where(
        (frame["vote_count"] == 0) & (frame["vote_average"] == 0), 1.0, np.nan
    )
)

# Keep the rows that were not flagged and drop the helper / vote_count columns.
movies_by_gen_rat_1 = temp_dataset.loc[
    temp_dataset["to_delete"] != 1,
    ["id", "title", "movie_genres", "vote_average", "tagline"],
]

In [99]:
# Drop the movies that have no tagline (NaN in the tagline column), then preview the result.
movies_by_gen_rat_2 = movies_by_gen_rat_1.dropna(subset=["tagline"])
movies_by_gen_rat_2.head()

Unnamed: 0,id,title,movie_genres,vote_average,tagline
1,8844,Jumanji,"['Adventure', 'Fantasy', 'Family']",6.9,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,"['Romance', 'Comedy']",6.5,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",6.1,Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,['Comedy'],5.7,Just When His World Is Back To Normal... He's ...
5,949,Heat,"['Action', 'Crime', 'Drama', 'Thriller']",7.7,A Los Angeles Crime Saga


In [100]:
# Keep only the movies tagged with at least one of the wanted genres.

list_genres = ["Adventure", "Romance"]

# Row filter: a plain `in` containment test of each wanted genre against the
# movie_genres value; the tagline column is dropped in the same step.
movies_by_gen_rat_2 = movies_by_gen_rat_2.loc[
    movies_by_gen_rat_2["movie_genres"].apply(
        lambda genres_value: any(genre in genres_value for genre in list_genres)
    ),
    ["id", "title", "movie_genres", "vote_average"],
]

In [101]:
# Inspect the row count, columns and dtypes of the genre-filtered dataset.
movies_by_gen_rat_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5203 entries, 1 to 45318
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            5203 non-null   int64  
 1   title         5203 non-null   object 
 2   movie_genres  5203 non-null   object 
 3   vote_average  5203 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 203.2+ KB


In [96]:
# Print every movie_genres value that is an empty genre list.
# The column is loaded by read_csv without converters, so a value is the text
# of a list (e.g. "['Comedy']") rather than a list object; comparing against []
# alone can never match such text, so the textual form "[]" is checked as well.
for value in movies_by_gen_rat_2["movie_genres"]:
    if value == [] or (isinstance(value, str) and value.strip() == "[]"):
        print(value)

## Model

In [35]:
# Bag-of-words vectorizer: English stop words are ignored and the vocabulary is capped at 1000 terms.
count_vector = CountVectorizer(max_features=1000, stop_words='english')

In [40]:
# Turn each overview into a row of term counts (dense array, one row per movie).
# Missing overviews are replaced by an empty string first: casting NaN straight
# to text yields the literal word "nan", which the vectorizer would treat as a
# real term shared by every movie without an overview.
vector = count_vector.fit_transform(movies["overview"].fillna("").astype(str)).toarray()

In [None]:
# Cosine similarity between every pair of rows of `vector`.
# NOTE(review): the result has one row and one column per movie; with the ~45k rows
# reported by the summaries in this notebook that is about 2 billion entries —
# confirm it fits in memory before running on the full table.
similarity = cosine_similarity(vector)

In [None]:
# Show the index label of the first row whose title is "The Godfather".
movies[movies['title']=="The Godfather"].index[0]

In [None]:
# Rank every movie by its similarity score against row 2, highest score first,
# then print the titles of the five top-ranked rows.
# NOTE(review): row 2 is hard-coded rather than taken from a title lookup — confirm this is intended.
distance = sorted(enumerate(similarity[2]), key=lambda pair: pair[1], reverse=True)
for i in distance[:5]:
    print(movies.iloc[i[0]].title)



In [None]:


def recommand(movies, data=None, similarity_matrix=None, top_n=5):
    """
    Print the titles of the movies most similar to a given movie.

    Args:
        movies: Title of the reference movie. The parameter name is kept for
            backward compatibility even though it holds a single title string.
        data: DataFrame with a "title" column whose row order matches the rows
            of the similarity matrix. Defaults to the notebook-level ``movies``
            DataFrame, which is the frame the similarity matrix is built from.
        similarity_matrix: Square matrix of pairwise similarity scores.
            Defaults to the notebook-level ``similarity``.
        top_n: Number of titles to print, best match first.

    Raises:
        IndexError: If no row of ``data`` has the requested title.
    """
    # The `movies` parameter shadows the notebook-level DataFrame of the same
    # name, so the frame has to be fetched from the module namespace explicitly.
    catalog = globals()["movies"] if data is None else data
    scores = similarity if similarity_matrix is None else similarity_matrix

    # Use row positions, not index labels: the similarity matrix is positional,
    # and labels only equal positions for a default RangeIndex.
    positions = np.flatnonzero((catalog["title"] == movies).to_numpy())
    if positions.size == 0:
        raise IndexError(f"no movie titled {movies!r} in the dataset")
    reference_pos = positions[0]

    ranked = sorted(enumerate(scores[reference_pos]), key=lambda pair: pair[1], reverse=True)
    for row_pos, _score in ranked[:top_n]:
        print(catalog["title"].iloc[row_pos])



In [None]:


# Example call: print the recommendations for "Iron Man".
recommand("Iron Man")

