In [99]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [100]:
# Read the pre-cleaned movie metadata from the working directory and show it.
df = pd.read_csv(filepath_or_buffer="preprocessed_data.csv")
df

Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country
0,The Shining,R,Drama,8.4,Stanley Kubrick,Stephen King,Jack Nicholson,Warner Bros.,1980,United States
1,The Blue Lagoon,R,Adventure,5.8,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,Columbia Pictures,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,8.7,Irvin Kershner,Leigh Brackett,Mark Hamill,Lucasfilm,1980,United States
3,Airplane!,PG,Comedy,7.7,Jim Abrahams,Jim Abrahams,Robert Hays,Paramount Pictures,1980,United States
4,Caddyshack,R,Comedy,7.3,Harold Ramis,Brian Doyle-Murray,Chevy Chase,Orion Pictures,1980,United States
...,...,...,...,...,...,...,...,...,...,...
3822,Birthday Girl,R,Comedy,6.1,Jez Butterworth,Tom Butterworth,Nicole Kidman,FilmFour,2002,United States
3823,Human Nature,R,Comedy,6.4,Michel Gondry,Charlie Kaufman,Tim Robbins,Fine Line Features,2001,France
3824,Made,R,Comedy,6.4,Jon Favreau,Jon Favreau,Vince Vaughn,Artisan Entertainment,2001,United States
3825,One Night at McCool's,R,Comedy,6.1,Harald Zwart,Stan Seidel,Liv Tyler,October Films,2001,United States


In [101]:
# Per-column count of missing values (isna is the canonical name for isnull).
df.isna().sum()

name        0
rating      0
genre       0
score       0
director    0
writer      0
star        0
company     0
year        0
country     0
dtype: int64

In [102]:
# Distinct values of the `year` column, in order of first appearance.
df.loc[:, 'year'].unique()

array([1980, 1981, 1984, 1983, 1982, 1987, 1985, 1986, 1988, 1989, 1992,
       1990, 1991, 1993, 1999, 2016, 1997, 1994, 1995, 1996, 2001, 2000,
       1998, 2014, 2002, 2003, 2005], dtype=int64)

In [103]:
# Replace the numeric release year with a decade label so it can be used as a
# categorical text token later on.
bin_edges = list(range(1970, 2021, 10))  # 1970, 1980, ..., 2020
bin_labels = ['70s', '80s', '90s', '2000s', '2010s']
df['year'] = pd.cut(
    df['year'].astype(int),
    bins=bin_edges,
    labels=bin_labels,
    right=False,  # left-closed bins: 1980 lands in '80s', not '70s'
)
print(df)

                                                name rating      genre  score  \
0                                        The Shining      R      Drama    8.4   
1                                    The Blue Lagoon      R  Adventure    5.8   
2     Star Wars: Episode V - The Empire Strikes Back     PG     Action    8.7   
3                                          Airplane!     PG     Comedy    7.7   
4                                         Caddyshack      R     Comedy    7.3   
...                                              ...    ...        ...    ...   
3822                                   Birthday Girl      R     Comedy    6.1   
3823                                    Human Nature      R     Comedy    6.4   
3824                                            Made      R     Comedy    6.4   
3825                           One Night at McCool's      R     Comedy    6.1   
3826                                  Dr. Dolittle 2     PG     Comedy    4.7   

             director      

In [104]:
# Collapse every descriptive column into one space-separated text field per
# movie, then discard the individual columns so only `name` and `features`
# remain.
# NOTE(review): with CountVectorizer's default tokenizer (used below), values
# such as "8.4" are presumably split into single-character tokens and dropped,
# and multi-word names ("United States") become independent tokens - confirm
# this is the intended feature representation.
feature_cols = ['rating', 'genre', 'score', 'director', 'writer', 'star',
                'company', 'year', 'country']
as_text = df[feature_cols].astype(str)
df['features'] = as_text.apply(' '.join, axis=1)
df.drop(columns=feature_cols, inplace=True)
df

Unnamed: 0,name,features
0,The Shining,R Drama 8.4 Stanley Kubrick Stephen King Jack ...
1,The Blue Lagoon,R Adventure 5.8 Randal Kleiser Henry De Vere S...
2,Star Wars: Episode V - The Empire Strikes Back,PG Action 8.7 Irvin Kershner Leigh Brackett Ma...
3,Airplane!,PG Comedy 7.7 Jim Abrahams Jim Abrahams Robert...
4,Caddyshack,R Comedy 7.3 Harold Ramis Brian Doyle-Murray C...
...,...,...
3822,Birthday Girl,R Comedy 6.1 Jez Butterworth Tom Butterworth N...
3823,Human Nature,R Comedy 6.4 Michel Gondry Charlie Kaufman Tim...
3824,Made,R Comedy 6.4 Jon Favreau Jon Favreau Vince Vau...
3825,One Night at McCool's,R Comedy 6.1 Harald Zwart Stan Seidel Liv Tyle...


In [105]:
# Number of rows whose combined `features` text is missing.
df['features'].isna().sum()

0

In [106]:
# Turn each movie's `features` text into a row of token counts; one column per
# vocabulary term learned from the whole corpus.
vectorizer = CountVectorizer()
bow_features = vectorizer.fit_transform(df['features'])
bow_df = pd.DataFrame(
    data=bow_features.toarray(),  # densify the sparse count matrix
    columns=vectorizer.get_feature_names_out(),
)
print(bow_df)

      13  1492  17  1818  1992  2000  2000s  2010s  21  21st  ...  éditions  \
0      0     0   0     0     0     0      0      0   0     0  ...         0   
1      0     0   0     0     0     0      0      0   0     0  ...         0   
2      0     0   0     0     0     0      0      0   0     0  ...         0   
3      0     0   0     0     0     0      0      0   0     0  ...         0   
4      0     0   0     0     0     0      0      0   0     0  ...         0   
...   ..   ...  ..   ...   ...   ...    ...    ...  ..   ...  ...       ...   
3822   0     0   0     0     0     0      1      0   0     0  ...         0   
3823   0     0   0     0     0     0      1      0   0     0  ...         0   
3824   0     0   0     0     0     0      1      0   0     0  ...         0   
3825   0     0   0     0     0     0      1      0   0     0  ...         0   
3826   0     0   0     0     0     0      1      0   0     0  ...         0   

      édouard  élie  élisabeth  émile  émilie  éric

In [107]:
# Append the token-count columns to the movie frame, side by side.
# NOTE(review): vocabulary terms become column labels here, so a token equal to
# an existing column name (likely "features", e.g. from the company
# "Fine Line Features") would yield duplicate labels - verify, or keep bow_df
# separate since the similarity step only reads bow_df.
df = pd.concat([df, bow_df], axis='columns')

In [108]:
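# Abandoned draft, kept for reference only (never executed). It sketches an
# apparently genre-based similarity table; `genre_matrix` is not defined in the
# visible notebook. The working implementation is `recommend_movies` below.
# def recommend_similar_movies:
#     cosine_sim_genre = cosine_similarity(genre_matrix,genre_matrix)
#     cosine_sim_genre_df = pd.DataFrame(cosine_sim_genre,columns=df['name'],index=df['name'])
#     cosine_sim_genre_df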

In [116]:
# Pairwise cosine similarity between every pair of rows of the count matrix;
# entry [i, j] compares movie i with movie j (by row position in bow_df).
cos_sim_matrix = cosine_similarity(X=bow_df)

In [121]:
def recommend_movies(movie_title, cos_sim_matrix, df, top_n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df['name']``. If several rows share the
        title, the first matching row is used.
    cos_sim_matrix : array-like of shape (n_movies, n_movies)
        Pairwise similarity scores; row/column ``i`` must correspond to the
        ``i``-th row of ``df`` by position.
    df : pandas.DataFrame
        Frame containing a ``name`` column.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``df['name']``, ordered from most to least
        similar. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``df['name']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Work with row *positions*, not index labels: the similarity matrix is
    # addressed positionally, and labels only coincide with positions when
    # ``df`` happens to carry a default RangeIndex.
    match_positions = (df['name'] == movie_title).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"Movie title {movie_title!r} not found in df['name']")
    movie_pos = int(match_positions[0])

    # Exclude the queried movie explicitly rather than assuming it sorts first:
    # another movie with identical features also scores 1.0 and could take the
    # top slot, which would leave the query itself in the results.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cos_sim_matrix[movie_pos])
        if pos != movie_pos
    ]
    # list.sort is stable, so tied scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [pos for pos, _ in candidates[:top_n]]
    return df['name'].iloc[top_positions]

In [122]:
# Example query: list the movies closest to 'Human Nature'.
similar_movies = recommend_movies(
    movie_title='Human Nature',
    cos_sim_matrix=cos_sim_matrix,
    df=df,
)
print(similar_movies)

2111                                          Bob Roberts
3475                                     Cradle Will Rock
3818                                The Anniversary Party
2226                                           Short Cuts
2978                                                Gummo
3633                                       The Ladies Man
2499                                        Little Odessa
2715    The Incredibly True Adventure of Two Girls in ...
2887                                    Feeling Minnesota
3337                                 Being John Malkovich
Name: name, dtype: object
