In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [5]:
# Load the MovieLens 100K dataset
url = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.item'
columns = ['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
df_movies = pd.read_csv(url, sep='|', names=columns, encoding='latin-1')

# Drop unnecessary columns
df_movies = df_movies[['item_id', 'title', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']]

# Merge movie and ratings data
url_ratings = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data'
columns_ratings = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(url_ratings, sep='\t', names=columns_ratings)

df = pd.merge(df_ratings, df_movies, on='item_id')
df = df.head(10000)

## Example of Linear Kernel in Movie Recommendations

In the context of movie recommendations using content-based filtering with a linear kernel, let's consider a simplified scenario where movies are represented by two features: "Action" and "Comedy". These features indicate whether a movie belongs to the Action or Comedy genre.

Assuming we have two movies:

1. **Movie X:** Action = 1, Comedy = 0
2. **Movie Y:** Action = 0, Comedy = 1

The feature vectors representing the movies are:

X = [1, 0]

Y = [0, 1]

Now, let's calculate the linear kernel between these two movies:

K(X, Y) = X * Y = (1 * 0) + (0 * 1) = 0

In this case, the linear kernel is 0, indicating that the movies are dissimilar in terms of the "Action" and "Comedy" features.

Now, let's consider two other movies:

3. **Movie Z:** Action = 1, Comedy = 1
4. **Movie W:** Action = 1, Comedy = 1

The feature vectors are:

Z = [1, 1]

W = [1, 1]

Now, calculate the linear kernel between these two movies:

K(Z, W) = Z * W = (1 * 1) + (1 * 1) = 2

In this case, the linear kernel is 2, indicating that the movies are more similar in terms of the "Action" and "Comedy" features compared to the previous pair.


In [6]:

# Convert genres to a space-separated string
df['genres'] = df.iloc[:, 2:].apply(lambda x: ' '.join(x.index[x == 1]), axis=1)

# TF-IDF Vectorization of movie genres
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['genres'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
# Function to get movie recommendations based on content
def get_content_based_recommendations(movie_title, cosine_sim_matrix, df_movies):
    # Get the index of the movie in the dataframe
    idx = df_movies.index[df_movies['title'] == movie_title].tolist()[0]

    # Get the pairwise similarity scores with other movies
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the movie indices of the top 10 similar movies
    movie_indices = [i[0] for i in sim_scores[1:11]]

    # Return the top 10 similar movies
    return df_movies['title'].iloc[movie_indices]

In [9]:
# Example: Get content-based recommendations for a movie
movie_to_recommend = 'Get Shorty (1995)'
content_based_recommendations = get_content_based_recommendations(movie_to_recommend, cosine_sim, df_movies)

print(f"Content-based recommendations for '{movie_to_recommend}':")
print(content_based_recommendations)

Content-based recommendations for 'Get Shorty (1995)':
1                                      GoldenEye (1995)
2                                     Four Rooms (1995)
3                                     Get Shorty (1995)
4                                        Copycat (1995)
5     Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6                                 Twelve Monkeys (1995)
7                                           Babe (1995)
8                               Dead Man Walking (1995)
9                                    Richard III (1995)
10                                 Seven (Se7en) (1995)
Name: title, dtype: object
