In [33]:
import pandas as pd

# Load the datasets
links_df = pd.read_csv('./links.csv')
movies_df = pd.read_csv('./movies.csv')
ratings_df = pd.read_csv('./ratings.csv')
tags_df = pd.read_csv('./tags.csv')

# Display the first few rows of each dataframe to understand their structure
print("Links DataFrame:")
print(links_df.head(), "\n")
print("Movies DataFrame:")
print(movies_df.head(), "\n")
print("Ratings DataFrame:")
print(ratings_df.head(), "\n")
print("Tags DataFrame:")
print(tags_df.head(), "\n")


Links DataFrame:
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0 

Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy   

Ratings DataFrame:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0 

In [34]:
# Creating the user-movie ratings matrix
ratings_matrix = ratings_df.pivot_table(index='movieId', columns='userId', values='rating').fillna(0)

# Display the shape of the matrix to understand its size
print("Shape of the user-movie ratings matrix:", ratings_matrix.shape)

# Show the first few rows of the matrix
print(ratings_matrix.head())


Shape of the user-movie ratings matrix: (9724, 610)
userId   1    2    3    4    5    6    7    8    9    10   ...  601  602  603  \
movieId                                                    ...                  
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  0.0  0.0  0.0  ...  4.0  0.0  4.0   
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  0.0  ...  0.0  4.0  0.0   
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

userId   604  605  606  607  608  609  610  
movieId                                     
1        3.0  4.0  2.5  4.0  2.5  3.0  5.0  
2        5.0  3.5  0.0  0.0  2.0  0.0  0.0  
3        0.0  0.0  0.0  0.0  2.0  0.0  0.0  
4        0.0  0.0  0.0  0.0  0.0  0.0  0.0  
5        3.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 610 columns]


In [35]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Convert the ratings matrix to a sparse matrix format
ratings_matrix_sparse = csr_matrix(ratings_matrix.values)

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(ratings_matrix_sparse, ratings_matrix_sparse)

# Convert the cosine similarity matrix to a DataFrame for better readability
cosine_sim_df = pd.DataFrame(cosine_sim, index=ratings_matrix.index, columns=ratings_matrix.index)

# Display the first few rows of the cosine similarity dataframe
print(cosine_sim_df.head())


movieId    1         2         3         4         5         6         7       \
movieId                                                                         
1        1.000000  0.410562  0.296917  0.035573  0.308762  0.376316  0.277491   
2        0.410562  1.000000  0.282438  0.106415  0.287795  0.297009  0.228576   
3        0.296917  0.282438  1.000000  0.092406  0.417802  0.284257  0.402831   
4        0.035573  0.106415  0.092406  1.000000  0.188376  0.089685  0.275035   
5        0.308762  0.287795  0.417802  0.188376  1.000000  0.298969  0.474002   

movieId    8         9         10      ...  193565  193567  193571  193573  \
movieId                                ...                                   
1        0.131629  0.232586  0.395573  ...     0.0     0.0     0.0     0.0   
2        0.172498  0.044835  0.417693  ...     0.0     0.0     0.0     0.0   
3        0.313434  0.304840  0.242954  ...     0.0     0.0     0.0     0.0   
4        0.158022  0.000000  0.095598  ...

In [36]:
def recommend_movies(movie_title, movies_df, cosine_sim_df, num_recommendations=5):
    # Get the movieId for the given movie title
    if movie_title in movies_df['title'].values:
        movie_id = movies_df[movies_df['title'] == movie_title].iloc[0]['movieId']
        # Get the most similar movies
        similar_movies = cosine_sim_df[movie_id].sort_values(ascending=False).iloc[1:num_recommendations+1].index
        # Map the most similar movie IDs back to titles
        recommended_movies = movies_df[movies_df['movieId'].isin(similar_movies)]['title']
        return recommended_movies
    else:
        return "Movie title not found in the dataset."

# Test the function with a movie title
test_movie_title = "Toy Story (1995)"
recommendations = recommend_movies(test_movie_title, movies_df, cosine_sim_df)
print(f"Recommendations for '{test_movie_title}':")
print(recommendations)


Recommendations for 'Toy Story (1995)':
224     Star Wars: Episode IV - A New Hope (1977)
314                           Forrest Gump (1994)
418                          Jurassic Park (1993)
615          Independence Day (a.k.a. ID4) (1996)
2355                           Toy Story 2 (1999)
Name: title, dtype: object


In [37]:
# Encode genres into a binary format
movies_df['genres_list'] = movies_df['genres'].apply(lambda x: x.split('|'))
genre_encoded = movies_df['genres_list'].str.join('|').str.get_dummies()
genre_encoded.index = movies_df['movieId']

print(genre_encoded.head())


         (no genres listed)  Action  Adventure  Animation  Children  Comedy  \
movieId                                                                       
1                         0       0          1          1         1       1   
2                         0       0          1          0         1       0   
3                         0       0          0          0         0       1   
4                         0       0          0          0         0       1   
5                         0       0          0          0         0       1   

         Crime  Documentary  Drama  Fantasy  Film-Noir  Horror  IMAX  Musical  \
movieId                                                                         
1            0            0      0        1          0       0     0        0   
2            0            0      0        1          0       0     0        0   
3            0            0      0        0          0       0     0        0   
4            0            0      1       

In [38]:
# Calculate the cosine similarity for the genre-encoded matrix
genre_cosine_sim = cosine_similarity(genre_encoded, genre_encoded)

# Convert the cosine similarity matrix to a DataFrame for better readability
genre_cosine_sim_df = pd.DataFrame(genre_cosine_sim, index=genre_encoded.index, columns=genre_encoded.index)

# Display the first few rows of the genre cosine similarity dataframe
print(genre_cosine_sim_df.head())


movieId    1         2         3         4         5       6         7       \
movieId                                                                       
1        1.000000  0.774597  0.316228  0.258199  0.447214     0.0  0.316228   
2        0.774597  1.000000  0.000000  0.000000  0.000000     0.0  0.000000   
3        0.316228  0.000000  1.000000  0.816497  0.707107     0.0  1.000000   
4        0.258199  0.000000  0.816497  1.000000  0.577350     0.0  0.816497   
5        0.447214  0.000000  0.707107  0.577350  1.000000     0.0  0.707107   

movieId    8       9         10      ...    193565    193567    193571  \
movieId                              ...                                 
1        0.632456     0.0  0.258199  ...  0.447214  0.316228  0.316228   
2        0.816497     0.0  0.333333  ...  0.000000  0.000000  0.000000   
3        0.000000     0.0  0.000000  ...  0.353553  0.000000  0.500000   
4        0.000000     0.0  0.000000  ...  0.288675  0.408248  0.816497   
5 

In [39]:
from scipy.sparse import csr_matrix

# Convert the genre-encoded DataFrame to a sparse matrix
genre_encoded_sparse = csr_matrix(genre_encoded.values)

# Calculate the cosine similarity for the sparse genre-encoded matrix
genre_cosine_sim_sparse = cosine_similarity(genre_encoded_sparse, genre_encoded_sparse)

# Since handling large sparse matrices directly can be challenging, especially for demonstration purposes,
# we'll proceed with our item-based collaborative recommendation system focusing on rating-based similarities,
# acknowledging the memory limitations encountered here.

# Future steps could involve optimization techniques or working in an environment with more available memory
# to incorporate genre-based similarities effectively.


In [40]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import accuracy

# Load the ratings dataset
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25)

# Use the SVD algorithm
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)


RMSE: 0.8802


0.8802306123440872

In [41]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]
    
    return ' '.join(lemmatized_tokens)

# Apply the preprocessing to the descriptions
movies_df['processed_genres'] = movies_df['genres'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
# Ensure necessary NLTK downloads are completed
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

# Define the sentiment analysis function
def analyze_sentiment(tag_text):
    sid = SentimentIntensityAnalyzer()
    score = sid.polarity_scores(tag_text)
    return score['compound']

# Apply the function to the 'tag' column in the tags_df dataframe
tags_df['sentiment_score'] = tags_df['tag'].apply(analyze_sentiment)

# Display the first few rows to verify the sentiment scores
tags_df.head()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,userId,movieId,tag,timestamp,sentiment_score
0,2,60756,funny,1445714994,0.4404
1,2,60756,Highly quotable,1445714996,0.0
2,2,60756,will ferrell,1445714992,0.0
3,2,89774,Boxing story,1445715207,0.0
4,2,89774,MMA,1445715200,0.0
