## Data Ingestion And Transformation

In [1]:
import ast
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [2]:
# Load the movie table (movieId, title, genres) from the local data/ directory.
movie_df = pd.read_csv('data/movie.csv')

In [3]:
# Preview the first rows of the freshly loaded movie table.
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Load the ratings table (userId, movieId, rating, timestamp) from data/.
rating_df = pd.read_csv('data/rating.csv')

In [5]:
# Preview the first rows of the ratings table.
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
# Load the link table that maps each movieId to its imdbId and tmdbId.
link_df = pd.read_csv('data/link.csv')

In [7]:
# Preview the id mapping; tmdbId is read as float (e.g. 862.0).
link_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
# TITLE ANALYSIS: inspect the first 10 distinct titles; each one carries the
# release year in parentheses, e.g. 'Toy Story (1995)'.
movie_df['title'].unique()[:10]

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)',
       'Heat (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)',
       'Sudden Death (1995)', 'GoldenEye (1995)'], dtype=object)

In [9]:
# MODIFY MOVIE-DF: attach imdbId/tmdbId to every movie through an inner join
# with link_df on the shared movieId key.
movie_df = movie_df.merge(link_df, on='movieId')

In [10]:
# Check that imdbId and tmdbId were attached by the merge.
movie_df.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [11]:
# Only the title and the TMDB id are needed from here on.
movie_df = movie_df.loc[:, ['title', 'tmdbId']]

In [12]:
# Rows without a TMDB id cannot be joined later, so they are dropped first;
# the remaining float ids are then stored as integers in `id`, which replaces
# the original `tmdbId` column.
movie_df = (
    movie_df
    .dropna(subset=['tmdbId'])
    .assign(id=lambda frame: frame['tmdbId'].astype(int))
    .drop(columns='tmdbId')
)

movie_df.head()

Unnamed: 0,title,id
0,Toy Story (1995),862
1,Jumanji (1995),8844
2,Grumpier Old Men (1995),15602
3,Waiting to Exhale (1995),31357
4,Father of the Bride Part II (1995),11862


In [13]:
# TMDB DATASET.
import ast
# FUNCTION TO EXTRACT GENRES AND KEYWORD NAMES
def extract_name(obj):
    """Return the ``name`` values from a (possibly stringified) list of dicts.

    Parameters
    ----------
    obj : str, list, tuple, None or float
        Either a list/tuple of dictionaries, or a string holding the Python
        literal of such a list (e.g. ``'[{"id": 28, "name": "Action"}]'``).
        Missing values (``None``, NaN) and blank strings are accepted.

    Returns
    -------
    list
        The ``'name'`` value of every dictionary that has one, in input order.
        An empty list is returned for missing, blank or non-list input.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` is a non-blank string that is not a valid Python literal.
    """
    if isinstance(obj, str):
        # A blank cell carries no records; literal_eval('') would raise.
        if not obj.strip():
            return []
        obj = ast.literal_eval(obj)  # Safe parsing: literals only, no code execution

    # Missing CSV cells arrive as float NaN, which is truthy and not iterable,
    # so a plain emptiness check is not enough to guard the loop below.
    if not isinstance(obj, (list, tuple)):
        return []

    # Skip malformed entries instead of failing the whole row on one bad record.
    return [entry['name'] for entry in obj if isinstance(entry, dict) and 'name' in entry]

In [14]:
# Read the TMDB metadata, turn the stringified genre records into lists of
# genre names, and keep only the columns needed for the join with movie_df.
tmdb_df = (
    pd.read_csv('data/tmdb_5000_movies.csv')
    .assign(genres=lambda frame: frame['genres'].apply(extract_name))
    .loc[:, ['id', 'original_title', 'genres']]
)

In [15]:
# Preview the TMDB table; `genres` now holds a list of genre names per movie.
tmdb_df.head()

Unnamed: 0,id,original_title,genres
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]"
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]"
2,206647,Spectre,"[Action, Adventure, Crime]"
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]"
4,49529,John Carter,"[Action, Adventure, Science Fiction]"


In [16]:
# MERGE TMDB DATAFRAME AND MOVIE-DATAFRAME: inner join on the TMDB `id`, so only
# movies present in both tables survive.
genres_df = movie_df.merge(tmdb_df, on='id')

In [17]:
# Preview the joined table: both the year-suffixed title and TMDB's original_title are present.
genres_df.head()

Unnamed: 0,title,id,original_title,genres
0,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
1,GoldenEye (1995),710,GoldenEye,"[Adventure, Action, Thriller]"
2,"American President, The (1995)",9087,The American President,"[Comedy, Drama, Romance]"
3,Nixon (1995),10858,Nixon,"[History, Drama]"
4,Cutthroat Island (1995),1408,Cutthroat Island,"[Action, Adventure]"


In [18]:
# Replace the year-suffixed title with TMDB's original title, exposed under the
# column name `title`. Resulting column order: id, genres, title.
genres_df = (
    genres_df[['id', 'genres', 'original_title']]
    .rename(columns={'original_title': 'title'})
)

In [19]:
# Confirm the column swap: `title` now holds the TMDB title without the year.
genres_df.head()

Unnamed: 0,id,genres,title
0,862,"[Animation, Comedy, Family]",Toy Story
1,710,"[Adventure, Action, Thriller]",GoldenEye
2,9087,"[Comedy, Drama, Romance]",The American President
3,10858,"[History, Drama]",Nixon
4,1408,"[Action, Adventure]",Cutthroat Island


In [20]:
# CONSIDER ONE GENRE FROM THE LIST OF GENRES: each movie keeps only the first
# genre of its list.
def _first_genre(genre_names):
    """Return the first element of a non-empty list, otherwise None."""
    if isinstance(genre_names, list) and genre_names:
        return genre_names[0]
    return None

genres_df['genres'] = genres_df['genres'].map(_first_genre)

In [21]:
# `genres` is now a single genre name per movie instead of a list.
genres_df.head()

Unnamed: 0,id,genres,title
0,862,Animation,Toy Story
1,710,Adventure,GoldenEye
2,9087,Comedy,The American President
3,10858,History,Nixon
4,1408,Action,Cutthroat Island


In [22]:
# List the distinct primary genres; None marks movies whose genre list was empty.
genres_df['genres'].unique()

array(['Animation', 'Adventure', 'Comedy', 'History', 'Action', 'Drama',
       'Crime', 'Fantasy', 'Music', 'Horror', 'Thriller', 'Romance',
       'Science Fiction', 'Documentary', 'Family', 'War', 'Western',
       'Mystery', None, 'TV Movie', 'Foreign'], dtype=object)

In [23]:
import requests

# FETCH MOVIE POSTERS FROM TMDB-API
def fetch_poster(movie_id):
    """Return the full poster URL of a TMDB movie, or ``None`` if unavailable.

    Parameters
    ----------
    movie_id : int
        TMDB movie id, interpolated into the movie-details endpoint URL.

    Returns
    -------
    str or None
        ``https://image.tmdb.org/t/p/w500<poster_path>`` when the API response
        contains a non-empty ``poster_path``; ``None`` when the request fails,
        times out, returns an HTTP error, is not valid JSON, or the movie has
        no poster.
    """
    import os  # local import keeps this cell runnable on its own

    # Prefer a key supplied through the environment. The literal fallback only
    # keeps existing runs working.
    # FIXME(security): this key is committed with the notebook -- rotate it and
    # remove the fallback.
    api_key = os.environ.get('TMDB_API_KEY') or '7b995d3c6fd91a2284b4ad8cb390c7b8'
    url = f'https://api.themoviedb.org/3/movie/{movie_id}'

    try:
        # A timeout stops one stalled connection from hanging the whole loop
        # that calls this function once per movie.
        response = requests.get(url, params={'api_key': api_key}, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Best effort by design: network/HTTP failures and undecodable bodies
        # mean "no poster", not a crash.
        return None

    # `poster_path` can be missing or null; formatting it blindly would yield
    # the truthy but bogus URL '...w500None' and pass the caller's validity check.
    poster_path = data.get('poster_path') if isinstance(data, dict) else None
    if not poster_path:
        return None

    return f"https://image.tmdb.org/t/p/w500{poster_path}"

In [24]:
# Relies on fetch_poster() from the previous cell.

# Genres taking part in the selection: every distinct primary genre except the
# missing-genre marker (None) and three genres that are left out on purpose.
genres_list = genres_df['genres'].unique()
skipped_genres = ('TV Movie', 'Foreign', 'Western')
keep_genre = np.array(
    [g is not None and g not in skipped_genres for g in genres_list],
    dtype=bool,
)
genres_list = genres_list[keep_genre]

# One list of movie ids per genre, in the order of genres_list.
movies_list = []

for genre in genres_list:
    # Distinct ids of the movies whose primary genre is the current one.
    genre_movies = genres_df.loc[genres_df['genres'] == genre, 'id'].drop_duplicates()

    valid_movies = []
    for movie_id in genre_movies:
        if not fetch_poster(movie_id):
            continue  # no usable poster -> movie is not selected

        valid_movies.append(movie_id)
        if len(valid_movies) >= 50:  # cap of 50 movies per genre reached
            break

    movies_list.append(valid_movies)


In [25]:
# Collapse the per-genre id lists into one flat list of selected ids.
flat_movie_ids = []
for sublist in movies_list:
    flat_movie_ids.extend(sublist)

# Keep only the rows of genres_df that belong to a selected movie.
is_selected = genres_df['id'].isin(flat_movie_ids)
filtered_movies_df = genres_df[is_selected]

# Preview the selection.
filtered_movies_df.head()

Unnamed: 0,id,genres,title
0,862,Animation,Toy Story
1,710,Adventure,GoldenEye
2,9087,Comedy,The American President
10,8012,Comedy,Get Shorty
14,9598,Fantasy,Babe


In [26]:
# (rows, columns) of the selection, i.e. how many movie rows were kept.
filtered_movies_df.shape

(637, 3)

In [27]:
# SAVE DATA-FRAME.
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/filtered_movies.pkl', 'wb') as file:
    pickle.dump(filtered_movies_df, file)

In [28]:
# Synthetic user ids: 'user_1' through 'user_10'.
user_ids = [f"user_{number}" for number in range(1, 11)]

In [29]:
import random

# Dedicated, seeded generator: the synthetic ratings are identical on every
# Restart & Run All, and the global `random` module state is left untouched.
RATING_SEED = 42
rating_rng = random.Random(RATING_SEED)

RATERS_PER_MOVIE = 10             # users drawn (without replacement) per movie
RATING_VALUES = [1, 2, 3, 4, 5]
RATING_WEIGHTS = [1, 2, 3, 3, 4]  # relative odds; higher ratings are more likely

user_ratings = []

# Iterate over distinct ids only: filtered_movies_df is never de-duplicated, so
# it may repeat an id, and a repeated id would give the same user several
# conflicting ratings for one movie.
for movie_id in filtered_movies_df['id'].drop_duplicates():
    # Draw the raters for this movie; never ask for more users than exist
    # (sample() raises ValueError in that case).
    selected_users = rating_rng.sample(user_ids, min(RATERS_PER_MOVIE, len(user_ids)))
    for user_id in selected_users:
        # Weighted random rating between 1 and 5
        rating = rating_rng.choices(RATING_VALUES, weights=RATING_WEIGHTS, k=1)[0]
        user_ratings.append({'user_id': user_id, 'movie_id': movie_id, 'rating': rating})

# Build the frame once from the collected records; explicit columns keep it
# well-formed even when no ratings were generated.
user_ratings_df = pd.DataFrame(user_ratings, columns=['user_id', 'movie_id', 'rating'])

# Rich display of the first rows
user_ratings_df.head()


  user_id  movie_id  rating
0  user_8       862       5
1  user_4       862       4
2  user_9       862       3
3  user_6       862       2
4  user_5       862       5


In [30]:
# (rows, columns) of the ratings table: one row per generated (user, movie) rating.
user_ratings_df.shape

(6370, 3)

In [31]:
# Reshape the long ratings table into a user x movie matrix (rows: user_id,
# columns: movie_id, cells: rating). aggfunc='first' keeps the first rating
# when the same (user_id, movie_id) pair occurs more than once.
pivoted_df = user_ratings_df.pivot_table(index='user_id', columns='movie_id', values='rating', aggfunc='first')

In [32]:
# Preview the user x movie rating matrix (row labels sort as strings, so user_10 follows user_1).
pivoted_df.head()

movie_id,11,12,13,18,38,66,85,89,105,106,...,172391,175574,193613,205596,209451,210577,228165,244403,270946,273895
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_1,2,4,3,2,2,2,5,5,4,4,...,3,1,4,2,4,5,4,3,3,4
user_10,5,4,3,5,4,5,2,4,2,5,...,5,4,5,1,3,1,5,5,5,5
user_2,2,3,5,5,3,5,3,5,2,4,...,3,5,2,5,3,3,5,5,1,4
user_3,5,5,5,4,4,2,2,5,5,2,...,2,3,4,3,5,3,4,5,4,1
user_4,2,5,4,3,2,4,3,2,2,4,...,4,5,4,4,4,2,5,4,2,5


In [33]:
import os

# Persist the user x movie rating matrix. open(..., 'wb') does not create
# missing parent directories, so make sure the output folder exists first.
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/pivoted_df.pkl', 'wb') as file:
    pickle.dump(pivoted_df, file)