In [7]:
%pip install pandas # to work with out data (load, clean, analyze)
%pip install dotenv # to load out enviroment variabels

In [8]:
# pip install pandas
import os # we need this for getting env variables here
import pandas as pd
from  dotenv import load_dotenv 
# Load variables from a local .env file (if any) so os.getenv can see them.
load_dotenv()

# The CSV location comes from the environment, so no local path is hard-coded here.
dataset_path = os.getenv("DATASET_PATH")
if not dataset_path:
    # Fail early with an actionable message instead of the opaque error that
    # pd.read_csv(None) would raise.
    raise ValueError(
        "DATASET_PATH is not set; define it in the environment or in a .env file."
    )

# Load the raw movies dataset.
movies_df = pd.read_csv(dataset_path)

In [9]:
# Preview the first row of the raw dataset.
# (Swap in `movies_df.dtypes` or `movies_df.info()` to inspect column types and null counts.)
movies_df.head(n=1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."


In [10]:
# Keep only the columns the rest of the notebook works with.
selected_columns = [
    "id",                 # unique identifier
    "title",              # movie title
    "overview",           # plot description (text for NLP / similarity)
    "genres",             # main content categories
    "keywords",           # detailed thematic tags
    "tagline",            # short punchline (optional, often missing)
    "vote_average",       # user score, for filtering or ranking
    "vote_count",         # number of votes (popularity signal)
    "release_date",       # enables recency filtering / sorting
    "runtime",            # length, optional for personalization
    "original_language",  # enables filtering by language
    "poster_path",        # for UI display (optional)
    "popularity",         # popularity score
]

# Column subset of the raw frame, in the order listed above.
movies_df_filtered = movies_df[selected_columns]

# Show one row to confirm the selection.
movies_df_filtered.head(n=1)


Unnamed: 0,id,title,overview,genres,keywords,tagline,vote_average,vote_count,release_date,runtime,original_language,poster_path,popularity
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,8.364,34495,2010-07-15,148,en,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,83.952


In [11]:
# Check basic stats to choose a cutoff
#print(movies_df_filtered["popularity"].describe())

In [12]:
# Guard against repeated column labels: keep only the first occurrence of each.
movies_df_filtered = movies_df_filtered.loc[:, ~movies_df_filtered.columns.duplicated()]

# NOTE: no rating filter has been applied yet; at this point `high_rating_movies`
# is just another name for the full column-trimmed frame.
high_rating_movies = movies_df_filtered

# Report what is actually being counted (the previous label claimed "rating > 5.8",
# but the number printed is the size of the unfiltered frame).
print(f"Number of movies before rating/vote filtering: {len(high_rating_movies)}")
high_rating_movies.head(2)

Number of movies with rating > 5.8: 1239509


Unnamed: 0,id,title,overview,genres,keywords,tagline,vote_average,vote_count,release_date,runtime,original_language,poster_path,popularity
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,8.364,34495,2010-07-15,148,en,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,83.952
1,157336,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction","rescue, future, spacecraft, race against time,...",Mankind was born on Earth. It was never meant ...,8.417,32571,2014-11-05,169,en,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,140.241


In [13]:
# Boolean mask marking movies with fewer than 100 votes.
movie_vote_count = high_rating_movies["vote_count"].lt(100)

# Keep only those low-vote rows so they can be inspected.
filtered_movies = high_rating_movies[movie_vote_count]

# Preview the first two matches (use filtered_movies["title"].tolist() to list every title).
filtered_movies.head(n=2)

Unnamed: 0,id,title,overview,genres,keywords,tagline,vote_average,vote_count,release_date,runtime,original_language,poster_path,popularity
18191,138376,A Dark Truth,"In the jungles of Ecuador, blood taints the wa...","Action, Thriller","corruption, water purification, epiphany, ecua...",Escape the jungle. Expose the truth.,5.495,99,2012-11-29,106,en,/je93UwdBdhtHpp9YgaYGnJqnX8G.jpg,8.61
18192,31701,Cop,"An obsessive, insubordinate homicide cop is co...","Crime, Drama, Mystery, Thriller","daughter, high school, prostitute, rape, based...",A killer on the loose. A cop on the edge.,6.2,99,1988-02-05,110,en,/aj3PtDyFjBKfkxglcJp1dpCLEeM.jpg,8.636


In [14]:
# Build one boolean mask per requirement, then keep the rows that satisfy all of them.
rating_above_cutoff = high_rating_movies["vote_average"] > 5.8
has_overview = high_rating_movies["overview"].notnull()
has_genres = high_rating_movies["genres"].notnull()
has_enough_votes = high_rating_movies["vote_count"] >= 100

high_rating_movies = high_rating_movies[
    rating_above_cutoff & has_overview & has_genres & has_enough_votes
]

# Number of movies that survive the filter.
print(len(high_rating_movies))

14637


In [15]:
# Print the complete overview text of the first movie; the table previews truncate it.
# (The bare `movies_df.iloc[0,1]` lookup that used to sit here was dropped: its value
# was discarded, since only a cell's last expression is displayed.)
print(movies_df.loc[0, "overview"])


Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: "inception", the implantation of another person's idea into a target's subconscious.


### Before converting our data into JSON format, we need to clean the data.
- Preview data	df.head(), df.info()
- Remove unnecessary cols	
- Handle missing data	
- Fix types	
- Remove duplicates	
- Normalize values	
- Clean text	
- Verify	
- Save

In [16]:
#high_rating_movies.info()

# Here’s what the info means in terms of nulls:

id — no nulls (all 1,239,509 entries have an id)

title — some nulls, since 1,239,496 < 1,239,509 (so about 13 missing)

overview — lots of nulls, only 974,585 non-null out of 1,239,509 (so about 264,924 missing)

vote_average — only 173,964 available

tagline — even more nulls, only 173,964 non-null (about 1,065,545 missing)

genres — about half missing, 722,288 non-null (about 517,221 missing)



In [17]:
# Overview is a must-have, so rows without one are dropped. The other text
# columns get placeholders instead:
#   tagline -> ''               (lower priority)
#   genres  -> 'unknown'
#   title   -> 'Unknown Title'
high_rating_movies = high_rating_movies.dropna(subset=['overview'])
high_rating_movies = high_rating_movies.assign(
    tagline=high_rating_movies['tagline'].fillna(''),
    genres=high_rating_movies['genres'].fillna('unknown'),
    title=high_rating_movies['title'].fillna('Unknown Title'),
)

# Run high_rating_movies.info() here to re-check the null counts.

In [18]:
#high_rating_movies["genres"]

In [19]:
#high_rating_movies["tagline"]

In [20]:
# Look at one row of the column-trimmed frame again; its genres are still a comma-separated string.
movies_df_filtered.head(n=1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,vote_average,vote_count,release_date,runtime,original_language,poster_path,popularity
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,8.364,34495,2010-07-15,148,en,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,83.952


In [21]:
# str.split(',') splits the genres string at each comma and returns a list of the pieces.
# str.strip() removes any leading and trailing whitespace from each piece.

def clean_genres(genres_str) -> list:
    """Normalize a genres value into a list of lowercase genre names.

    Args:
        genres_str: Usually a comma-separated string such as
            ``"Action, Science Fiction"``. A list or tuple of genre names
            (for example the output of a previous call) is also accepted, so
            applying this function a second time does not wipe the data.

    Returns:
        The stripped, lowercased genre names in their original order. Blank
        entries (from stray or trailing commas) are skipped. An empty list is
        returned for the ``"unknown"`` placeholder, for blank strings, and for
        any other kind of value (``None``, ``NaN``, numbers, ...).
    """
    if isinstance(genres_str, str):
        # "unknown" is the placeholder filled in for missing genres; compare
        # after stripping so padded variants such as " Unknown " match too.
        if genres_str.strip().lower() == "unknown":
            return []
        parts = genres_str.split(',')
    elif isinstance(genres_str, (list, tuple)):
        # Already split: keep the string entries and normalize them again.
        parts = [g for g in genres_str if isinstance(g, str)]
    else:
        # None, NaN and any other non-text value carry no genre information.
        return []

    cleaned = (g.strip().lower() for g in parts)
    # Drop empty pieces such as the one after a trailing comma.
    return [g for g in cleaned if g]

# Replace each genres value with the list produced by clean_genres,
# then look at the last rows to check the result.
high_rating_movies = high_rating_movies.assign(
    genres=high_rating_movies['genres'].apply(clean_genres)
)
high_rating_movies.tail()

Unnamed: 0,id,title,overview,genres,keywords,tagline,vote_average,vote_count,release_date,runtime,original_language,poster_path,popularity
18184,110112,Nitro Circus: The Movie,You will see Travis Pastrana and the whole Nit...,"[action, comedy, documentary]","stunt, motocross, mayhem",,6.41,100,2012-08-08,92,en,/ofDDFfZ40dnGUPyjaDSPwqCQ12K.jpg,6.076
18185,669363,The Man Who Sold His Skin,To be able to travel to Europe and find the lo...,[drama],woman director,What price would you pay for freedom?,6.753,100,2021-03-15,104,ar,/o1wRIwEttuWUByTm1wXsfCstNlh.jpg,6.796
18186,244114,Firestorm,Hong Kong. When Cao Nan and his group of thiev...,"[drama, action, thriller]","robbery, street war, gun battle, rogue cop",,6.165,100,2013-12-12,105,zh,/zKdycnWFGNIALAxEZcTWRtrfWNW.jpg,11.149
18187,603206,Dream Horse,"The inspiring true story of Dream Alliance, an...","[comedy, drama]","wales, sports, horse, racehorse, based on true...",Hearts will race.,7.03,100,2021-05-21,113,en,/uF1mnSdf9EqDIm5XfODAHU6AcWC.jpg,10.19
18190,11083,Kitchen Stories,Swedish efficiency researchers come to Norway ...,"[drama, comedy]","research, friendship, bachelor, cooking, house...",,7.025,100,2004-01-02,95,no,/srIgymX8L1uYlNkRUVbOEYfcGex.jpg,9.637


In [22]:
# Summarize the cleaned genres column: entry count, non-null count, and dtype.
high_rating_movies.loc[:, "genres"].info()

<class 'pandas.core.series.Series'>
Index: 14637 entries, 0 to 18190
Series name: genres
Non-Null Count  Dtype 
--------------  ----- 
14637 non-null  object
dtypes: object(1)
memory usage: 228.7+ KB


# Before converting to JSON, let's understand the relationship between JSON and Python

- `JSON object` → Python dict (mapping from string keys to values)

- `JSON array` → Python list

- `JSON string` → Python str

- `JSON number` → Python int or float

- `JSON boolean` → Python bool

- `JSON null` → Python None

In [23]:

# Convert DataFrame rows to a list of dicts (one dict per movie).
# Missing cells come out of to_dict() as float NaN, which json.dump would write as
# the bare token `NaN` (not valid JSON). Swap them for None so they serialize as
# JSON `null`. `value != value` is true only for NaN.
movies_json_list = [
    {
        key: (None if isinstance(value, float) and value != value else value)
        for key, value in record.items()
    }
    for record in high_rating_movies.to_dict(orient='records')
]

In [24]:
#high_rating_movies[5555]

##### Now our dict object looks a lot cleaner, so let's convert it to `JSON`

In [25]:
import json
# Write the movie records to movies.json as UTF-8, keeping non-ASCII characters
# as-is (ensure_ascii=False) and indenting nested levels by two spaces.
with open('movies.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(movies_json_list, ensure_ascii=False, indent=2))