**Import libraries and Read Data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import json
import re
import nltk

In [None]:
movies_df = pd.read_csv("source_data/movies_metadata.csv")
credits_df = pd.read_csv("source_data/credits.csv")
links_sm_df = pd.read_csv("source_data/links_small.csv")
links_df = pd.read_csv("source_data/links.csv")
ratings_sm_df = pd.read_csv("source_data/ratings_small.csv")
ratings_df = pd.read_csv("source_data/ratings.csv")
tmdb5000_credits_df  = pd.read_csv("source_data/tmdb_5000_credits.csv")
tmdb5000_movies_df = pd.read_csv("source_data/tmdb_5000_movies.csv")
pd.set_option('display.max_columns', None)

**Delete and Renaming the columns**

In [None]:
# Delete Columns "homepage", "from the movies_df 
movies_df.drop(columns=["homepage", "poster_path"], axis=1, inplace=True)

In [None]:
# Rename all columns from movies_df
movies_df.rename(columns={"adult": "Adult", "belongs_to_collection": "Movie_Collection", "budget": "Budget", "genres": "Genres", "id": "ID", "imdb_id": "IMDB_ID", "original_language": "Original_Language", "original_title": "Original_Title", "overview": "Overview", "popularity": "Popularity", "production_companies": "Production_Companies", "production_countries": "Production_Countries", "release_date": "Release_Date", "revenue": "Revenue", "runtime": "Runtime", "spoken_languages": "Spoken_languages", "status":"Status", "tagline":"Tagline", "title":"Title", "video":"Video", "vote_average":"Ratings", "vote_count":"Voters_Count"}, inplace=True)

In [None]:
print(movies_df.loc[0, 'Genres'])


**Parse JSON into String for Genres**

In [5]:
if "Genres" in movies_df.columns:
    genre = movies_df.Genres.iloc[0]
    genre = genre.replace("\'", "\"")
    y = json.loads(genre)
    print(y[0]["id"])
    for item in y: 
        print(item)

In [None]:
def parse_genres(genre_str):
    genre_str = genre_str.replace("\'", "\"")
    genres = json.loads(genre_str)
    ids = [genre["id"] for genre in genres]
    names = [genre["name"] for genre in genres]
    return pd.Series([ids, names], index=["Id", "Genres"])

movies_df[["Id", "Genres"]] = movies_df["Genres"].apply(parse_genres)



In [None]:
movies_df = movies_df.drop(columns=["Id"])

In [None]:
movies_df.head()

**Parse JSON into String for Movies Collection**

In [None]:
if "Movie_Collection" in movies_df.columns:
    movie_collection = movies_df.Movie_Collection.iloc[0]
    movie_collection = movie_collection.replace("\'", "\"")
    y = json.loads(movie_collection)
    print(y)
    for item in y: 
        print(item)

In [None]:
def parse_collection(collection_str):
    if isinstance(collection_str, str):
        try:
            if collection_str.count("{") > collection_str.count("}"):
                collection_str += "}"
            collection_str = collection_str.replace("\'", "\"")
            collection = json.loads(collection_str)
            if isinstance(collection, dict):
                id = collection.get("id")
                name = collection.get("name")
                poster_path = collection.get("poster_path")
                backdrop_path = collection.get("backdrop_path")
                return pd.Series([id, name, poster_path, backdrop_path], index=["Collection_id", "Collection_Name","Poster_Path", "Backdrop_Path"])
        except (json.JSONDecodeError, TypeError):
            pass
    return pd.Series([None, None, None, None], index=["Collection_id", "Collection_Name", "Poster_Path", "Backdrop_Path"])
    
movies_df[["Collection_id", "Collection_Name","Poster_Path", "Backdrop_Path"]] = movies_df["Movie_Collection"].apply(parse_collection)


In [None]:
movies_df = movies_df.drop(columns=['Movie_Collection', 'Collection_id', 'Poster_Path', 'Backdrop_Path'])


In [None]:
cols = list(movies_df.columns)
cols.insert(1, cols.pop(cols.index('Collection_Name')))
movies_df = movies_df[cols]



In [None]:
movies_df.head()

**Parse JSON into String for Production_Companies**

In [None]:
if "Production_Companies" in movies_df.columns:
    production_companies = movies_df.Production_Companies.iloc[0]
    production_companies = production_companies.replace("\'", "\"")
    y = json.loads(production_companies)
    print(y)
    for item in y: 
        print(item)

In [None]:
def parse_prod_companies(companies_str):
    if isinstance(companies_str, str):
        try:
            companies_str = companies_str.replace("\'", "\"")
            companies = json.loads(companies_str)
            if isinstance(companies, list):
                company_names = [company.get("name") for company in companies]
                company_ids = [company.get("id") for company in companies]
                return pd.Series([company_names,company_ids], index=["Company_Names", "Company_Ids"])
        except (json.JSONDecodeError, TypeError, KeyError) as e:
            pass
    return pd.Series([None, None], index=["Company_Names", "Company_Ids"])
    
movies_df[["Company_Names", "Company_Ids"]] = movies_df["Production_Companies"].apply(parse_prod_companies)


In [None]:
movies_df = movies_df.drop(columns=['Company_Ids', "Production_Companies"])


In [None]:
movies_df.rename(columns={"Company_Names":"Production_Companies"}, inplace=True)

In [None]:
cols = list(movies_df.columns)
cols.insert(11, cols.pop(cols.index('Production_Companies')))
movies_df = movies_df[cols]



In [None]:
movies_df.head()

**Parse JSON into String for Production_Country**

In [None]:
if "Production_Countries" in movies_df.columns:
    production_countries = movies_df.Production_Countries.iloc[0]
    production_countries = production_countries.replace("\'", "\"")
    y = json.loads(production_countries)
    print(y)
    for item in y: 
        print(item)

In [None]:
def parse_prod_countries(countries_str):
    if isinstance(countries_str, str):
        try:
            countries_str = countries_str.replace("\'", "\"")
            countries = json.loads(countries_str)
            if isinstance(countries, list):
                country_names = [country.get("name") for country in countries]
                iso_codes = [country.get("iso") for country in countries]
                return pd.Series([country_names,iso_codes], index=["Country_Names", "ISO_Codes"])
        except (json.JSONDecodeError, TypeError, KeyError) as e:
            pass
    return pd.Series([None, None], index=["Country_Names", "ISO_Codes"])
    
movies_df.loc[:, ['Country_Names', 'ISO_Codes']] = movies_df['Production_Countries'].apply(parse_prod_countries)


In [None]:
movies_df = movies_df.drop(columns=['Production_Countries', 'ISO_Codes', 'ISO_codes'])

In [42]:
movies_df = movies_df.drop(columns=['ISO_codes'])

In [None]:
movies_df.rename(columns={"Country_Names":"Production_Country"}, inplace=True)

In [None]:
cols = list(movies_df.columns)
cols.insert(10, cols.pop(cols.index('Production_Country')))
movies_df = movies_df[cols]

In [43]:
movies_df.head()

Unnamed: 0,Adult,Collection_Name,Budget,Genres,ID,IMDB_ID,Original_Language,Original_Title,Overview,Popularity,Production_Country,Production_Companies,Release_Date,Revenue,Runtime,Spoken_languages,Status,Tagline,Title,Video,Ratings,Voters_Count
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[United States of America],[Pixar Animation Studios],1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,[United States of America],"[TriStar Pictures, Teitler Film, Interscope Co...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,[United States of America],"[Warner Bros., Lancaster Gate]",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[United States of America],[Twentieth Century Fox Film Corporation],1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,Father of the Bride Collection,0,[Comedy],11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,[United States of America],"[Sandollar Productions, Touchstone Pictures]",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


**Parse JSON into String for Spoken Languages**

In [44]:
if "Spoken_languages" in movies_df.columns:
    spoken_languages = movies_df.Spoken_languages.iloc[0]
    spoken_languages = spoken_languages.replace("\'", "\"")
    y = json.loads(spoken_languages)
    print(y)
    for item in y: 
        print(item)

[{'iso_639_1': 'en', 'name': 'English'}]
{'iso_639_1': 'en', 'name': 'English'}


In [45]:
def parse_spoken_languages(languages_str):
    if isinstance(languages_str, str):
        try:
            languages_str = languages_str.replace("\'", "\"")
            languages = json.loads(languages_str)
            if isinstance(languages, list):
                language_name = [language.get("name") for language in languages]
                iso_codes = [language.get("iso") for language in languages]
                return pd.Series([language_name,iso_codes], index=["Language_Name", "ISO_Codes"])
        except (json.JSONDecodeError, TypeError, KeyError) as e:
            pass
    return pd.Series([None, None], index=["Language_Name", "ISO_Codes"])
    
movies_df.loc[:, ['Language_Name', 'ISO_codes']] = movies_df['Spoken_languages'].apply(parse_spoken_languages)


In [46]:
movies_df = movies_df.drop(columns=['Spoken_languages'])
movies_df.rename(columns={"Language_Name":"Spoken_Languages"}, inplace=True)
cols = list(movies_df.columns)
cols.insert(15, cols.pop(cols.index('Spoken_Languages')))
movies_df = movies_df[cols]

In [65]:
movies_df.head(5)

Unnamed: 0,Adult,Collection_Name,Budget,Genres,ID,IMDB_ID,Original_Language,Original_Title,Overview,Popularity,Production_Country,Production_Companies,Release_Date,Revenue,Runtime,Spoken_Languages,Status,Tagline,Title,Video,Ratings,Voters_Count
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[United States of America],[Pixar Animation Studios],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,[United States of America],"[TriStar Pictures, Teitler Film, Interscope Co...",1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,[United States of America],"[Warner Bros., Lancaster Gate]",1995-12-22,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[United States of America],[Twentieth Century Fox Film Corporation],1995-12-22,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,Father of the Bride Collection,0,[Comedy],11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,[United States of America],"[Sandollar Productions, Touchstone Pictures]",1995-02-10,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


**Handling Missing Values**

In [58]:
movies_df = movies_df.drop(columns=['ISO_codes'])

In [59]:
movies_df.isnull().sum()

Adult                       0
Collection_Name         42298
Budget                      0
Genres                      0
ID                          0
IMDB_ID                    17
Original_Language          11
Original_Title              0
Overview                  954
Popularity                  5
Production_Country         11
Production_Companies      490
Release_Date               87
Revenue                     6
Runtime                   263
Spoken_Languages           32
Status                     87
Tagline                 25054
Title                       6
Video                       6
Ratings                     6
Voters_Count                6
dtype: int64

In [63]:
movies_df.dropna(subset=['Collection_Name', "IMDB_ID", "Original_Language", "Overview", "Production_Companies", "Tagline"])

Unnamed: 0,Adult,Collection_Name,Budget,Genres,ID,IMDB_ID,Original_Language,Original_Title,Overview,Popularity,Production_Country,Production_Companies,Release_Date,Revenue,Runtime,Spoken_Languages,Status,Tagline,Title,Video,Ratings,Voters_Count
2,False,Grumpy Old Men Collection,0,"[Romance, Comedy]",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,[United States of America],"[Warner Bros., Lancaster Gate]",1995-12-22,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
4,False,Father of the Bride Collection,0,[Comedy],11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,[United States of America],"[Sandollar Productions, Touchstone Pictures]",1995-02-10,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
9,False,James Bond Collection,58000000,"[Adventure, Action, Thriller]",710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,"[United Kingdom, United States of America]","[United Artists, Eon Productions]",1995-11-16,352194034.0,130.0,"[English, Pусский, Español]",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0
12,False,Balto Collection,0,"[Family, Animation, Adventure]",21032,tt0112453,en,Balto,An outcast half-wolf risks his life to prevent...,12.140733,[United States of America],"[Universal Pictures, Amblin Entertainment, Amb...",1995-12-22,11348324.0,78.0,[English],Released,Part Dog. Part Wolf. All Hero.,Balto,False,7.1,423.0
18,False,Ace Ventura Collection,30000000,"[Crime, Comedy, Adventure]",9273,tt0112281,en,Ace Ventura: When Nature Calls,"Summoned from an ashram in Tibet, Ace finds hi...",8.205448,[United States of America],"[O Entertainment, Warner Bros., Morgan Creek P...",1995-11-10,212385533.0,90.0,[English],Released,New animals. New adventures. Same hair.,Ace Ventura: When Nature Calls,False,6.1,1128.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44924,False,Gor Collection,0,"[Action, Adventure, Science Fiction, Fantasy]",86337,tt0095241,en,Gor,American professor Tarl Cabot is transported v...,1.237557,[United States of America],[Cannon International],1987-05-08,0.0,94.0,"[English, Magyar]",Released,A new dimension in fantasy.,Gor,False,2.9,15.0
45240,False,Schoolgirl Report Collection,0,"[Drama, Romance]",70609,tt0070649,de,Schulmädchen-Report 5. Teil: Was Eltern wirkli...,Seven more first-hand accounts of sexual awake...,3.484932,[Germany],[],1973-04-13,0.0,86.0,[Deutsch],Released,"Young, Willing and Oh So Eager to Please!",Schoolgirl Report Part 5: What All Parents Sho...,False,6.3,6.0
45241,False,Schoolgirl Report Collection,0,"[Comedy, Drama]",36886,tt0069234,de,Schulmädchen-Report 4. Teil: Was Eltern oft ve...,A fake documentary about the sex lives of teen...,2.198993,[Germany],[Rapid Film],1972-09-28,0.0,88.0,[Deutsch],Released,Straight A's all the way ... But not in the cl...,Schoolgirl Report Part 4: What Drives Parents ...,False,4.9,4.0
45353,False,Frankenstein (Hammer Series),0,"[Horror, Science Fiction]",3104,tt0061683,en,Frankenstein Created Woman,A deformed tormented girl drowns herself after...,2.302582,[United Kingdom],[Hammer Film Productions],1967-03-15,0.0,92.0,[English],Released,Now Frankenstein has created a beautiful woman...,Frankenstein Created Woman,False,5.9,33.0


In [64]:
movies_df.isnull().sum()

Adult                       0
Collection_Name         42298
Budget                      0
Genres                      0
ID                          0
IMDB_ID                    17
Original_Language          11
Original_Title              0
Overview                  954
Popularity                  5
Production_Country         11
Production_Companies      490
Release_Date               87
Revenue                     6
Runtime                   263
Spoken_Languages           32
Status                     87
Tagline                 25054
Title                       6
Video                       6
Ratings                     6
Voters_Count                6
dtype: int64

**Sorting values to see potential insights of the dataset**

In [None]:
# Sorting values - Vote Count to create an assumption of which movies are popular
movies_df.sort_values(by=["vote_count"], ascending=[False]).head(4)

In [None]:
movies_df.sort_values(by=["vote_average"], ascending=[False]).head(4)

In [None]:
movies_df["original_language"].value_counts()

**Converting Data type of the columns to the right type**

In [None]:
movies_df.dtypes

In [None]:
movies_df["Budget"] = pd.to_numeric(movies_df["Budget"], errors="coerce")

In [None]:
movies_df.dtypes["Budget"]

**Getting Information about our Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column and Memory Requirement**

In [None]:
movies_df.info()

In [None]:
sns.heatmap(movies_df.isnull())

In [None]:
per_missing = movies_df.isnull().sum() * 100 / len(movies_df)
per_missing

**Drop All The Missing Values**

In [None]:
movies_df["Original_Language"].dropna(axis=0)

**Check for Duplicate Data**

In [None]:
dup_data = movies_df.duplicated().any()

In [None]:
print("Are there any duplicates values?", dup_data)

**Get Overall Statistics About The DataFrame**

In [None]:
movies_df.describe(include="all")

**Which language has the highest average Voting**

In [None]:
movies_df.columns

In [None]:
# Present Top Languages
language_counts = movies_df["Original_Language"].value_counts()

# Display the Top 10 results
print("\nTop 10 Languages:")
print(language_counts.head(10))

In [None]:
# Filter for the languages "en", "ger", "fr", "esp"
top_languages = ["en", "de", "fr", "it", "ja", "es"]

# Filter the DataFrame
filtered_df = movies_df[movies_df["Original_Language"].isin(top_languages)]

In [None]:
# Calculate the average ratings for each language
average_ratings = filtered_df.groupby('Original_Language')['Ratings'].mean()
print(average_ratings)

In [None]:
# Create the Bar Plot
plt.figure(figsize=(10,5))
sns.barplot(x="Original_Language", y="Ratings", data=filtered_df)
plt.title("Ratings for selected Languages (en, de, fr, it, ja, es)")
plt.show()

**Create a model to filter Keywords based on ratings**

In [48]:
movies_df.head(2)

Unnamed: 0,Adult,Collection_Name,Budget,Genres,ID,IMDB_ID,Original_Language,Original_Title,Overview,Popularity,Production_Country,Production_Companies,Release_Date,Revenue,Runtime,Spoken_Languages,Status,Tagline,Title,Video,Ratings,Voters_Count,ISO_codes
0,False,Toy Story Collection,30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[United States of America],[Pixar Animation Studios],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,False,7.7,5415.0,
1,False,,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,[United States of America],"[TriStar Pictures, Teitler Film, Interscope Co...",1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,
