In [None]:
# Step 0: imports and Reading Data
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import json

In [None]:
movies_df = pd.read_csv("data/movies_metadata.csv")
credits_df = pd.read_csv("data/credits.csv")
links_sm_df = pd.read_csv("data/links_small.csv")
links_df = pd.read_csv("data/links.csv")
ratings_sm_df = pd.read_csv("data/ratings_small.csv")
ratings_df = pd.read_csv("data/ratings.csv")
tmdb5000_credits_df  = pd.read_csv("data/tmdb_5000_credits.csv")
tmdb5000_movies_df = pd.read_csv("data/tmdb_5000_movies.csv")
pd.set_option('display.max_columns', None)

In [None]:
movies_df.head(4)

In [None]:
# Detect missing values in the movies_df
movies_df.isnull().sum()

In [None]:
# Step 1: Data Understanding
# movies_df.shape                   Return a tuple representing the dimensionality of the DataFrame
# movies_df.head(20)                Shows 20 rows of the DataFrame
# movies_df.dtypes                  This returns a Series with the data type of each column. 
# movies_df.describe()              Generate descriptive statistics.
# movies_df.columns                 Check the columns



In [None]:
# Step 2: Data Preparation
# Rename our columns
# movies_df.rename(columns={"id": "ID"}) # {"Name of column": "Replacement of column"}

# Identify missing values
# movies_df.isna().sum()

# Identify duplicated data
# movies_df.loc[df.duplicated()]

# Check for duplicates with subset
# movies_df.loc[df.duplicated(subset=["original_title"])].head(5)

# Checking an example duplicate
# movies_df.query('original_title == "Cape Fear"')

In [None]:
# Step 3: Feature Understanding

# ax = movies_df["vote_average"].value_counts() \
#     .head(10) \
#     .plot(kind="bar", title="Voters Average")
# ax.set_xlabel("Ratings")
# ax.set_ylabel("Voters Count")

In [None]:
# Step 4: Feauture Relationships

# # sns.pairplot()

movies_df.plot(kind="scatter",
        x="vote_average",
        y = "vote_count",
        title="Voters behavior")
plt.show()

# # sns.scatterplot(x="vote_average",
# #         y = "vote_count",
#         # data=movies_df)      

In [None]:
# Task 1: Transform JSON to String / int
# Task 2: Find relations between the data in the DF
# Task 3: Clean up the data
# Task 4: Visualize the relationships of the data

# The Dataset itself is a mess, very hard to read cause the columns are randomly distributed or placed within the DF
# A lot of missing values / duplicate values
# Figure out what "video" means


In [None]:
# Sorting values - Vote Count to create an assumption of which movies are popular
movies_df.sort_values(by=["vote_count"], ascending=[False]).head(4)

In [None]:
movies_df.sort_values(by=["vote_average"], ascending=[False]).head(4)

In [None]:
movies_df["original_language"].value_counts()

In [None]:
# Delete Columns from the movies_df 
movies_df = movies_df.drop("poster_path", axis=1)

In [28]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [34]:
movies_df["genres"]

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2        [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3        [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                           [{'id': 35, 'name': 'Comedy'}]
                               ...                        
45461    [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...
45462                        [{'id': 18, 'name': 'Drama'}]
45463    [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
45464                                                   []
45465                                                   []
Name: genres, Length: 45466, dtype: object

In [42]:
def parse_json(json_str):
    return json.loads(json_str)

In [45]:
movies_df['parsed_column'] = movies_df['genres'].apply(parse_json)


TypeError: the JSON object must be str, bytes or bytearray, not list

In [None]:
df['parsed_column_as_str'] = df['parsed_column'].apply(lambda x: str(x))
