In [None]:
import pandas as pd
import numpy as np
# Show every column when a wide frame is displayed (None removes the limit).
# Use the canonical option key rather than relying on the regex-style matching
# of option names, where the "." in "max.columns" merely happened to match "_".
pd.set_option('display.max_columns', None)

In [None]:
# All raw CSVs are read from one folder; keeping the location in a single
# constant means it can be re-pointed without editing six separate paths.
RAW_DATA_DIR = '/content/drive/MyDrive/Dataset/tmdb-movie-dataset'

# Tables keyed by movieId.
links = pd.read_csv(f'{RAW_DATA_DIR}/links.csv')
ratings = pd.read_csv(f'{RAW_DATA_DIR}/ratings.csv')
tags = pd.read_csv(f'{RAW_DATA_DIR}/tags.csv')

# Tables keyed by the TMDB id.
metadata = pd.read_csv(f'{RAW_DATA_DIR}/movies_metadata.csv')
keywords = pd.read_csv(f'{RAW_DATA_DIR}/keywords.csv')
credits = pd.read_csv(f'{RAW_DATA_DIR}/credits.csv')

## Dropping movieIds that are missing a tmdbId from links, ratings, tags

In [None]:
# Quick look at the links table: its (rows, columns) shape and its first rows.
links.shape, links.head()

((9742, 3),    movieId  imdbId   tmdbId
 0        1  114709    862.0
 1        2  113497   8844.0
 2        3  113228  15602.0
 3        4  114885  31357.0
 4        5  113041  11862.0)

In [None]:
# movieIds that are dropped by hand in addition to the rows lacking a tmdbId.
# NOTE(review): the reason for excluding these two ids is not recorded in this
# notebook - presumably a known data problem; confirm and document it.
EXTRA_DROPPED_MOVIE_IDS = [6003, 144606]

# movieIds whose tmdbId is missing. `.tolist()` yields plain Python ints, so the
# list stays homogeneous once the hand-picked ids are added and its repr does
# not depend on how the installed NumPy version prints scalar values.
dl_movieid = links.loc[links['tmdbId'].isna(), 'movieId'].tolist()
dl_movieid.extend(EXTRA_DROPPED_MOVIE_IDS)
dl_movieid

[791, 1107, 2851, 4051, 26587, 32600, 40697, 79299, 6003, 144606]

In [None]:
# Missing values per column in links, shown together with the table's shape.
links.isnull().sum(), links.shape

(movieId    0
 imdbId     0
 tmdbId     8
 dtype: int64, (9742, 3))

In [None]:
# Missing values per column in ratings, shown together with the table's shape.
ratings.isnull().sum(), ratings.shape

(userId       0
 movieId      0
 rating       0
 timestamp    0
 dtype: int64, (100836, 4))

In [None]:
# Preview the first rows of the tags table.
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
# Missing values per column in tags.
tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [None]:
# Remove the excluded movieIds (dl_movieid) from every movieId-keyed table.
#
# For links, tmdbId is additionally cast to int. The cast is done with
# `.assign` on the filtered result, which builds a new frame; assigning the
# column directly onto a boolean-filtered slice triggers pandas'
# SettingWithCopyWarning because the slice may be a view or a copy.
links = (
    links.loc[~links['movieId'].isin(dl_movieid)]
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype('int'))
)
ratings = ratings.loc[~ratings['movieId'].isin(dl_movieid)]
tags = tags.loc[~tags['movieId'].isin(dl_movieid)]

## Modifying metadata

In [None]:
# Missing values per column in metadata, inspected before cleaning the table.
metadata.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [None]:
# Columns that are discarded from the metadata table.
UNUSED_METADATA_COLUMNS = [
    'adult', 'belongs_to_collection', 'budget', 'homepage', 'imdb_id',
    'original_language', 'original_title', 'poster_path',
    'production_companies', 'production_countries', 'revenue', 'runtime',
    'spoken_languages', 'status', 'video', 'vote_count',
]

# Only movies released strictly after this year are kept.
RELEASE_YEAR_CUTOFF = 1960

# Clean the metadata in one chain that returns new frames at every step
# (no in-place mutation):
#   1. drop rows without a vote_count, then drop the unused columns;
#   2. drop rows without a release_date or an overview;
#   3. keep the last row for each duplicated id;
#   4. cast id to int, reduce release_date to its four-digit year, and replace
#      a missing tagline with an empty string.
# NOTE(review): vote_count is used only to filter rows before the column itself
# is dropped - presumably rows missing it are malformed; confirm.
metadata = (
    metadata
    .dropna(subset=['vote_count'])
    .drop(columns=UNUSED_METADATA_COLUMNS)
    .dropna(subset=['release_date', 'overview'])
    .drop_duplicates(subset=['id'], keep='last')
    .assign(
        id=lambda frame: frame['id'].astype('int'),
        # The first four characters of release_date are taken as the year.
        release_date=lambda frame: frame['release_date'].str[:4].astype('int'),
        # Fill the Series itself; the original assigned a one-column DataFrame.
        tagline=lambda frame: frame['tagline'].fillna(""),
    )
)

# Remove very old movies and renumber the rows from zero.
metadata = metadata[metadata['release_date'] > RELEASE_YEAR_CUTOFF].reset_index(drop=True)

In [None]:
# Remember the metadata columns so they can be picked back out of the join.
metadata_col = metadata.columns

# Inner join: keep only movies whose links.tmdbId matches a metadata.id.
df = links.merge(metadata, how='inner', left_on='tmdbId', right_on='id')

# metadata now holds just the matched movies, with its original columns.
metadata = df[metadata_col]

In [None]:
# Shapes of the metadata table and of the joined frame df, side by side.
metadata.shape, df.shape

((8833, 8), (8833, 11))

In [None]:
# movieIds in links whose tmdbId does not appear among the metadata ids.
has_metadata = links['tmdbId'].isin(metadata['id'])
unavailable_data_movieid = links.loc[~has_metadata, 'movieId'].values

In [None]:
# Persist the reduced metadata table, leaving the row index out of the file.
metadata_out_path = '/content/drive/MyDrive/Dataset/data/movies_metadata_small.csv'
metadata.to_csv(metadata_out_path, index=False)

## Modifying keywords

In [None]:
# Preview the first rows of the keywords table.
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [None]:
# (rows, columns) of the keywords table as loaded.
keywords.shape

(46419, 2)

In [None]:
# Missing values per column in keywords.
keywords.isnull().sum()

id          0
keywords    0
dtype: int64

In [None]:
# Remember the keywords columns so they can be picked back out of the join.
keywords_col = keywords.columns

# Keep the last row for every duplicated id.
keywords = keywords.drop_duplicates(subset=['id'], keep='last')

# Inner join against df on id, then reduce the result to the keywords columns.
df1 = df.merge(keywords, how='inner', on='id')
keywords = df1[keywords_col]

# Show both shapes so the row counts can be compared.
keywords.shape, df1.shape

((8833, 2), (8833, 12))

In [None]:
# Persist the reduced keywords table, leaving the row index out of the file.
keywords_out_path = '/content/drive/MyDrive/Dataset/data/keywords_small.csv'
keywords.to_csv(keywords_out_path, index=False)

## Modifying credits

In [None]:
# Preview the first rows of the credits table.
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [None]:
# Missing values per column in credits.
credits.isnull().sum()

cast    0
crew    0
id      0
dtype: int64

In [None]:
# Remember the credits columns so they can be picked back out of the join.
credits_col = credits.columns

# Keep the last row for every duplicated id.
credits = credits.drop_duplicates(subset=['id'], keep='last')

# Inner join against df on id, then reduce the result to the credits columns.
df2 = df.merge(credits, how='inner', on='id')
credits = df2[credits_col]

# Show both shapes so the row counts can be compared.
credits.shape, df2.shape

((8833, 3), (8833, 13))

In [None]:
# Persist the reduced credits table, leaving the row index out of the file.
credits_out_path = '/content/drive/MyDrive/Dataset/data/credits_small.csv'
credits.to_csv(credits_out_path, index=False)

## Removing movieIds that have no corresponding TMDB data from links, ratings, tags

In [None]:
# Shapes of links, ratings and tags at this point, for comparison with the
# shapes after the next filtering step.
links.shape, ratings.shape, tags.shape

((9732, 3), (100807, 4), (3682, 4))

In [None]:
# Drop every row whose movieId is listed in unavailable_data_movieid from the
# three movieId-keyed tables.
links = links.loc[~links['movieId'].isin(unavailable_data_movieid)]
ratings = ratings.loc[~ratings['movieId'].isin(unavailable_data_movieid)]
tags = tags.loc[~tags['movieId'].isin(unavailable_data_movieid)]

# Show the resulting shapes.
links.shape, ratings.shape, tags.shape

((8833, 3), (96334, 4), (3291, 4))

In [None]:
# Persist the three filtered tables, leaving the row index out of each file.
for frame, out_path in (
    (links, '/content/drive/MyDrive/Dataset/data/links_small.csv'),
    (ratings, '/content/drive/MyDrive/Dataset/data/ratings_small.csv'),
    (tags, '/content/drive/MyDrive/Dataset/data/tags_small.csv'),
):
    frame.to_csv(out_path, index=False)