<a href="https://www.kaggle.com/code/shoaibrkhan/data-expedition-movies?scriptVersionId=144092226" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import re
from ast import literal_eval

import warnings

warnings.filterwarnings("ignore")


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# `movies` stays as the untouched raw load; `df` is the working frame.
# pd.DataFrame(movies) does not copy its input, so use an explicit copy to
# make sure later edits to `df` can never leak back into `movies`.
movies = pd.read_csv("/kaggle/input/movies-dataset-for-feature-extracion-prediction/movies.csv")
df = movies.copy()
print(movies.shape)
df

In [None]:
movies.info()

In [None]:
movies.isnull().sum()

# Feature Extraction

In [None]:
# Gross is almost entirely null (see the null counts above), so the column is removed.
df = df.drop(columns=['Gross'])

### Check For Duplicate Movies

In [None]:
# Trim titles first so stray whitespace does not hide repeated movies.
df['MOVIES'] = df['MOVIES'].str.strip()
# keep=False flags every row whose title occurs more than once.
title_is_repeated = df.duplicated(subset=['MOVIES'], keep=False)
duplicates = df[title_is_repeated]
duplicates

In [None]:
unique_movies_from_duplicates = duplicates['MOVIES'].unique()
count_unique_movies_from_duplicates = len(unique_movies_from_duplicates)
count_unique_movies_from_duplicates

In [None]:
duplicates_count = df.duplicated(subset=['MOVIES']).sum()
print(f'Total duplicates: {duplicates_count}')

### Removing Duplicates

In [None]:
# The raw data contains many repeated titles; keep one row per title.

# True for rows that have at least one of RunTime / YEAR.
# Rows where both are missing are dropped below, even if the title is unique.
mask_non_null = df[['RunTime', 'YEAR']].notna().any(axis=1)

# Sort so that, within a title, rows with YEAR / RunTime present come first
# and are the ones kept by the first-occurrence rule.
df = df.sort_values(by=['YEAR', 'RunTime'], na_position='last')
first_row_of_title = ~df.duplicated(subset=['MOVIES'], keep='first')

movies_df = df[first_row_of_title & mask_non_null].reset_index(drop=True)
movies_df

In [None]:
# Text columns contain embedded '\n' characters; remove them.
for text_col in ('GENRE', 'ONE-LINE'):
    movies_df[text_col] = movies_df[text_col].str.replace('\n', '')
# STARS is trimmed at both ends before its newlines are removed.
movies_df['STARS'] = movies_df['STARS'].str.strip().str.replace('\n', '')
movies_df

In [None]:
# The STARS column holds both the director(s) and the stars; split them into separate values
def extract_names(row):
    """Split a combined credits string into (directors, stars).

    The text after each "Director:"/"Directors:" label and each
    "Star:"/"Stars:" label is captured up to the next '|' separator.
    Multiple captures for the same label are joined with ', '.

    Parameters
    ----------
    row : str
        Credits text, e.g. "Director:Jane Doe| Stars:A, B".
        Anything that is not a string (NaN, None) is treated as "no credits".

    Returns
    -------
    tuple of (str, str)
        (directors, stars); each is '' when the label is absent.
    """
    # Missing values arrive as float NaN / None; re.findall would raise on them.
    if not isinstance(row, str):
        return '', ''
    directors = ', '.join(
        name.strip() for name in re.findall(r'Directors?:\s*([^|]+)', row)
    )
    stars = ', '.join(
        name.strip() for name in re.findall(r'Stars?:\s*([^|]+)', row)
    )
    return directors, stars

# Split STARS into DIRECTOR and STARS columns.
# Building one DataFrame from the list of (directors, stars) tuples avoids
# constructing a separate pd.Series object for every row.
name_pairs = pd.DataFrame(
    movies_df['STARS'].map(extract_names).tolist(),
    index=movies_df.index,
    columns=['DIRECTOR', 'STARS'],
)

movies_df['DIRECTOR'] = name_pairs['DIRECTOR'].str.strip()
movies_df['STARS'] = name_pairs['STARS'].str.strip()

movies_df

In [None]:
# Remove any leftover "Director:" / "Directors:" / "Stars:" labels from the extracted text.
movies_df['DIRECTOR'] = (
    movies_df['DIRECTOR']
    .str.replace('Director:', '')
    .str.replace('Directors:', '')
)
movies_df['STARS'] = movies_df['STARS'].str.replace('Stars:', '')

movies_df

In [None]:
movies_df.isnull().sum()

### Handling missing values in Votes and Rating with Grouping Director's mean

In [None]:
# VOTES is text with thousands separators ("12,345"); convert it to a number.
movies_df['VOTES'] = movies_df['VOTES'].str.replace(',', '').astype(float)

# Per-row mean of RATING / VOTES over all titles sharing the same DIRECTOR value.
mean_ratings = movies_df.groupby('DIRECTOR')['RATING'].transform('mean')
mean_votes = movies_df.groupby('DIRECTOR')['VOTES'].transform('mean')

# Fill missing RATING / VOTES with the director-level mean.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment and does nothing under
# pandas Copy-on-Write.
movies_df['RATING'] = movies_df['RATING'].fillna(mean_ratings)
movies_df['VOTES'] = movies_df['VOTES'].fillna(mean_votes)

# Round both columns to whole numbers (kept as float because NaN may remain).
movies_df['VOTES'] = movies_df['VOTES'].round().astype(float)
movies_df['RATING'] = movies_df['RATING'].round().astype(float)

movies_df.isnull().sum()

#### The rows with missing years cannot simply be removed, so we bring in related datasets, match them on the movie title, and use them to fill the gaps

In [None]:
movies_df[pd.isna(movies_df['YEAR'])]

In [None]:
# `imdb_movies` is the raw load; `imdb_df` is an independent working copy
# (pd.DataFrame(imdb_movies) would not copy the data).
imdb_movies = pd.read_csv("/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv")
imdb_df = imdb_movies.copy()
print(imdb_movies.shape)
imdb_movies.head()

In [None]:
#the good thing is that, this data got 1000 movies as well and 1000 non null values in Released Year
#let's see how this dataset can help us
imdb_movies.info()

In [None]:
#removing start and end spaces for movies in both dataset
movies_df['MOVIES'] = movies_df['MOVIES'].str.strip()
imdb_df['Series_Title'] = imdb_df['Series_Title'].str.strip()

In [None]:
# Attach the IMDB columns to every movie whose title matches exactly (left join keeps all movies).
imdb_columns = ['Series_Title', 'Released_Year', 'Runtime', 'Genre', 'IMDB_Rating', 'No_of_Votes']
movies_df = movies_df.merge(
    imdb_df[imdb_columns],
    how='left',
    left_on='MOVIES',
    right_on='Series_Title',
)
movies_df

In [None]:
#removing min from Runtime col
movies_df['Runtime'] = movies_df['Runtime'].str.replace(' min', '')
movies_df

# genre_info = movies_df[(movies_df['Genre'].notna()) & (movies_df['GENRE'].isna())]
# genre_info

In [None]:
# Target column in movies_df -> IMDB column to copy from.
fill_from_imdb = {
    'GENRE': 'Genre',
    'RATING': 'IMDB_Rating',
    'VOTES': 'No_of_Votes',
    'YEAR': 'Released_Year',
    'RunTime': 'Runtime',
}

# For each pair, copy the IMDB value only where our value is missing
# and the IMDB value is present.
for col, copy_col in fill_from_imdb.items():
    mask = movies_df[copy_col].notna() & movies_df[col].isna()
    movies_df.loc[mask, col] = movies_df.loc[mask, copy_col]

In [None]:
movies_df.isnull().sum()

In [None]:
# The IMDB helper columns have served their purpose; remove them again.
imdb_helper_columns = ['Series_Title', 'Released_Year', 'Runtime', 'Genre', 'IMDB_Rating', 'No_of_Votes']
movies_df = movies_df.drop(columns=imdb_helper_columns)
movies_df

In [None]:
# `titles` is the raw load; `titles_df` is an independent working copy
# (pd.DataFrame(titles) would not copy the data).
titles = pd.read_csv("/kaggle/input/netflix-tv-shows-and-movies/titles.csv")
titles_df = titles.copy()
print(titles.shape)
titles.head()

In [None]:
# `genres` is stored as the text of a Python list; turn it into a comma-separated string.
# literal_eval only accepts Python literals, so unlike eval() it cannot
# execute arbitrary code that might be embedded in the CSV.
titles_df['genres'] = titles_df['genres'].apply(lambda x: ', '.join(literal_eval(x)))
titles_df

In [None]:
titles_df.info()

In [None]:
titles_df['title'] = titles_df['title'].str.strip()
movies_df['MOVIES'] = movies_df['MOVIES'].str.strip()

In [None]:
# A left merge emits one output row per matching right-hand row, so repeated
# titles on the right would multiply movies. Keep a single row per title on
# the right so the merge cannot change the number of rows.
original_rows = len(movies_df)
titles_columns = ['title', 'release_year', 'runtime', 'genres', 'imdb_score', 'imdb_votes']
titles_unique = titles_df[titles_columns].drop_duplicates(subset='title', keep='first')
movies_df = pd.merge(
    movies_df,
    titles_unique,
    left_on='MOVIES',
    right_on='title',
    how='left',
    validate='many_to_one',  # fail loudly if the right side is not unique per title
)
new_rows = len(movies_df)
movies_df

In [None]:
# Remove the extra rows that the merge created for titles matching more than
# one row on the right. A left merge keeps the left frame's row order, so those
# extras are interleaved with the other rows rather than appended after
# position `original_rows`; slicing the tail would discard unrelated movies.
# Keeping the first row per title removes exactly the surplus rows.
movies_df = movies_df.drop_duplicates(subset=['MOVIES'], keep='first')
movies_df

In [None]:
# Convert release_year (float with NaN) into the same "(YYYY)" text format the
# YEAR column already uses, so values copied into YEAR behave like the rest
# (string extraction and comparisons such as == "(2020)" keep working).
def _format_release_year(value):
    """Return '(YYYY)' for a numeric year, pd.NA for missing, strings unchanged."""
    if pd.isna(value):
        return pd.NA
    if isinstance(value, str):
        # Already formatted, e.g. when this cell is run a second time.
        return value
    return f'({int(value)})'

movies_df['release_year'] = movies_df['release_year'].apply(_format_release_year).astype('object')
movies_df

In [None]:
# Target column in movies_df -> titles column to copy from.
fill_from_titles = {
    'GENRE': 'genres',
    'RATING': 'imdb_score',
    'VOTES': 'imdb_votes',
    'RunTime': 'runtime',
    'YEAR': 'release_year',
}

# For each pair, copy the titles value only where our value is missing
# and the titles value is present.
for col, copy_col in fill_from_titles.items():
    mask = movies_df[copy_col].notna() & movies_df[col].isna()
    movies_df.loc[mask, col] = movies_df.loc[mask, copy_col]

In [None]:
movies_df.info()

In [None]:
movies_df.isnull().sum()

In [None]:
# Remove the helper columns that came from the titles dataset.
titles_helper_columns = ['title', 'release_year', 'runtime', 'genres', 'imdb_score', 'imdb_votes']
movies_df = movies_df.drop(columns=titles_helper_columns)
movies_df

In [None]:
# Overall means, used for rows no earlier step could fill.
# Note: `mean_votes` is rebound here from a per-row Series to a single number.
mean_rating = movies_df['RATING'].mean()
mean_votes = movies_df['VOTES'].mean()

# Assign the filled columns back; fillna(inplace=True) on a column selection
# is chained assignment and does nothing under pandas Copy-on-Write.
movies_df['RATING'] = movies_df['RATING'].fillna(mean_rating)
movies_df['VOTES'] = movies_df['VOTES'].fillna(mean_votes)

# Missing (null) GENRE values become the label 'Unknown'.
movies_df['GENRE'] = movies_df['GENRE'].fillna('Unknown')
movies_df

In [None]:
movies_df.isnull().sum()

#### We successfully filled in some of the missing years for titles that matched exactly on MOVIES
#### Fuzzy matching could recover more titles from MOVIES, but it is trickier to get right

In [None]:
movies_df.isnull().sum()

In [None]:
nan_years = movies_df[movies_df['YEAR'].isnull() | (movies_df['YEAR'] == '')]
nan_years

In [None]:
# checking duplicate again
duplicate_rows = movies_df[movies_df.duplicated(subset=['MOVIES'], keep=False)]
duplicate_rows

In [None]:
# Keep only the first row for each (trimmed) title.
movies_df['MOVIES'] = movies_df['MOVIES'].str.strip()
later_occurrence = movies_df.duplicated(subset=['MOVIES'], keep='first')
movies_df = movies_df[~later_occurrence]
# Verify: this should now be empty.
still_repeated = movies_df.duplicated(subset=['MOVIES'], keep=False)
duplicate_rows = movies_df[still_repeated]
duplicate_rows

In [None]:
# Number of rows per title, as a two-column frame (Movie, Count).
movie_counts = movies_df['MOVIES'].value_counts()
movie_counts_df = movie_counts.rename_axis('Movie').reset_index(name='Count')
movie_counts_df.head(10)

In [None]:
movies_df.isnull().sum()

In [None]:
nan_years = movies_df[movies_df['YEAR'].isnull()]
nan_years

In [None]:
# Fill missing RunTime with the mean runtime of titles sharing the same
# DIRECTOR value; with 500+ missing values, filling with 0 would distort the data.

# RunTime may hold text copied in from the other datasets; coerce to numbers.
movies_df['RunTime'] = pd.to_numeric(movies_df['RunTime'], errors='coerce')

# Per-row mean runtime over the row's DIRECTOR group.
mean_runtime = movies_df.groupby('DIRECTOR')['RunTime'].transform('mean')

# Fill with the runtime mean (previously the director's mean *rating* was
# used here by mistake), then round to whole minutes. Kept as float because
# NaN can remain when a whole group has no runtime.
movies_df['RunTime'] = movies_df['RunTime'].fillna(mean_runtime).round().astype(float)

movies_df.isnull().sum()

#### If YEAR and RunTime still contain null values, we handle them with the cell below

In [None]:
# Remaining missing years get the placeholder "(2099)", an obvious outlier
# that later steps can filter out.
# Assign back instead of fillna(inplace=True) on a column selection, which
# does nothing under pandas Copy-on-Write.
movies_df['YEAR'] = movies_df['YEAR'].fillna("(2099)")

# The few remaining missing runtimes get the overall mean runtime.
movies_df['RunTime'] = movies_df['RunTime'].fillna(movies_df['RunTime'].mean())

In [None]:
# Round RATING, VOTES and RunTime to whole numbers and store them as integers.
for numeric_col in ('RATING', 'VOTES', 'RunTime'):
    movies_df[numeric_col] = movies_df[numeric_col].round().astype(int)
movies_df

In [None]:
movies_df.isnull().sum()

### Adding Type Column where we can identify a type of movie as Movie or Show

In [None]:
# Regular expression for a single-year YEAR value: "(YYYY)", "(YYYY-YYYY)"
# with a plain hyphen, or a bare "YYYY" at the start of the text.
pattern = r'\(\d{4}(-\d{4})?\)|\d{4}'

def classify_movie_or_show(year):
    """Label a YEAR value as 'Movie' or 'Show'.

    Parameters
    ----------
    year : object
        A YEAR cell; it is converted with str() before matching.

    Returns
    -------
    str
        'Movie' when the value is not missing and its text matches
        `pattern` at the start; 'Show' otherwise (including missing values).
    """
    # re.match anchors at the start of the text, like Series.str.match, but
    # avoids building a one-element pd.Series for every value.
    if pd.notna(year) and re.match(pattern, str(year)):
        return 'Movie'
    return 'Show'

movies_df['Type'] = movies_df['YEAR'].apply(classify_movie_or_show)
movies_df

In [None]:
# Override Type for rows whose GENRE lists Drama, Documentary or Show
def update_type(row):
    """Return 'Show' when the row's GENRE lists a show-like genre, else its current Type.

    GENRE is split on commas and each piece is trimmed; the row counts as a
    show if any piece is exactly 'Drama', 'drama', 'Documentary' or 'Show'.
    """
    show_like = {'Drama', 'drama', 'Documentary', 'Show'}
    listed = {piece.strip() for piece in row['GENRE'].split(',')}
    return 'Show' if listed & show_like else row['Type']

movies_df['Type'] = movies_df.apply(update_type, axis=1)
movies_df

In [None]:
# Look for outliers in RunTime among rows typed as 'Movie', using a boxplot.
movies_runtime = movies_df[movies_df['Type'] == 'Movie']
_, runtime_ax = plt.subplots(figsize=(10, 5))
runtime_ax.boxplot(movies_runtime['RunTime'], vert=False, patch_artist=True)
runtime_ax.set_title('Boxplot of RunTime')
runtime_ax.set_xlabel('RunTime')
plt.show()

In [None]:
movie_runt = movies_df[(movies_df['RunTime'] < 20) & (movies_df['Type'] == 'Movie')]
movie_runt

In [None]:
# A 'Movie' shorter than 20 minutes is treated as bad data and replaced with
# the mean runtime of its Type group.
mean_runtimes_by_type = movies_df.groupby('Type')['RunTime'].transform('mean')

too_short_movie = (movies_df['Type'] == 'Movie') & (movies_df['RunTime'] < 20)
movies_df.loc[too_short_movie, 'RunTime'] = mean_runtimes_by_type[too_short_movie]

# The means are fractional; bring the column back to whole minutes.
movies_df['RunTime'] = movies_df['RunTime'].round().astype(int)
movies_df

# Exploratory Data Analysis

### Highest Rating

In [None]:
top_10_movies = movies_df.sort_values(by='RATING', ascending=False).head(10)
# top_10_movies = movies_df.nlargest(10, 'RATING')
top_10_movies

### Most Votes on Movies

In [None]:
# Ten titles with the most votes; VOTES is shown with thousands separators.
new_df = movies_df.copy()
top_n_rows = new_df.nlargest(10, 'VOTES')
top_n_rows['VOTES'] = top_n_rows['VOTES'].map('{:,}'.format)
top_n_rows

## Finding Most Rated Movies By Year

In [None]:
# Trim GENRE, split each cell on ', ' into individual genres, and count them.
movies_df['GENRE'] = movies_df['GENRE'].str.strip()
unique_genres = movies_df['GENRE'].str.split(', ').explode()
genre_counts = (
    unique_genres
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
)
genre_counts

### Most Movies by Director

In [None]:
# One entry per individual director (cells can list several, separated by ', ').
# Uses its own name instead of overwriting `unique_genres` with director names.
director_names = movies_df['DIRECTOR'].str.split(', ').explode()
# Titles without a director carry an empty string; leave those out of the ranking.
director_counts = (
    director_names[director_names != '']
    .value_counts()
    .rename_axis('DIRECTOR')
    .reset_index(name='Count')
)
director_counts.head()

### Stars Appeared the Most

In [None]:
# One entry per individual star (cells list several, separated by ', ').
# Uses its own name instead of overwriting `unique_genres` with star names.
star_names = movies_df['STARS'].str.split(', ').explode()
# Titles without listed stars carry an empty string; leave those out of the ranking.
stars_counts = (
    star_names[star_names != '']
    .value_counts()
    .rename_axis('STARS')
    .reset_index(name='Count')
)
stars_counts.head()

In [None]:
movies_df.isnull().sum()

In [None]:
#getting yearly highest rating movies
specific_year_data = movies_df[movies_df['YEAR'] == "(2020)"].sort_values(by='RATING', ascending=False).head(10)
specific_year_data

# Data Visualization

In [None]:
sns.histplot(movies_df['RATING'])
plt.xlabel('RATING')
plt.title('Distribution of RATING')
plt.show()

In [None]:
df_year = movies_df.copy()

# Work on the text form so YEAR values that are not strings are still parsed.
year_text = df_year['YEAR'].astype(str)
# First four-digit number = start year; second number of a "YYYY–YYYY" range = end year.
# 0 marks "not present" (e.g. single-year titles have no end year).
df_year['RELEASE_YEAR_START'] = year_text.str.extract(r'(\d{4})')[0].fillna('0').astype(int)
df_year['RELEASE_YEAR_END'] = year_text.str.extract(r'(\d{4})–(\d{4})')[1].fillna('0').astype(int)

# Keep 2000..2023 only. The upper bound on the start year also removes
# placeholder years such as 2099.
in_window = df_year['RELEASE_YEAR_START'].between(2000, 2023) & (df_year['RELEASE_YEAR_END'] <= 2023)
df_year = df_year[in_window]

plt.figure(figsize=(10, 6))
sns.countplot(
    data=df_year,
    x='RELEASE_YEAR_START',
    order=sorted(df_year['RELEASE_YEAR_START'].unique()),  # chronological x axis
    palette='Blues_d',
)
plt.title('Number of Movies per Year', fontsize=16)
plt.xlabel('Release Year', fontsize=14)
plt.ylabel('Number of Movies', fontsize=14)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Horizontal bar chart of the ten highest-rated titles selected earlier.
_, top_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_10_movies, x='RATING', y='MOVIES', palette='Blues_d', ax=top_ax)
top_ax.set_title('Top Highest-Rated Movies', fontsize=16)
top_ax.set_xlabel('Rating', fontsize=14)
top_ax.set_ylabel('Movie Title', fontsize=14)
top_ax.tick_params(axis='both', labelsize=12)
plt.show()

In [None]:
lineplot_df = movies_df.copy()

# Take the first four-digit number as the release year. Converting to text
# first keeps YEAR values that are not strings from being silently dropped.
lineplot_df['YEAR'] = (
    lineplot_df['YEAR']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(float)
    .astype('Int64')
)

# Drop rows without a usable year before comparing, so the filter below
# never has to deal with missing values.
lineplot_df = lineplot_df.dropna(subset=['YEAR'])
lineplot_df['YEAR'] = lineplot_df['YEAR'].astype(int)

# Exclude anything after 2023 (this also removes placeholder years).
lineplot_df = lineplot_df[lineplot_df['YEAR'] <= 2023]

# Count the number of movies in each year, in chronological order
movies_per_year = lineplot_df['YEAR'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
plt.plot(movies_per_year.index, movies_per_year.values, marker='o')
plt.title('Number of Movies Released Every Year')
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.grid(True)
plt.show()

In [None]:
# Horizontal boxplot of RATING across all titles.
_, rating_ax = plt.subplots(figsize=(10, 5))
rating_ax.boxplot(movies_df['RATING'], vert=False, patch_artist=True)
rating_ax.set_title('Boxplot of Ratings')
rating_ax.set_xlabel('Rating')
plt.show()

In [None]:
# Counts per Type, and counts per individual genre (cells split on commas).
type_counts = movies_df['Type'].value_counts()
genre_counts = movies_df['GENRE'].str.split(',').explode().str.strip().value_counts()

fig, ax = plt.subplots()

# Outer ring: share of each Type.
ax.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%', startangle=180, colors=['#66b3ff','#99ff99'])

# A white disc over the centre turns the pie into a ring.
centre_circle = plt.Circle((0,0),0.70,fc='white')
ax.add_artist(centre_circle)

# Inner pie, drawn on its own axes placed in the middle of the figure:
# the five most frequent genres.
top_genres = genre_counts[:5]
sub_ax = fig.add_axes([0.3, 0.3, 0.4, 0.4])
sub_ax.pie(top_genres, labels=top_genres.index, autopct='%1.1f%%', startangle=180, colors=['#ff9999','#66b3ff','#99ff99'])

# Equal aspect ratio so both pies are drawn as circles.
for pie_axes in (ax, sub_ax):
    pie_axes.axis('equal')

ax.set_title('Movies Analysis with the Type and Genre')

plt.show()

In [None]:
# The 20 most frequent GENRE values (full genre combinations, not individual genres).
genre_counts = movies_df['GENRE'].value_counts().nlargest(20)

_, genre_ax = plt.subplots(figsize=(10,6))
genre_counts.plot(kind='bar', ax=genre_ax)
genre_ax.set_title('Genre Distribution in Movies')
genre_ax.set_xlabel('Genre')
genre_ax.set_ylabel('Count')
plt.show()

In [None]:
# Sorting by RATING descending and taking the tail yields the ten lowest-rated titles.
top_10_low_movies = movies_df.sort_values(by='RATING', ascending=False).tail(10)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_10_low_movies, x='RATING', y='MOVIES', palette='Blues_d')
# Title typo fixed ("Lowers-Rated" -> "Lowest-Rated").
plt.title('Lowest-Rated Movies', fontsize=16)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Movie Title', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
sns.violinplot(x='Type', y='RATING', data=movies_df)

In [None]:
numeric_df = movies_df[['RATING', 'RunTime', 'VOTES']]
cov_matrix = numeric_df.cov()
cov_matrix

In [None]:
sns.heatmap(cov_matrix, annot=True, fmt='g')
plt.title('Covariance Matrix Heatmap')
plt.show()

In [None]:
sns.set(style="ticks")
sns.pairplot(movies_df, hue='Type', kind='reg')

In [None]:
#FacetGrid trying out FacetGrid for the subplotting our RATING and VOTES
g = sns.FacetGrid(movies_df, col="Type")
g.map(plt.scatter, "RATING", "VOTES")
g.set_axis_labels("Rating", "Votes")
g.set_titles(col_template="{col_name}")
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the numeric columns selected in `numeric_df`, shown as an annotated heatmap.
correlation_matrix = numeric_df.corr()

_, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Heatmap', fontsize=16)
plt.show()