<a href="https://www.kaggle.com/code/shoaibrkhan/data-expedition-movies?scriptVersionId=143682592" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import re
from ast import literal_eval

import warnings

warnings.filterwarnings("ignore")


import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the raw movies table; `df` is the working frame the following cells clean.
movies = pd.read_csv(
    "/kaggle/input/movies-dataset-for-feature-extracion-prediction/movies.csv"
)
print(movies.shape)
df = pd.DataFrame(data=movies)
df

In [None]:
# Summarise the raw table: column dtypes and non-null counts.
movies.info()

In [None]:
# Number of missing values in each column of the raw table.
movies.isnull().sum()

# Feature Extraction

In [None]:
# The Gross column has only 460 non-null values out of 9539 rows, so we drop it.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=['Gross'], errors='ignore')

### Check For Duplicate Movies

In [None]:
# Trim the titles first so stray whitespace does not hide identical rows,
# then collect every row that appears more than once (all copies are kept).
df['MOVIES'] = df['MOVIES'].str.strip()
is_repeated_row = df.duplicated(keep=False)
duplicateData = df.loc[is_repeated_row]
duplicateData

In [None]:
# We'll handle these duplicate movies later; for now list the distinct titles involved.
duplicateData['MOVIES'].unique()

In [None]:
# Feature extraction starts here: remove the '\n' characters embedded in the text columns.
for text_col in ('GENRE', 'ONE-LINE'):
    df[text_col] = df[text_col].str.replace('\n', '')
# STARS is additionally trimmed before its newlines are removed.
df['STARS'] = df['STARS'].str.strip().str.replace('\n', '')
df

In [None]:
# The STARS column holds both the director(s) and the stars as '|'-separated
# "Director(s): ..." / "Star(s): ..." segments; this helper separates them.
def extract_names(row):
    """Split a raw credits string into ``(directors, stars)``.

    Parameters
    ----------
    row : str
        Credits text such as ``"Director:A | Stars:B, C"``.

    Returns
    -------
    tuple[str, str]
        The director names and the star names, each as a single
        comma-separated string with surrounding whitespace removed.
        Both are empty strings when the segment is absent or when ``row``
        is not a string (e.g. NaN for a missing cell).
    """
    if not isinstance(row, str):
        # Missing cells reach us as float NaN / None; re.findall would raise on them.
        return '', ''
    directors = ', '.join(
        name.strip() for name in re.findall(r'Directors?:\s*([^|]+)', row)
    )
    stars = ', '.join(
        name.strip() for name in re.findall(r'Stars?:\s*([^|]+)', row)
    )
    return directors, stars

# Apply extract_names to every row and store the two results as separate columns.
# Building one DataFrame from the list of tuples avoids creating a Series per row
# (which is what `.apply(pd.Series)` does).
# NOTE: this overwrites STARS, so re-running the cell on already-split data yields empty strings.
name_pairs = df['STARS'].apply(extract_names).tolist()
df[['DIRECTOR', 'STARS']] = pd.DataFrame(
    name_pairs, index=df.index, columns=['DIRECTOR', 'STARS']
)

df['DIRECTOR'] = df['DIRECTOR'].str.strip()
df['STARS'] = df['STARS'].str.strip()

df

In [None]:
# Remove any remaining "Director:" / "Directors:" / "Stars:" labels so only names are left.
df['DIRECTOR'] = (
    df['DIRECTOR']
    .str.replace('Director:', '')
    .str.replace('Directors:', '')
)
df['STARS'] = df['STARS'].str.replace('Stars:', '')

df

In [None]:
# Missing values per column after the text clean-up.
df.isnull().sum()

### Handling missing values in VOTES and RATING using each director's mean

In [None]:

# VOTES is text containing ',' separators; remove them and convert to float.
df['VOTES'] = df['VOTES'].str.replace(',', '').astype(float)

# Per-director averages, returned row-aligned with df by transform().
mean_ratings = df.groupby('DIRECTOR')['RATING'].transform('mean')
mean_votes = df.groupby('DIRECTOR')['VOTES'].transform('mean')

# Fill missing RATING / VOTES with the director's mean.
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the chained in-place form does not update df under pandas Copy-on-Write.
df['RATING'] = df['RATING'].fillna(mean_ratings)
df['VOTES'] = df['VOTES'].fillna(mean_votes)

# Round both columns to whole numbers (kept as float because NaN may remain).
df['VOTES'] = df['VOTES'].round().astype(float)
df['RATING'] = df['RATING'].round().astype(float)

df.isnull().sum()

#### We cannot simply drop the rows with a missing year, so we bring in a related dataset, match it on the movie name, and use it to fill the missing years

In [None]:
# Show the rows whose YEAR is missing.
df[pd.isna(df['YEAR'])]

In [None]:
# Read the IMDb top-1000 table used to back-fill missing values; preview the first rows.
imdb_movies = pd.read_csv(
    "/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv"
)
print(imdb_movies.shape)
imdb_df = pd.DataFrame(data=imdb_movies)
imdb_movies.head()

In [None]:
# The good thing is that this dataset has 1000 movies and 1000 non-null values in Released_Year.
# Let's see how this dataset can help us.
imdb_movies.info()

In [None]:
# Trim leading/trailing whitespace from the title columns of both tables before joining on them.
imdb_df['Series_Title'] = imdb_df['Series_Title'].str.strip()
df['MOVIES'] = df['MOVIES'].str.strip()

In [None]:
# Left-join the IMDb columns onto our rows by title.
# The lookup is reduced to one row per title first: a repeated Series_Title
# would otherwise duplicate every matching row of df in the result.
imdb_lookup = imdb_df[
    ['Series_Title', 'Released_Year', 'Runtime', 'Genre', 'IMDB_Rating', 'No_of_Votes']
].drop_duplicates(subset='Series_Title', keep='first')
final_df = pd.merge(
    df,
    imdb_lookup,
    left_on='MOVIES',
    right_on='Series_Title',
    how='left',
    validate='many_to_one',  # fail loudly if the lookup keys are not unique
)
final_df

In [None]:
# Remove the " min" suffix from the IMDb Runtime column and make it numeric,
# so that copying it into RunTime does not mix strings into that column.
# astype(str) keeps the cell re-runnable after the column is already numeric;
# anything that is not a number (including missing values) becomes NaN.
final_df['Runtime'] = pd.to_numeric(
    final_df['Runtime'].astype(str).str.replace(' min', '', regex=False),
    errors='coerce',
)
final_df

In [None]:
# Target columns in our data and, position by position, the IMDb column that can fill them.
columns_to_fill = ['GENRE', 'RATING', 'VOTES', 'YEAR', 'RunTime']
columns_to_copy = ['Genre', 'IMDB_Rating', 'No_of_Votes', 'Released_Year', 'Runtime']

# One boolean mask per pair: the IMDb value exists while ours is missing.
masks = [
    final_df[source].notna() & final_df[target].isna()
    for target, source in zip(columns_to_fill, columns_to_copy)
]

# Copy the IMDb value into our column wherever the corresponding mask is set.
for mask, col, copy_col in zip(masks, columns_to_fill, columns_to_copy):
    final_df.loc[mask, col] = final_df.loc[mask, copy_col]

In [None]:
# Missing values per column after back-filling from the IMDb table.
final_df.isnull().sum()

In [None]:
# The IMDb helper columns added by the merge are no longer needed; remove them.
imdb_helper_cols = [
    'Series_Title', 'Released_Year', 'Runtime', 'Genre', 'IMDB_Rating', 'No_of_Votes'
]
final_df = final_df.drop(columns=imdb_helper_cols)
final_df

In [None]:
# Read the Netflix titles table (second back-fill source) and preview the first rows.
titles = pd.read_csv(
    "/kaggle/input/netflix-tv-shows-and-movies/titles.csv"
)
print(titles.shape)
titles_df = pd.DataFrame(data=titles)
titles.head()

In [None]:
# Each genres cell is the text of a Python list; parse it and join the entries
# into a comma-separated string.
# literal_eval only accepts Python literals, so unlike eval() it cannot execute
# arbitrary code that might be embedded in the CSV.
titles_df['genres'] = titles_df['genres'].apply(
    lambda raw: ', '.join(literal_eval(raw))
)
titles_df

In [None]:
# Column dtypes and non-null counts of the Netflix titles table.
titles_df.info()

In [None]:
# Trim whitespace from the title columns on both sides of the upcoming join.
final_df['MOVIES'] = final_df['MOVIES'].str.strip()
titles_df['title'] = titles_df['title'].str.strip()

In [None]:
# Left-join the Netflix columns onto our rows by title.
# The lookup is reduced to one row per title first: a repeated title would
# otherwise duplicate every matching row of final_df in the result.
titles_lookup = titles_df[
    ['title', 'release_year', 'runtime', 'genres', 'imdb_score', 'imdb_votes']
].drop_duplicates(subset='title', keep='first')
final_df = pd.merge(
    final_df,
    titles_lookup,
    left_on='MOVIES',
    right_on='title',
    how='left',
    validate='many_to_one',  # fail loudly if the lookup keys are not unique
)
final_df

In [None]:
# After the left join release_year is a float column (NaN where no title matched).
# YEAR holds text such as "(2020)", so format the numeric year the same way before
# it is copied into YEAR; bare integers there would be skipped by the later
# `.str` extraction and by comparisons like YEAR == "(2020)".
def _year_label(year):
    """Return ``"(YYYY)"`` for a numeric year, ``pd.NA`` for a missing one."""
    if pd.isna(year):
        return pd.NA
    if isinstance(year, str):
        # Already formatted (the cell was re-run); leave it untouched.
        return year
    return f"({int(year)})"


final_df['release_year'] = final_df['release_year'].map(_year_label).astype('object')
final_df

In [None]:
# Target columns in our data and, position by position, the Netflix column that can fill them.
columns_to_fill = ['GENRE', 'RATING', 'VOTES', 'RunTime', 'YEAR']
columns_to_copy = ['genres', 'imdb_score', 'imdb_votes', 'runtime', 'release_year']

# One boolean mask per pair: the Netflix value exists while ours is missing.
masks = [
    final_df[source].notna() & final_df[target].isna()
    for target, source in zip(columns_to_fill, columns_to_copy)
]

# Copy the Netflix value into our column wherever the corresponding mask is set.
for mask, col, copy_col in zip(masks, columns_to_fill, columns_to_copy):
    final_df.loc[mask, col] = final_df.loc[mask, copy_col]

In [None]:
# Column dtypes and non-null counts after the second back-fill.
final_df.info()

In [None]:
# Missing values per column after back-filling from the Netflix titles table.
final_df.isnull().sum()

In [None]:
# The Netflix helper columns added by the merge are no longer needed; remove them.
titles_helper_cols = [
    'title', 'release_year', 'runtime', 'genres', 'imdb_score', 'imdb_votes'
]
final_df = final_df.drop(columns=titles_helper_cols)
final_df

In [None]:
# Overall means of RATING and VOTES, used for the rows that are still missing.
mean_rating = final_df['RATING'].mean()
mean_votes = final_df['VOTES'].mean()

# Assign the filled columns back instead of calling fillna(inplace=True) on
# final_df[col]: the chained in-place form does not update final_df under
# pandas Copy-on-Write.
final_df['RATING'] = final_df['RATING'].fillna(mean_rating)
final_df['VOTES'] = final_df['VOTES'].fillna(mean_votes)

# Missing GENRE values are labelled 'Unknown'.
final_df['GENRE'] = final_df['GENRE'].fillna('Unknown')
final_df

In [None]:
# Missing values per column after the mean / 'Unknown' fills.
final_df.isnull().sum()

#### We successfully filled some of the missing years for rows whose MOVIES title matched exactly
#### We could also use fuzzy matching to pair up more MOVIES titles, but that would be trickier

### Removing Duplicates

In [None]:
# As mentioned above, the data contains many repeated movies; start removing them here.

# Rows that have neither a YEAR nor a RunTime value.
mask_nan = final_df['YEAR'].isna() & final_df['RunTime'].isna()

# A row is kept when it has at least one of the two values, or when an earlier row
# with the same MOVIES title exists (duplicated(keep='first') flags every repeat
# except the first occurrence).
# NOTE(review): this keeps later repeats of a title and drops only first/unique rows
# lacking both values - confirm that this is the intended rule.
mask_keep = final_df.duplicated(subset=['MOVIES'], keep='first') | ~mask_nan

# Apply the filter and renumber the index from zero.
unique_df = final_df.loc[mask_keep].reset_index(drop=True)
unique_df

In [None]:
# Missing values per column in the filtered frame.
unique_df.isnull().sum()

In [None]:
# Rows whose YEAR is missing or an empty string.
year_missing = unique_df['YEAR'].isna() | unique_df['YEAR'].eq('')
nan_years = unique_df.loc[year_missing]
nan_years

In [None]:
# Check again which titles still occur more than once (every copy is shown).
repeated_title = unique_df.duplicated(subset=['MOVIES'], keep=False)
duplicate_rows = unique_df.loc[repeated_title]
duplicate_rows

In [None]:
# Keep only the first row for every title (drop_duplicates keeps the first by default).
unique_df['MOVIES'] = unique_df['MOVIES'].str.strip()
unique_df = unique_df.drop_duplicates(subset=['MOVIES'])

# Verify: this selection should now be empty.
still_repeated = unique_df.duplicated(subset=['MOVIES'], keep=False)
duplicate_rows = unique_df.loc[still_repeated]
duplicate_rows

In [None]:
# Occurrences per title, as a two-column table (Movie, Count); show the first ten.
movie_counts = unique_df['MOVIES'].value_counts()
movie_counts_df = movie_counts.rename_axis('Movie').reset_index(name='Count')
movie_counts_df.head(10)

In [None]:
# Missing values per column after removing duplicate titles.
unique_df.isnull().sum()

In [None]:
# Rows that still have no YEAR after de-duplication.
nan_years = unique_df.loc[unique_df['YEAR'].isna()]
nan_years

In [None]:
# Missing RunTime values are replaced with the mean runtime of the same director.
# With 500+ NaN values in RunTime, filling with 0 is not an option.

# Runtimes copied from the other datasets may be text; coerce everything to numbers.
unique_df['RunTime'] = pd.to_numeric(unique_df['RunTime'], errors='coerce')

# Mean runtime per director, returned row-aligned with unique_df by transform().
mean_runtime = unique_df.groupby('DIRECTOR')['RunTime'].transform('mean')

# Fill missing RunTime with the director's mean runtime.
# (The earlier version filled with `mean_ratings`, i.e. director mean RATING values
# computed on a different frame, and used a chained inplace fillna.)
unique_df['RunTime'] = unique_df['RunTime'].fillna(mean_runtime)

# Round to whole minutes (kept as float because NaN may remain).
unique_df['RunTime'] = unique_df['RunTime'].round().astype(float)

unique_df.isnull().sum()

In [None]:
# Only a few YEAR values are still missing; mark them with the placeholder "(2099)".
# Results are assigned back rather than using fillna(inplace=True) on
# unique_df[col], which does not update the frame under pandas Copy-on-Write.
unique_df['YEAR'] = unique_df['YEAR'].fillna("(2099)")

# The few RunTime values still missing get the overall mean runtime.
unique_df['RunTime'] = unique_df['RunTime'].fillna(unique_df['RunTime'].mean())

In [None]:
# Round RATING, VOTES and RunTime to the nearest whole number and store them as integers.
for numeric_col in ('RATING', 'VOTES', 'RunTime'):
    unique_df[numeric_col] = unique_df[numeric_col].round().astype(int)
unique_df

In [None]:
# Missing values per column after all fills.
unique_df.isnull().sum()

### Adding a Type column that labels each title as a Movie or a Show

In [None]:
# Regular expression for YEAR values treated as a movie: "(YYYY)", "(YYYY-YYYY)"
# with an ASCII hyphen, or text that simply starts with four digits.
# NOTE(review): the hyphenated range counts as a Movie here - confirm this is intended.
pattern = r'\(\d{4}(-\d{4})?\)|\d{4}'


def classify_movie_or_show(year):
    """Classify a YEAR value as ``'Movie'`` or ``'Show'``.

    Parameters
    ----------
    year : object
        A single YEAR cell; non-string values are converted with ``str()``.

    Returns
    -------
    str
        ``'Movie'`` when the text matches ``pattern`` at its start, otherwise
        ``'Show'`` (this includes missing values).
    """
    # re.match anchors at the start of the text, exactly like Series.str.match,
    # without building a throw-away pandas Series for every single value.
    if pd.notna(year) and re.match(pattern, str(year)):
        return 'Movie'
    return 'Show'

# Label every row from its YEAR value.
unique_df['Type'] = unique_df['YEAR'].map(classify_movie_or_show)
unique_df

In [None]:
# Re-label rows as 'Show' when their genre list contains Drama, Documentary or Show.
def update_type(row):
    """Return 'Show' if the row's GENRE lists a show-like genre, else its current Type.

    ``row`` must provide a comma-separated string under 'GENRE' and the current
    label under 'Type'. Genre names are compared after trimming whitespace.
    """
    show_markers = {'Drama', 'drama', 'Documentary', 'Show'}
    listed = {part.strip() for part in row['GENRE'].split(',')}
    return 'Show' if listed & show_markers else row['Type']

# Run update_type once per row (axis='columns' passes each row to the function).
unique_df['Type'] = unique_df.apply(update_type, axis='columns')
unique_df

In [None]:
# Next, look for outliers in RunTime using a horizontal boxplot of the rows typed 'Movie'.
movies_runtime = unique_df.loc[unique_df['Type'] == 'Movie']

fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(movies_runtime['RunTime'], vert=False, patch_artist=True)
ax.set_title('Boxplot of RunTime')
ax.set_xlabel('RunTime')
plt.show()

In [None]:
# Rows typed 'Movie' with a runtime below 20 minutes.
is_short_movie = unique_df['RunTime'].lt(20) & unique_df['Type'].eq('Movie')
movie_runt = unique_df.loc[is_short_movie]
movie_runt

In [None]:
# Movies shorter than 20 minutes are treated as bad data: replace their runtime
# with the mean runtime of their Type.

# Mean runtime per Type, returned row-aligned with unique_df by transform().
mean_runtimes_by_type = unique_df.groupby('Type')['RunTime'].transform('mean')

# Round the replacement values to integers before assigning them, so float means
# are not written into the integer RunTime column (pandas warns about, and newer
# versions reject, setting values of an incompatible dtype).
too_short = (unique_df['RunTime'] < 20) & (unique_df['Type'] == 'Movie')
unique_df.loc[too_short, 'RunTime'] = (
    mean_runtimes_by_type[too_short].round().astype(int)
)
unique_df['RunTime'] = unique_df['RunTime'].round().astype(int)
unique_df

# Exploratory Data Analysis

### Highest Rating

In [None]:
# Ten rows with the highest RATING.
ratings_descending = unique_df.sort_values(by='RATING', ascending=False)
top_10_movies = ratings_descending.head(10)
top_10_movies

### Most Votes on Movies

In [None]:
# Ten rows with the most VOTES; the counts are shown with thousands separators.
new_df = unique_df.copy()
top_n_rows = new_df.nlargest(10, 'VOTES').copy()
top_n_rows['VOTES'] = top_n_rows['VOTES'].map('{:,}'.format)
top_n_rows

## Finding Most Rated Movies By Year

In [None]:
# Trim GENRE, split each cell on ', ' into individual genres, and count every genre.
unique_df['GENRE'] = unique_df['GENRE'].str.strip()
unique_genres = unique_df['GENRE'].str.split(', ').explode()
genre_counts = (
    unique_genres.value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
)
genre_counts

### Most Movies by Director

In [None]:
# Split DIRECTOR on ', ' into individual names and count how often each one appears.
# (Uses its own variable instead of re-using `unique_genres` for director names.)
director_names = unique_df['DIRECTOR'].str.split(', ').explode()
director_counts = director_names.value_counts().reset_index()
director_counts.columns = ['DIRECTOR', 'Count']
# Rows without a director carry an empty string; leave them out of the ranking.
director_counts = director_counts[director_counts['DIRECTOR'] != '']
director_counts.head()

### Stars Appeared the Most

In [None]:
# Split STARS on ', ' into individual names and count how often each one appears.
# (Uses its own variable instead of re-using `unique_genres` for star names.)
star_names = unique_df['STARS'].str.split(', ').explode()
stars_counts = star_names.value_counts().reset_index()
stars_counts.columns = ['STARS', 'Count']
# Rows without stars carry an empty string; leave them out of the ranking.
stars_counts = stars_counts[stars_counts['STARS'] != '']
stars_counts.head()

In [None]:
# Missing values per column before the remaining analysis.
unique_df.isnull().sum()

In [None]:
# Highest-rated titles of a single year: the ten best rows whose YEAR is exactly "(2020)".
released_in_2020 = unique_df['YEAR'] == "(2020)"
specific_year_data = (
    unique_df.loc[released_in_2020]
    .sort_values(by='RATING', ascending=False)
    .head(10)
)
specific_year_data

# Data Visualization

In [None]:
# Histogram of the RATING values.
ax = sns.histplot(unique_df['RATING'])
ax.set_xlabel('RATING')
ax.set_title('Distribution of RATING')
plt.show()

In [None]:
# Count titles per release year for the years 2000-2023.
df_year = unique_df.copy()

# YEAR can hold non-string values after the back-fills; convert to text so the
# `.str` extraction does not silently skip those rows.
year_text = df_year['YEAR'].astype(str)

# First four-digit number = start year; the number after an en dash = end year.
# Values without a match become 0 (titles with a single year have no end year).
df_year['RELEASE_YEAR_START'] = (
    pd.to_numeric(year_text.str.extract(r'(\d{4})')[0], errors='coerce')
    .fillna(0)
    .astype(int)
)
df_year['RELEASE_YEAR_END'] = (
    pd.to_numeric(year_text.str.extract(r'(\d{4})–(\d{4})')[1], errors='coerce')
    .fillna(0)
    .astype(int)
)

# Keep start years within 2000-2023. The upper bound on the start year also removes
# the "(2099)" placeholder, which has no end year and so passed the end-year check alone.
first_year, last_year = 2000, 2023
df_year = df_year[
    df_year['RELEASE_YEAR_START'].between(first_year, last_year)
    & (df_year['RELEASE_YEAR_END'] <= last_year)
]

plt.figure(figsize=(10, 6))
# Integer years are plotted in ascending order on the x axis.
sns.countplot(data=df_year, x='RELEASE_YEAR_START', palette='viridis')
plt.title('Number of Movies per Year', fontsize=16)
plt.xlabel('Release Year', fontsize=14)
plt.ylabel('Number of Movies', fontsize=14)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Horizontal bars: rating of each of the ten highest-rated titles.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_10_movies, x='RATING', y='MOVIES', palette='viridis', ax=ax)
ax.set_title('Top Highest-Rated Movies', fontsize=16)
ax.set_xlabel('Rating', fontsize=14)
ax.set_ylabel('Movie Title', fontsize=14)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
plt.show()

In [None]:
# Line chart of the number of titles per release year (up to 2023).
lineplot_df = unique_df.copy()

# YEAR can hold non-string values after the back-fills; convert to text first so
# the `.str` extraction does not silently skip those rows. Take the first
# four-digit number as the release year.
lineplot_df['YEAR'] = (
    lineplot_df['YEAR'].astype(str).str.extract(r'(\d{4})', expand=False)
)

# Nullable integers: rows without a recognisable year stay <NA>.
lineplot_df['YEAR'] = lineplot_df['YEAR'].astype(float).astype('Int64')

# Drop rows with missing years before comparing, so the filter mask contains no <NA>.
lineplot_df = lineplot_df.dropna(subset=['YEAR'])
lineplot_df['YEAR'] = lineplot_df['YEAR'].astype(int)

# The upper bound also removes the "(2099)" placeholder year.
lineplot_df = lineplot_df[lineplot_df['YEAR'] <= 2023]

# Count the number of titles in each year, in chronological order.
movies_per_year = lineplot_df['YEAR'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
plt.plot(movies_per_year.index, movies_per_year.values, marker='o')
plt.title('Number of Movies Released Every Year')
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.grid(True)
plt.show()

In [None]:
# Horizontal boxplot of the RATING values.
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(unique_df['RATING'], vert=False, patch_artist=True)
ax.set_title('Boxplot of Ratings')
ax.set_xlabel('Rating')
plt.show()

In [None]:
# Share of each Type value, drawn as a pie chart.
type_counts = unique_df['Type'].value_counts()

fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#99ff99'],
)
ax.set_title('Distribution of Movies by Type')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.show()

In [None]:
# Horizontal bar chart of the genre counts, most frequent genre on top.
# x/y are given explicitly so the bars are labelled with the genre names rather
# than the row numbers, and ax=ax draws into the figure created here
# (DataFrame.plot otherwise opens its own figure and leaves this one empty).
fig, ax = plt.subplots(figsize=(10, 6))
genre_counts.plot(kind='barh', x='Genre', y='Count', color='skyblue', ax=ax)
ax.set_title('Genres in Movies', fontsize=16)
ax.set_xlabel('Count', fontsize=14)
ax.set_ylabel('Genre', fontsize=14)
ax.invert_yaxis()
plt.show()

In [None]:
# The ten lowest-rated titles: the tail of the ratings sorted in descending order.
top_10_low_movies = unique_df.sort_values(by='RATING', ascending=False).tail(10)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_10_low_movies, x='RATING', y='MOVIES', palette='viridis')
# Title typo fixed ("Lowers-Rated" -> "Lowest-Rated").
plt.title('Lowest-Rated Movies', fontsize=16)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Movie Title', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Pairwise covariance of the three numeric columns.
numeric_df = unique_df.loc[:, ['RATING', 'RunTime', 'VOTES']]
cov_matrix = numeric_df.cov()
cov_matrix

In [None]:
# Heatmap of the covariance matrix with the values written into the cells.
ax = sns.heatmap(cov_matrix, annot=True, fmt='g')
ax.set_title('Covariance Matrix Heatmap')
plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns on a 2x2 grid (fourth panel unused).
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# (panel, x column, y column, colour) for each of the three plots.
scatter_specs = [
    (axs[0, 0], 'RATING', 'RunTime', 'r'),
    (axs[0, 1], 'RATING', 'VOTES', 'g'),
    (axs[1, 0], 'RunTime', 'VOTES', 'b'),
]
for panel, x_col, y_col, colour in scatter_specs:
    panel.scatter(numeric_df[x_col], numeric_df[y_col], color=colour)
    panel.set_xlabel(x_col)
    panel.set_ylabel(y_col)
    panel.set_title(f'{x_col} vs {y_col}')

axs[1, 1].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise relationships between the columns of unique_df, coloured by Type.
sns.set(style="ticks")
sns.pairplot(data=unique_df, hue='Type')

In [None]:
# Correlation between RATING, RunTime and VOTES, drawn as an annotated heatmap.
correlation_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap', fontsize=16)
plt.show()