### Exploratory Data Analysis

In [None]:
'''
Reading dataset
Analyzing the data
Checking for the duplicates
Missing Values Calculation
Exploratory Data Analysis
    Univariate Analysis
    Bivariate Analysis
    Multivariate Analysis
'''

In [None]:
# Loading Data 

# Understanding the Structure
#  df.head(), df.tail(), df.info(), df.dtypes, df.shape

# Summary Statistics
# df.describe(), df.isnull().sum(), df.nunique()

# Data Cleaning
# df.dropna(), df.fillna(), df.apply(), df.replace()

# Data Distribution
# df.hist(), df.boxplot(),df.plot.kde()

# Data Visualization
# plt.plot(), plt.scatter() , sns.pairplot(), sns.heatmap()

# Correlation Analysis
# df.corr(), sns.heatmap(df.corr())

# Categorical Data Analysis
# df['column'].value_counts(), sns.barplot()

# Data Aggregation
# df.groupby(), df.pivot_table()

# Advanced Statistical Analysis
# scipy.stats, statsmodels

# Exploring Relationships
# pd.crosstab()
# Chi-square Tests: For categorical variables

In [None]:
import os
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# CSV path used later in this notebook resolves against it, so adjust this
# path when running on another machine.
os.chdir(r'C:\Users\srvna\Downloads\archive')
import pandas as pd
import matplotlib.pyplot as plt

# Loading the dataset

In [None]:
# Sample credits table: one entry per movie id; 'cast' and 'crew' each hold a
# single pipe-delimited ('|') string of names.
credits_data = dict(
    id=list(range(1, 7)),
    cast=[
        "Robert Downey Jr.|Chris Evans|Mark Ruffalo|Scarlett Johansson",
        "Leonardo DiCaprio|Kate Winslet|Billy Zane",
        "Tom Hanks|Robin Wright|Gary Sinise|Mykelti Williamson",
        "Will Smith|Margot Robbie|Jared Leto|Viola Davis",
        "Harrison Ford|Carrie Fisher|Mark Hamill|Peter Mayhew",
        "Keanu Reeves|Laurence Fishburne|Carrie-Anne Moss|Hugo Weaving",
    ],
    crew=[
        "Jon Favreau|Kevin Feige|Robert Downey Jr.",
        "James Cameron|Jon Landau|Leonardo DiCaprio",
        "Robert Zemeckis|Steve Starkey|Tom Hanks",
        "David Ayer|Zack Snyder|Will Smith",
        "George Lucas|Irvin Kershner|Harrison Ford",
        "Lana Wachowski|Lilly Wachowski|Keanu Reeves",
    ],
)

In [None]:
# Sample keywords table: ids 1-9, each with a pipe-delimited keyword string.
keywords_data = dict(
    id=list(range(1, 10)),
    keywords=[
        "superhero|action|adventure",
        "romance|drama|epic",
        "drama|war|historical",
        "comedy|family|animation",
        "thriller|mystery|crime",
        "sci-fi|fantasy|adventure",
        "documentary|biography|history",
        "musical|romance|drama",
        "horror|suspense|mystery",
    ],
)

In [None]:
# Sample links table: maps each movieId to an IMDb id string and a TMDB id.
links_data = dict(
    movieId=list(range(1, 6)),
    imdbId=["tt0371746", "tt0120338", "tt0120815", "tt0110912", "tt0133093"],
    tmdbId=[2734, 597, 78, 619, 550],
)


In [None]:
# Sample movie metadata: five movies with titles, ISO-formatted release dates,
# a popularity score and vote statistics.
movies_metadata_data = dict(
    id=list(range(1, 6)),
    title=["Iron Man", "Avatar", "Forrest Gump", "The Dark Knight", "Inception"],
    original_title=["Iron Man", "Avatar", "Forrest Gump", "The Dark Knight", "Inception"],
    release_date=["2008-05-02", "2009-12-18", "1994-07-06", "2008-07-18", "2010-07-16"],
    popularity=[100.0, 150.0, 90.0, 200.0, 180.0],
    vote_average=[7.9, 8.0, 8.8, 8.4, 8.8],
    vote_count=[12000, 13000, 14000, 16000, 15000],
)

In [None]:
# Sample ratings table: one rating per user, with timestamps spaced 100 apart.
ratings_data = dict(
    userId=list(range(1, 6)),
    movieId=list(range(1, 6)),
    rating=[5, 4, 3, 5, 2],
    timestamp=list(range(1625248382, 1625248783, 100)),
)

In [None]:
# Smaller ratings sample in which some users and movies appear more than once;
# timestamps are spaced 100 apart.
ratings_small_data = dict(
    userId=[1, 2, 3, 4, 5, 1, 2, 4],
    movieId=[1, 2, 1, 3, 2, 4, 5, 3],
    rating=[4, 5, 3, 2, 4, 5, 3, 4],
    timestamp=list(range(1625248382, 1625249083, 100)),
)


In [None]:
# Sample genres table: each movie id maps to a list of genre names.
df_genres = dict(
    id=list(range(1, 6)),
    genres=[
        ["Action", "Adventure"],
        ["Drama"],
        ["Action", "Thriller"],
        ["Comedy", "Romance"],
        ["Sci-Fi", "Fantasy"],
    ],
)


In [None]:
# Persist every sample table as '<name>.csv' in the current working directory.
# NOTE(review): to_csv replaces any existing file with the same name.
for name, data in {
    'credits': credits_data,
    'keywords': keywords_data,
    'links': links_data,
    'movies_metadata': movies_metadata_data,
    'ratings': ratings_data,
    'ratings_small': ratings_small_data,
    'df_genres': df_genres,
}.items():
    df = pd.DataFrame(data)
    df.to_csv(f'{name}.csv', index=False)


In [None]:
# Load each CSV from the working directory into its own DataFrame.
ratings = pd.read_csv('ratings.csv')
ratings_small = pd.read_csv('ratings_small.csv')
links = pd.read_csv('links.csv')
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
df_genres = pd.read_csv('df_genres.csv')
# low_memory=False: parse the file in one pass instead of in chunks, which
# avoids mixed-type inference within a column.
movies_metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

In [None]:
# Preview the first rows of the credits table (head() shows five by default).
credits.head()

In [None]:
# Preview the first rows of the keywords table (head() shows five by default).
keywords.head()

In [None]:
# Preview the first rows of the links table (head() shows five by default).
links.head()

In [None]:
# Preview the first rows of the ratings table (head() shows five by default).
ratings.head()

In [None]:
# Preview only the first two rows of the movie metadata.
movies_metadata.head(2)

In [None]:
# Preview the first rows of the genres table as loaded from CSV.
df_genres.head()

In [None]:
# Credits Data
# Analysis:

# Cast and Crew Analysis: Determine the most common actors and crew members in successful movies.
# Movie Popularity: Analyze how having certain actors or directors affects movie popularity or ratings.

In [None]:
# Count how many movies each actor appears in.
# 'cast' is a pipe-delimited string; missing values become '' so later string
# operations on the column never see NaN.
credits['cast'] = credits['cast'].fillna('')
# One row per (movie, actor): split each string into a list, then explode it.
cast_list = credits['cast'].str.split('|').explode()
# A movie without cast splits to [''] — drop that placeholder so the empty
# string is not counted as an actor.
cast_list = cast_list[cast_list != '']
top_cast = cast_list.value_counts()
print("\nTop Cast:")
print(top_cast.head(10))

In [None]:
# Bar chart of the ten actors credited in the most movies.
# Series.plot draws on the figure opened here.
plt.figure(figsize=(4, 3))
top_cast.iloc[:10].plot.bar(title='Top 10 Actors by Movie Count')
plt.ylabel('Number of Movies')
plt.xlabel('Actor')
plt.xticks(fontsize=10, rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Mean movie popularity per actor.
# Attach each movie's title and popularity to its credits row (inner join on id).
merged_data = credits.merge(movies_metadata[['id', 'title', 'popularity']], on='id')

cast_popularity = merged_data[['cast', 'popularity']].copy()
# explode() only expands list-like cells and leaves plain strings untouched,
# so the pipe-delimited cast string must be split into a list first; otherwise
# the groupby would average per whole cast string instead of per actor.
cast_popularity['cast'] = cast_popularity['cast'].str.split('|')
cast_popularity = cast_popularity.explode('cast')
# An empty cast string splits to [''] — do not report '' as an actor.
cast_popularity = cast_popularity[cast_popularity['cast'] != '']
cast_popularity = cast_popularity.groupby('cast').agg({'popularity': 'mean'}).reset_index()
top_cast_popularity = cast_popularity.sort_values(by='popularity', ascending=False)
print("\nTop Cast by Average Popularity:")
print(top_cast_popularity.head(10))


In [None]:
# Bar chart of the ten actors with the highest mean popularity.
# DataFrame.plot() opens its own figure unless it is handed an Axes, which
# would leave the figure created here empty and ignore its figsize; passing
# ax=plt.gca() draws on this figure instead.
plt.figure(figsize=(3, 2))
top_cast_popularity.head(10).plot(
    kind='bar',
    x='cast',
    y='popularity',
    ax=plt.gca(),
    title='Top 10 Actors by Average Popularity',
)
plt.xlabel('Actor')
plt.ylabel('Average Popularity')
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Count how many movies each crew member is credited on.
# No fillna is applied here: value_counts() skips missing entries by default.
crew_columns = credits['crew'].str.split('|', expand=True)
crew_list = crew_columns.stack()
top_crew = crew_list.value_counts()

print("\nTop Crew Members:")
print(top_crew.head(10))

In [None]:
# Bar chart of the ten crew members credited on the most movies.
plt.figure(figsize=(8, 4))
top_crew.iloc[:10].plot.bar(title='Top 10 Crew Members by Movie Count')
plt.ylabel('Number of Movies')
plt.xlabel('Crew Member')
plt.xticks(fontsize=10, rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Keywords Data
# Analysis:

# Keyword Frequency: Find out the most common keywords associated with movies.
# Keyword vs. Popularity: Explore how specific keywords relate to movie popularity.

In [None]:
# Frequency of each individual keyword across all movies.
# Split the pipe-delimited strings into columns, then stack them into one
# long Series of single keywords.
keyword_columns = keywords['keywords'].str.split('|', expand=True)
keyword_list = keyword_columns.stack()
top_keywords = keyword_list.value_counts()
top_keywords

In [None]:
# Bar chart of the ten most frequent keywords.
plt.figure(figsize=(6, 4))
top_keywords.iloc[:10].plot.bar(title='Top 10 Keywords by Frequency')
plt.ylabel('Frequency')
plt.xlabel('Keyword')
plt.xticks(fontsize=10, rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Mean movie popularity per keyword.
# Inner join: only keyword rows whose id also exists in the metadata survive.
keywords_movies = keywords.merge(movies_metadata[['id', 'popularity']], on='id')
keywords_popularity = keywords_movies[['keywords', 'popularity']].copy()
# explode() only expands list-like cells and leaves plain strings untouched,
# so split the pipe-delimited keyword string into a list first; otherwise the
# groupby would average per whole keyword string instead of per keyword.
keywords_popularity['keywords'] = keywords_popularity['keywords'].str.split('|')
keywords_popularity = keywords_popularity.explode('keywords')
keyword_popularity = keywords_popularity.groupby('keywords').agg({'popularity': 'mean'}).reset_index()
top_keyword_popularity = keyword_popularity.sort_values(by='popularity', ascending=False)
top_keyword_popularity

In [None]:
# Bar chart of the ten keywords with the highest mean popularity.
# DataFrame.plot() opens its own figure unless it is handed an Axes, which
# would leave the figure created here empty and ignore its figsize; passing
# ax=plt.gca() draws on this figure instead.
plt.figure(figsize=(8, 4))
top_keyword_popularity.head(10).plot(
    kind='bar',
    x='keywords',
    y='popularity',
    ax=plt.gca(),
    title='Top 10 Keywords by Average Popularity',
)
plt.xlabel('Keyword')
plt.ylabel('Average Popularity')
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Links Data
# Analysis:

# ID Consistency: Compare the total and distinct movieId counts to check for duplicate links.


In [None]:
# Count the non-null movieId entries and the distinct movieId values in links.
total_movie_ids = links['movieId'].count()
unique_movie_ids = links['movieId'].nunique()
# Only the total is displayed; the distinct count stays in unique_movie_ids.
total_movie_ids

In [None]:
# Movies Metadata
# Analysis:

# Budget vs. Revenue: Investigate how budget impacts revenue.
# Popularity and Ratings: Explore relationships between movie popularity, ratings, and other features.
# Genre Analysis: Analyze the distribution of genres and their impact on ratings and revenue.

In [None]:
# Preview the first rows of the movie metadata (head() shows five by default).
movies_metadata.head()

In [None]:
# Print column names, non-null counts and dtypes of the movie metadata.
movies_metadata.info()

In [None]:
# Summary statistics of the movie metadata (numeric columns by default).
movies_metadata.describe()


In [None]:
# (rows, columns) of the movie metadata.
movies_metadata.shape


In [None]:
# Number of missing values in each column of the movie metadata.
movies_metadata.isnull().sum()

In [None]:
# Replace missing values with '' in the non-numeric columns only.
# A blanket fillna('') would also put empty strings into numeric columns such
# as popularity or vote_average, turning them into object dtype and breaking
# the numeric plots and aggregations that follow.
text_columns = movies_metadata.select_dtypes(exclude='number').columns
movies_metadata.fillna(dict.fromkeys(text_columns, ''), inplace=True)

In [None]:
# Scatter plot: each movie's popularity against its average vote.
plt.figure(figsize=(5, 3))
plt.scatter(x=movies_metadata['popularity'], y=movies_metadata['vote_average'])
plt.xlabel('Popularity')
plt.ylabel('Average Rating')
plt.title('Popularity vs. Average Rating')
plt.grid(True)
plt.show()

In [None]:
import ast

# After the CSV round-trip each genre list is stored as text, e.g.
# "['Action', 'Adventure']". ast.literal_eval parses such literals without
# executing arbitrary code, which eval() would do on file contents.
# Only strings that look like a list are parsed, so re-running this cell after
# the explode (when cells hold plain genre names) leaves the values unchanged.
df_genres['genres'] = df_genres['genres'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else x
)
# One row per (movie id, genre).
df_genres = df_genres.explode('genres')
df_genres

In [None]:
# Inner-join the movie metadata with the per-genre rows on the shared 'id' column.
df_combined = pd.merge(movies_metadata, df_genres, on='id')
df_combined

In [None]:
# Number of rows per genre in the combined table.
genre_counts = df_combined.loc[:, 'genres'].value_counts()
genre_counts

In [None]:
# Bar chart of how often each genre occurs.
plt.figure(figsize=(8, 2))
genre_counts.plot.bar(color='skyblue', title='Genre Distribution')
plt.ylabel('Count')
plt.xlabel('Genre')
plt.xticks(fontsize=10, rotation=45)
plt.show()

In [None]:
# Ratings Data
# Analysis:

# Ratings Distribution: Explore the distribution of ratings.
# Movie Ratings: Analyze average ratings for each movie.
# User Behavior: Examine the number of ratings per user and how it might affect average ratings.

In [None]:
# Histogram of all rating values, in ten bins.
plt.figure(figsize=(5, 3))
ratings['rating'].hist(edgecolor='black', color='skyblue', bins=10)
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Ratings Distribution')
plt.grid(True)
plt.show()

In [None]:
# Mean rating per movie, with movieId kept as a regular column.
average_ratings_per_movie = ratings.groupby('movieId', as_index=False)['rating'].mean()
average_ratings_per_movie

In [None]:

# Bar chart of the (up to) twenty movies with the highest mean rating.
# DataFrame.plot() opens its own figure unless it is handed an Axes, which
# would leave the figure created here empty and ignore its figsize; passing
# ax=plt.gca() draws on this figure instead.
plt.figure(figsize=(8, 4))
average_ratings_per_movie.sort_values(by='rating', ascending=False).head(20).plot(
    kind='bar',
    x='movieId',
    y='rating',
    ax=plt.gca(),
    legend=False,
    color='salmon',
)
plt.title('Top 20 Movies by Average Rating')
plt.xlabel('Movie ID')
plt.ylabel('Average Rating')
plt.xticks(rotation=90)
plt.grid(axis='y')
plt.tight_layout()
plt.show()


In [None]:
# Number of rating rows submitted by each user.
ratings_per_user = ratings.groupby(by='userId').size()
ratings_per_user

In [None]:
# Histogram of how many ratings each user has given.
plt.figure(figsize=(5, 3))
ratings_per_user.hist(edgecolor='black', color='lightgreen', bins=30)
plt.xlabel('Number of Ratings')
plt.ylabel('Frequency')
plt.title('Number of Ratings per User')
plt.grid(True)
plt.show()

In [None]:
# Mean rating given by each user, indexed by userId.
average_rating_per_user = ratings.groupby(by='userId')['rating'].agg('mean')
average_rating_per_user



In [None]:
# Per-user summary: rating count and mean rating side by side, aligned on the
# shared userId index, which then becomes a regular column.
user_behavior = pd.concat(
    [
        ratings_per_user.rename('ratings_count'),
        average_rating_per_user.rename('average_rating'),
    ],
    axis=1,
).reset_index()

# Scatter of mean rating against activity; the x axis is logarithmic.
plt.figure(figsize=(5, 4))
plt.scatter(
    x=user_behavior['ratings_count'],
    y=user_behavior['average_rating'],
    edgecolors='k',
    alpha=0.5,
)
plt.xlabel('Number of Ratings')
plt.ylabel('Average Rating')
plt.title('Average Rating vs. Number of Ratings per User')
plt.xscale('log')
plt.yscale('linear')
plt.grid(True)
plt.show()

In [None]:
# Combine Ratings with Movies Metadata:

In [None]:
# Give the metadata key the same name as in the ratings table
# ('id' -> 'movieId'), then inner-join every rating with its movie's metadata.
movies_metadata = movies_metadata.rename({'id': 'movieId'}, axis='columns')

merged_df = ratings.merge(movies_metadata, on='movieId')
merged_df

In [None]:
# Mean rating per movie in the merged table, stored as 'average_rating'.
average_ratings = (
    merged_df.groupby('movieId')['rating']
    .mean()
    .rename('average_rating')
    .reset_index()
)
average_ratings

In [None]:
# Attach each movie's popularity score to its mean rating (inner join on movieId).
merged_ratings_popularity = average_ratings.merge(
    movies_metadata[['movieId', 'popularity']],
    on='movieId',
)
merged_ratings_popularity

In [None]:
# Scatter plot: mean user rating against popularity, one point per movie.
plt.figure(figsize=(5, 3))
plt.scatter(
    x=merged_ratings_popularity['popularity'],
    y=merged_ratings_popularity['average_rating'],
    edgecolors='k',
    alpha=0.5,
)
plt.xlabel('Popularity')
plt.ylabel('Average Rating')
plt.title('Average Rating vs. Popularity')
plt.grid(True)
plt.show()

In [None]:
# Parse release dates (unparseable values become NaT), derive the release
# year, and average the ratings per year.
merged_df['release_date'] = pd.to_datetime(merged_df['release_date'], errors='coerce')
merged_df['release_year'] = merged_df['release_date'].dt.year

rating_by_year = merged_df.groupby('release_year', as_index=False)['rating'].mean()
rating_by_year


In [None]:
# Line chart of the mean rating for each release year.
plt.figure(figsize=(5, 3))
plt.plot(
    rating_by_year['release_year'],
    rating_by_year['rating'],
    color='b',
    linestyle='-',
    marker='o',
)
plt.xlabel('Release Year')
plt.ylabel('Average Rating')
plt.title('Average Rating by Release Year')
plt.grid(True)
plt.show()


In [None]:
# Merge Credits with Movies Metadata

In [None]:
# Rename the credits key 'id' -> 'movieId' so it matches the metadata key,
# then preview the cast and crew columns.
credits = credits.rename({'id': 'movieId'}, axis='columns')
credits.loc[:, ['cast', 'crew']].head()

In [None]:
# Make sure the metadata key is called 'movieId'. rename() ignores a missing
# 'id' column by default, so this is harmless if the column was already renamed.
movies_metadata = movies_metadata.rename({'id': 'movieId'}, axis='columns')

movies_metadata.loc[:, ['movieId', 'title', 'release_date']].head()

In [None]:
# Inner-join credits with the movie metadata on movieId.
# This rebinds merged_df to the credits/metadata join.
merged_df = credits.merge(movies_metadata, on='movieId')
merged_df