# 🎬 Movies Dataset Analysis Notebook

# Load Dataset

In [9]:

import io

import pandas as pd

from google.colab import files

# files.upload() opens the Colab upload widget and returns a dict that maps
# each saved filename to the uploaded file's raw bytes.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; upload movies.csv and re-run this cell.")

# Load dataset
# Colab renames an upload when a file with the same name already exists
# (e.g. "movies.csv" is saved as "movies (1).csv"), so reading the hard-coded
# path "movies.csv" would silently load an older copy. Parsing the returned
# bytes guarantees the frame comes from the file that was just uploaded.
uploaded_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
df.head()


Saving movies.csv to movies (1).csv


Unnamed: 0,title,genres,keywords,overview
0,The Shawshank Redemption,Drama,"prison, friendship, hope",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama","mafia, crime family, loyalty",The aging patriarch of an organized crime dyna...
2,The Dark Knight,"Action, Crime, Drama","joker, vigilante, chaos","Batman faces the Joker, a criminal mastermind ..."
3,Inception,"Action, Sci-Fi, Thriller","dream, subconscious, heist",A thief who steals corporate secrets through d...
4,Interstellar,"Adventure, Drama, Sci-Fi","space travel, time dilation, love",A team of explorers travel through a wormhole ...


# Basic Information

In [10]:

# Check shape, columns, and data types
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
# sep="" stops print() from inserting a space between the trailing "\n" of the
# label and the Series repr, which would shift the first printed row by one
# column and misalign it with the rows below.
print("\nData Types:\n", df.dtypes, sep="")
print("\nMissing Values:\n", df.isnull().sum(), sep="")


Shape: (10, 4)
Columns: ['title', 'genres', 'keywords', 'overview']

Data Types:
 title       object
genres      object
keywords    object
overview    object
dtype: object

Missing Values:
 title       0
genres      0
keywords    0
overview    0
dtype: int64


# Genre Analysis

In [11]:

# Count most common genres
from collections import Counter

# Each row holds a comma-separated list of genres. Split on the bare comma and
# strip afterwards so "Crime,Drama" and "Crime , Drama" are tokenised the same
# way as "Crime, Drama"; explode() yields one genre per row in original order.
genre_tokens = (
    df['genres']
    .dropna()
    .astype(str)
    .str.split(',')
    .explode()
    .str.strip()
)
# Discard empty tokens produced by blank cells or trailing commas.
genre_list = genre_tokens[genre_tokens != ''].tolist()

genre_counts = Counter(genre_list)
print("Most Common Genres:")
print(genre_counts.most_common(10))


Most Common Genres:
[('Drama', 8), ('Crime', 3), ('Action', 3), ('Sci-Fi', 3), ('Romance', 3), ('Thriller', 2), ('Adventure', 2), ('Comedy', 1)]


# Keyword Analysis

In [12]:

# Count most common keywords
# Each row holds a comma-separated list of keywords. Split on the bare comma
# and strip afterwards so inconsistent spacing around commas does not create
# distinct tokens; explode() yields one keyword per row in original order.
keyword_tokens = (
    df['keywords']
    .dropna()
    .astype(str)
    .str.split(',')
    .explode()
    .str.strip()
)
# Discard empty tokens produced by blank cells or trailing commas.
keyword_list = keyword_tokens[keyword_tokens != ''].tolist()

keyword_counts = Counter(keyword_list)
print("Most Common Keywords:")
print(keyword_counts.most_common(10))


Most Common Keywords:
[('love', 3), ('prison', 1), ('friendship', 1), ('hope', 1), ('mafia', 1), ('crime family', 1), ('loyalty', 1), ('joker', 1), ('vigilante', 1), ('chaos', 1)]


# Overview Length Analysis

In [13]:

# Overview length analysis
# Number of whitespace-separated words in each overview. Missing overviews are
# replaced by '' first so they count as 0 words; converting NaN with str()
# would yield the one-word string "nan" and report a length of 1.
df['overview_length'] = (
    df['overview']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)
df[['title', 'overview_length']].head()


Unnamed: 0,title,overview_length
0,The Shawshank Redemption,19
1,The Godfather,18
2,The Dark Knight,20
3,Inception,24
4,Interstellar,17


# Movie Recommendation System

In [14]:

# Simple Content-Based Movie Recommender
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Combine genres + keywords + overview into a single text field per movie
df['combined'] = df['genres'].fillna('') + ' ' + df['keywords'].fillna('') + ' ' + df['overview'].fillna('')

# Vectorize text
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined'])

# Cosine similarity
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Title -> row position lookup used by the recommendation function.
# Positions (0..len(df)-1) are stored rather than df.index labels because
# cosine_sim and .iloc are both positional. Duplicates are removed on the
# *title* index: Series.drop_duplicates() compares values (the row numbers,
# which are always unique) and therefore never removed a repeated title.
title_positions = pd.Series(range(len(df)), index=df['title'])
indices = title_positions[~title_positions.index.duplicated(keep='first')]

def recommend(title, n=5):
    """Return the titles of the movies most similar to ``title``.

    Relies on the notebook-level ``indices`` (title -> row position),
    ``cosine_sim`` (pairwise similarity matrix) and ``df``.

    Parameters
    ----------
    title : str
        Exact title of the movie to find neighbours for.
    n : int, default 5
        Maximum number of recommendations; values <= 0 give an empty result.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar, never including the
        queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"Title not found in dataset: {title!r}")

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the queried movie explicitly. Slicing off the first sorted entry is
    # wrong when another movie ties with it at the maximum similarity: the
    # query itself could then be returned and a genuine match discarded.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Stable sort: equally similar movies keep their dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:max(n, 0)]]
    return df['title'].iloc[movie_indices]

# Example: the five movies most similar to Inception
recommend("Inception", n=5)


Unnamed: 0,title
7,Avengers: Endgame
4,Interstellar
8,Joker
2,The Dark Knight
0,The Shawshank Redemption



## Conclusion  
We explored the movies dataset by analyzing genres, keywords, and overviews.  
Using text similarity (TF-IDF + cosine similarity), we built a simple **movie recommender system** that suggests similar movies based on content.  
