# 📺 Netflix Data Analysis Project

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Render matplotlib figures directly beneath the cell that creates them.
%matplotlib inline
# Apply seaborn's "whitegrid" style globally to all figures drawn after this point.
sns.set(style="whitegrid")


ModuleNotFoundError: No module named 'pandas'

In [None]:

# Read the Netflix catalogue from the CSV file stored relative to the
# notebook's working directory, then preview the first five rows.
data = pd.read_csv(filepath_or_buffer="data/netflix1.csv")
data.head(5)


In [None]:

# Report how many nulls each column holds before any cleaning is applied.
null_counts = data.isnull().sum()
print("Missing Values:\n", null_counts)

# Drop exact duplicate rows and parse `date_added` into datetimes in a single
# chained expression; values that cannot be parsed become NaT (errors='coerce').
data = data.drop_duplicates().assign(
    date_added=lambda frame: pd.to_datetime(frame['date_added'], errors='coerce')
)

# Show the resulting column dtypes so the datetime conversion can be verified.
data.dtypes


In [None]:

# Count titles per content type and shape the result into a two-column frame
# (`type`, `count`) for plotting.
type_counts = data['type'].value_counts()
showtype = type_counts.rename_axis('type').reset_index(name='count')

# Pie chart of each type's share, with the percentage printed on each wedge.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    showtype['count'],
    labels=showtype['type'],
    autopct='%1.1f%%',
    colors=['gold', 'skyblue'],
)
ax.set_title("Distribution of Movies vs TV Shows")
plt.show()


In [None]:

# Split the comma-separated `listed_in` string into a list of genre labels per
# title. Labels are stripped so inconsistent spacing after commas does not
# create near-duplicate genres, and empty labels are discarded so a missing
# `listed_in` yields an empty list instead of [''] (which would otherwise be
# counted as a genre named '').
data['genres'] = data['listed_in'].fillna('').apply(
    lambda labels: [label.strip() for label in labels.split(',') if label.strip()]
)

# explode() flattens the per-title lists in linear time; the previous
# sum(lists, []) re-copied the accumulated list on every row (quadratic).
# Empty lists explode to NaN, which dropna() removes.
all_genres = data['genres'].explode().dropna().tolist()
genre_counts = pd.Series(all_genres, dtype='object').value_counts().head(10)

# Horizontal bar chart of the ten most frequent genre labels.
plt.figure(figsize=(10, 6))
sns.barplot(x=genre_counts.values, y=genre_counts.index, palette='viridis')
plt.title('Most Common Genres on Netflix')
plt.xlabel('Count')
plt.ylabel('Genre')
plt.show()


In [None]:

# Calendar year in which each title was added. Requires `date_added` to already
# be a datetime column; rows with a missing date (NaT) get NaN here.
data['year_added'] = data['date_added'].dt.year

# If any date is missing, the year column is float and the x-axis tick labels
# would render as "2019.0". Plot from an integer view with the missing years
# dropped; the bar heights are unchanged because missing values are not counted.
years_added = data['year_added'].dropna().astype(int)

plt.figure(figsize=(12,6))
sns.countplot(x=years_added, palette='coolwarm')
plt.title("Content Added Over Time")
plt.xlabel('Year')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:

# Collect the non-null titles of every row whose type is 'Movie'.
is_movie = data['type'] == 'Movie'
movie_titles = data.loc[is_movie, 'title'].dropna()

# Build the word cloud from all movie titles joined into one space-separated string.
title_text = ' '.join(movie_titles)
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='black',
).generate(title_text)

# Show the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Most Frequent Words in Movie Titles")
plt.show()


In [None]:

# Number of directors shown; drives both the slice and the chart title so the
# two cannot drift apart (the slice previously returned only 14 rows).
N_TOP_DIRECTORS = 15

# Titles per director, most frequent first, as a (`director`, `count`) frame.
directors = data['director'].value_counts().reset_index()
directors.columns = ['director', 'count']

# NOTE(review): the most frequent value (row 0) is skipped, presumably because
# it is a placeholder for a missing director rather than a real name — confirm
# against the data.
top_directors = directors.iloc[1:1 + N_TOP_DIRECTORS]

plt.figure(figsize=(10,6))
sns.barplot(x='count', y='director', data=top_directors, palette='YlGnBu')
plt.title(f"Top {N_TOP_DIRECTORS} Directors with Most Titles")
plt.show()


In [None]:

# Ten most frequent values of `country`, shaped into a (`country`, `count`) frame.
country_counts = data['country'].value_counts().head(10)
top_countries = country_counts.rename_axis('country').reset_index(name='count')

# Vertical bar chart; x labels are rotated so longer country names stay legible.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(x='country', y='count', data=top_countries, palette='Set2', ax=ax)
ax.set_title("Top 10 Countries with Most Netflix Titles")
plt.xticks(rotation=45)
plt.show()


In [None]:

# Number of titles for every (rating, type) combination.
ratings = data.groupby(['rating', 'type']).size().reset_index(name='count')


def _top_ratings_for(show_type):
    """Return up to ten rating rows for one content type, highest count first."""
    subset = ratings[ratings['type'] == show_type]
    return subset.sort_values(by='count', ascending=False).head(10)


movie_ratings = _top_ratings_for('Movie')
tv_ratings = _top_ratings_for('TV Show')

# One bar chart per content type; only the frame, palette and title differ.
for rating_frame, palette_name, chart_title in (
    (movie_ratings, 'colorblind', 'Top Movie Ratings'),
    (tv_ratings, 'pastel', 'Top TV Show Ratings'),
):
    plt.figure(figsize=(10,6))
    sns.barplot(data=rating_frame, x='rating', y='count', palette=palette_name)
    plt.title(chart_title)
    plt.xticks(rotation=45)
    plt.show()
