# Day 4 – SQL-Style Analysis Using Pandas (Netflix Data)




In [9]:
import pandas as pd

# Load the cleaned Netflix CSV into a DataFrame that the later cells query.
df = pd.read_csv(filepath_or_buffer="/content/netflix_cleaned.csv")

# Preview the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,id,title,type,description,year,age_rating,runtime,genres,production_countries,seasons,imdb_id,imdb_rating,imdb_votes_count,tmdb_popularity,tmdb_score,audience_group
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,documentation,['US'],1.0,,,,0.6,,Adults
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"drama, crime",['US'],,tt0075314,8.2,808582.0,40.965,8.179,Adults
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"drama, action, thriller, european",['US'],,tt0068473,7.7,107673.0,10.01,7.3,Adults
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"fantasy, action, comedy",['GB'],,tt0071853,8.2,534486.0,15.461,7.811,Kids
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,Unknown,150,"war, action","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6,Unknown


In [10]:
# SQL equivalent (table name illustrative):
#   SELECT type, COUNT(*) AS count FROM netflix GROUP BY type;
(
    df.groupby(by="type")
    .size()                    # rows per group -> Series indexed by type
    .to_frame(name="count")    # name the value column
    .reset_index()             # turn the group key back into a column
)

Unnamed: 0,type,count
0,MOVIE,3743
1,SHOW,2106


In [11]:
# SQL equivalent (table name illustrative):
#   SELECT COUNT(*) FROM netflix WHERE type = 'MOVIE' AND year > 2015;
len(
    df.loc[
        # Both conditions must hold, mirroring an AND in the WHERE clause.
        df["type"].eq("MOVIE") & df["year"].gt(2015)
    ]
)

2783

In [12]:
import ast

# Top 5 production countries by number of titles.
#
# `production_countries` holds stringified Python lists (e.g. "['GB', 'US']"),
# so each cell is parsed with ast.literal_eval and then exploded to one row per
# (title, country) pair before counting.
#
# The index is named with rename_axis() and the counts with
# reset_index(name=...) rather than renaming columns after the fact: the column
# labels produced by value_counts().reset_index() differ between pandas
# versions, which made the previous rename({"index": ..., 0: ...}) a silent
# no-op and left the column called "production_countries".
(
    df["production_countries"]
    .apply(ast.literal_eval)      # "['GB', 'US']" -> ['GB', 'US']
    .explode()                    # one row per country
    .value_counts()               # descending frequency per country
    .head(5)
    .rename_axis("country")       # label the index before it becomes a column
    .reset_index(name="count")    # columns: country, count
)

Unnamed: 0,production_countries,count
0,US,2323
1,IN,622
2,GB,404
3,JP,287
4,FR,248


In [13]:
# SQL equivalent (table name illustrative):
#   SELECT age_rating, COUNT(*) FROM netflix
#   GROUP BY age_rating ORDER BY COUNT(*) DESC;
(
    df.groupby(by="age_rating")
    .size()                          # number of titles per rating
    .sort_values(ascending=False)    # most common rating first
)

Unnamed: 0_level_0,0
age_rating,Unnamed: 1_level_1
Unknown,2618
TV-MA,883
R,556
TV-14,474
PG-13,451
PG,233
TV-PG,188
G,124
TV-Y7,120
TV-Y,107


In [14]:
# SQL equivalent (table name illustrative):
#   SELECT year, COUNT(*) AS count FROM netflix
#   GROUP BY year HAVING COUNT(*) > 100 ORDER BY year;
(
    df.groupby(by="year")
    .size()                                             # titles per release year
    .to_frame(name="count")
    .reset_index()                                      # columns: year, count
    .loc[lambda per_year: per_year["count"].gt(100)]    # HAVING count > 100
    .sort_values(by="year")
)

Unnamed: 0,year,count
52,2012,107
53,2013,135
54,2014,153
55,2015,222
56,2016,362
57,2017,563
58,2018,773
59,2019,836
60,2020,814
61,2021,787


SQL-style reasoning notes:

- groupby() is used to replicate GROUP BY queries.
- Boolean filtering replicates WHERE conditions.
- value_counts() replicates GROUP BY + COUNT(*) + ORDER BY count DESC in a single call, which helps quickly identify dominant categories.
- Thinking in SQL terms improves clarity and correctness.
