In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the Amazon Prime titles CSV (path is relative to the working directory)
# and preview the first five rows.
amz = pd.read_csv("amazon_prime_titles.csv")
amz.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [3]:
# Column dtypes and non-null counts, to see which fields have gaps.
amz.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   director      7585 non-null   object
 4   cast          8435 non-null   object
 5   country       672 non-null    object
 6   date_added    155 non-null    object
 7   release_year  9668 non-null   int64 
 8   rating        9331 non-null   object
 9   duration      9668 non-null   object
 10  listed_in     9668 non-null   object
 11  description   9668 non-null   object
dtypes: int64(1), object(11)
memory usage: 906.5+ KB


## Director

In [54]:
# How should missing or empty values in columns like 'director', 'cast', 'country', and 'rating'
# be handled to ensure they do not skew category-based aggregations or comparisons?
# First step: count the titles that have no director recorded.
amz['director'].isna().sum()

np.int64(2083)

In [55]:
# Row labels of the titles whose cast is missing.
amz.loc[amz['cast'].isna()].index

Index([  20,   25,   33,   34,   70,   72,   74,   76,   77,   78,
       ...
       9559, 9564, 9586, 9612, 9622, 9636, 9637, 9645, 9647, 9658],
      dtype='int64', length=1233)

In [60]:
# 20 is the first row *label* reported by the missing-cast index lookup, so
# select it by label with .loc. (.iloc selects by position and only returns
# the same row while the frame keeps its default RangeIndex.)
amz.loc[20]


show_id                                                       s21
type                                                      TV Show
title                                                  Zoboomafoo
director                                                      NaN
cast                                                          NaN
country                                                       NaN
date_added                                                    NaN
release_year                                                 2001
rating                                                       TV-Y
duration                                                 1 Season
listed_in                                                    Kids
description     Chris and Martin Kratt bring their enthusiasm ...
Name: 20, dtype: object

In [17]:
# Row labels of the titles with no cast listed.
amz.loc[amz['cast'].isna(), :].index

Index([  20,   25,   33,   34,   70,   72,   74,   76,   77,   78,
       ...
       9559, 9564, 9586, 9612, 9622, 9636, 9637, 9645, 9647, 9658],
      dtype='int64', length=1233)

In [13]:
# Number of distinct country values; NaN is excluded from the count.
amz['country'].nunique(dropna=True)

86

## Cast


In [19]:
# Keep the rows with no cast information and print the first five as plain text.
missing_rows = amz.loc[amz['cast'].isna()]
print(missing_rows.head(n=5))


   show_id     type                             title            director  \
20     s21  TV Show                        Zoboomafoo                 NaN   
25     s26    Movie            Zambezi: Force of Life  Dr. Rudolf Lammers   
33     s34    Movie   Young Love (at the Sun Factory)                 NaN   
34     s35    Movie   Young Love (at Bowery Ballroom)                 NaN   
70     s71    Movie  World Inferno Friendship Society        Baeble Music   

   cast country date_added  release_year rating  duration  \
20  NaN     NaN        NaN          2001   TV-Y  1 Season   
25  NaN     NaN        NaN          2005    ALL    50 min   
33  NaN     NaN        NaN          2007     NR    36 min   
34  NaN     NaN        NaN          2007     NR    37 min   
70  NaN     NaN        NaN          2007     NR    40 min   

                    listed_in  \
20                       Kids   
25                Documentary   
33  Music Videos and Concerts   
34  Music Videos and Concerts   
70  M

## Rating

In [23]:
# Frequency of every rating label (most common first, NaN excluded),
# used to spot spelling variants that need consolidating.
amz['rating'].value_counts(sort=True, ascending=False, dropna=True)

rating
13+         2117
16+         1547
ALL         1268
18+         1243
R           1010
PG-13        393
7+           385
PG           253
NR           223
TV-14        208
TV-PG        169
TV-NR        105
G             93
TV-G          81
TV-MA         77
TV-Y          74
TV-Y7         39
UNRATED       33
NC-17          3
AGES_18_       3
NOT_RATE       3
AGES_16_       2
16             1
ALL_AGES       1
Name: count, dtype: int64

In [30]:
# Collapse spelling variants of the same rating into one canonical label.
# Keys are raw labels; values are the labels they are replaced with.
rating_map = {
    'AGES_16': '16+',
    '16': '16+',
    'AGES_16_': '16+',
    'AGES_18_': '18+',
    'ALL_AGES': 'ALL',
    'NOT_RATE': 'NR',
    'UNRATED': 'NR',
    # NOTE(review): NC-17 bars viewers aged 17 and under — confirm that
    # '17+' (rather than '18+') is the intended target label.
    'NC-17': '17+',
    'TV-NR': 'NR',
}

# Labels that are not keys of rating_map, and NaN, pass through unchanged.
amz['rating'] = amz['rating'].replace(to_replace=rating_map)

In [31]:
# Distinct rating labels left after consolidation, in order of first appearance.
amz.loc[:, 'rating'].unique()

array([nan, '13+', 'ALL', '18+', 'R', 'TV-Y', 'TV-Y7', 'NR', '16+',
       'TV-PG', '7+', 'TV-14', 'TV-G', 'PG-13', 'TV-MA', 'G', 'PG', '17+'],
      dtype=object)

In [33]:
# Treat titles with no rating as 'NR'.
# Assign the result back instead of calling fillna(..., inplace=True) on
# amz['rating']: that form mutates an intermediate Series, which pandas 2.x
# flags with a FutureWarning and which leaves amz unchanged once
# Copy-on-Write is in effect.
amz['rating'] = amz['rating'].fillna('NR')

In [34]:
# Share of missing values per column, expressed as a percentage of all rows.
missing_percent = amz.isna().sum().div(len(amz)).mul(100)
print(missing_percent)


show_id          0.000000
type             0.000000
title            0.000000
director        21.545304
cast            12.753413
country         93.049235
date_added      98.396773
release_year     0.000000
rating           0.000000
duration         0.000000
listed_in        0.000000
description      0.000000
dtype: float64


In [None]:
# What steps are required to standardize the 'date_added' column, converting string dates (e.g., "March 30, 2021") 
# into a consistent datetime format for time-series analysis?

In [None]:
# How can the 'duration' column be cleaned to separate numerical values from units (e.g., "113 min" or "1 Season"), and 
# should TV shows and movies be processed differently for uniformity?

In [None]:
# What approach should be taken to normalize the 'listed_in' column, 
# which contains multiple genres or categories (e.g., "Comedy, Drama"), for easier splitting and multi-label analysis?

In [None]:
# How should inconsistencies in the 'rating' column (e.g., "13+", "ALL", "PG-13", "NR") 
# be addressed to create standardized age rating categories?

In [None]:
# What methods can be used to detect and remove duplicate entries based on 'title' and 'release_year', 
# considering potential variations in formatting or partial matches?

In [None]:
# How should the 'cast' column be preprocessed to handle multiple actors (e.g., splitting comma-separated names) 
# for actor-based frequency or network analysis?

In [None]:
# What techniques are appropriate for validating and cleaning the 'release_year' 
# column to ensure all values are valid integers within a reasonable historical range?

In [None]:
# How can the 'description' column be preprocessed for text analysis, such as removing special characters, 
# handling truncation indicators (e.g., "..."), or tokenizing for keyword extraction?

In [None]:
# What preprocessing steps are needed to handle quoted strings in columns like 'title' or 'description' that may contain commas, to prevent parsing errors in CSV reading?


In [None]:
# What is the distribution of content types (Movie vs. TV Show) across different countries,
# and how does this vary by release year?
