In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# build a pandas Series from any iterable (here: a plain list of integers)
my_list = [2, 3, 7, 8, 4, 6, 2, 9]
series = pd.Series(data=my_list)
series

### Retrieve Data

In [None]:
# opendatasets is used to download the dataset from its URL
import opendatasets as od
# Kaggle page of the dataset; the CSV is later read from the
# "imdb-most-popular-films-and-series/" folder
dataset_url = "https://www.kaggle.com/datasets/mazenramadan/imdb-most-popular-films-and-series"
# NOTE(review): presumably this asks for Kaggle credentials on first use — confirm
od.download(dataset_url)

### Explore data

#### Load dataset

In [None]:
# read the downloaded CSV file into a DataFrame
imdb = pd.read_csv("imdb-most-popular-films-and-series/imdb.csv")
# a bare last expression is rendered by the notebook as the cell output
imdb

In [None]:
# show the first 5 rows of the dataset (what head() gives with no argument)
imdb.head()

In [None]:
# show the first n rows of the dataset
# general form: df.head(n)

imdb.head(10)

In [None]:
# show the last rows of the dataframe
# general form: df.tail()    # with no argument: the last 5 rows

imdb.tail()

In [None]:
# show the last n rows of the dataframe
# general form: df.tail(n)

imdb.tail(15)

In [None]:
# number of rows and columns of the dataframe, as a (rows, columns) tuple
# general form: df.shape

imdb.shape

In [None]:
# column labels of the dataframe
# general form: df.columns

imdb.columns

In [None]:
# row labels (the index) of the dataframe; this is not the row contents
# general form: df.index

imdb.index

In [None]:
# overview of the dataframe: all columns, the number of columns,
# the count of non-null values per column, and each column's data type
# general form: df.info()

imdb.info()

In the output above:
RangeIndex = 6178 entries (i.e. the dataset has 6178 rows in total).
The non-null count is also 6178 for every column,
which means that no column has any null values.

In [None]:
# summary statistics of the numeric columns
# general form: df.describe()

imdb.describe()
# at this point the only numeric column is Date

In [None]:
# describe() can also summarise object (string) columns:

# general form: df.describe(include='object')

imdb.describe(include="object")

# how to read the output, taking the Name column as the example:
# count  = number of non-null values in Name = 6178
# unique = number of distinct values in Name = 4820
# top    = the most frequent value in the column
# freq   = how many times that top value occurs

In [None]:
# data type of every column
# general form: df.dtypes

imdb.dtypes

### Filter Data (.loc[])

.loc[] selects the rows that match the labels or the boolean condition given inside the brackets.

new_df = df.loc[df.column == "value"]   (new_df is the subset of df where the condition is True)

In [None]:
# keep only the rows whose Name is exactly "King Kong"
kingkong = imdb.loc[imdb["Name"].eq("King Kong")]
kingkong

#### Check for the presence of null values (NaNs)
##### df.isna() : prints whole dataframe with boolean values (True means null, False means non-null)
##### df.isna().any():  prints list of columns, and if null value is present in the column (True/False)

In [None]:
# boolean frame: True where a value is null, False where it is not
imdb.isna()

In [None]:
# one boolean per column: True if the column contains at least one null value
imdb.isna().any()

#### Select one column only

In [None]:
# two equivalent ways to select a single column:
#   df.column      (attribute access)
#   df["column"]   (bracket access)
# either one returns a Series

imdb.Name
# only this last expression is displayed as the cell output
imdb["Name"]

In [None]:
# check the type of a single selected column
type(imdb.Name)

#### Select multiple columns

In [None]:
# pass a list of column names to select several columns at once
imdb[["Name", "Rate"]]

In [None]:
# value of one column at one row label
# e.g. the Name stored under row label 0 (the first row of this dataframe)

imdb.loc[0, "Name"]

In [None]:
# make a copy of the dataframe and work on the copy,
# so that the original dataframe is not affected
# general form: new_name = df.copy()

imdb1 = imdb.copy()
imdb1.head()

In [None]:
# same first rows as in the previous cell, but printed as plain text
print(imdb1.head())

### Data Manipulation

In the imdb dataset, the Votes column holds numbers, but they contain a comma as thousands separator (e.g. 107,163), so the column is recognized as object (string). To convert it into a number/integer, the comma has to be removed first.
One way to do this is a for loop over the rows.

In [None]:
# Remove the thousands separators from Votes (e.g. "107,163" -> "107163").
# The .str accessor processes the whole column and the result is written
# back with a single column assignment.
# Assigning element by element through imdb1['Votes'][i] = ... is chained
# indexing: it triggers SettingWithCopyWarning and, with copy-on-write
# enabled, the assignment never reaches imdb1.
imdb1["Votes"] = imdb1["Votes"].str.replace(",", "", regex=False)

In [None]:
imdb1.head()    # check: the commas in Votes should now be gone

In [None]:
# next: the same clean-up done with the apply() function, this time on imdb
imdb.head()

### Apply function

In [None]:
#define a function that replaces ',' with ''

def convert_votes(vote):
    """Return ``vote`` with its thousands separators (commas) removed.

    Parameters
    ----------
    vote : str or any
        A vote count as read from the CSV, e.g. ``"107,163"``.

    Returns
    -------
    str or any
        The string without commas (``"107163"``). A value that is not a
        string (for example NaN, or a number left by an earlier conversion)
        is returned unchanged, so re-running the cell does not raise.
    """
    if not isinstance(vote, str):
        return vote
    return vote.replace(',', '')

# run convert_votes on every value of the Votes column of imdb
# and store the cleaned column back into the dataframe
votes_without_commas = imdb["Votes"].map(convert_votes)
imdb["Votes"] = votes_without_commas

In [None]:
imdb.Votes      # the dtype is still object after removing the commas; the conversion to a numeric dtype follows below

### Convert Data Type:  .astype()

.astype() takes a dictionary {} with columnname and desired datatype.  

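.astype() can only convert values that are valid for the target type. Converting strings to integers works only if every string is a valid number, so it fails on a column that still contains text values such as "No Votes".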

df.astype({"Column":"DataType"})

#### Convert from one numeric form to another (int64 to int32)

In [None]:
imdb1.astype({"Date":"int32"})  # returns a converted dataframe; the result is not assigned, so imdb1 keeps its dtypes
imdb1.astype({"Date":"int32"}).dtypes   # columns and data types of that converted dataframe

#### Convert from string(object) form to numeric
### pd.to_numeric()
df["column"] = pd.to_numeric(df["column"], errors = "coerce", downcast = "numeric_data_type")


In [None]:
# convert Rate from object to numeric; errors="coerce" turns values that cannot be parsed into NaN
imdb1["Rate"] = pd.to_numeric(imdb1["Rate"], errors = "coerce", downcast = "integer")

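##### downcast --> shrink the result to the smallest suitable numeric dtype
downcast = 'integer' means that, after the conversion from object to numeric, the result is downcast to the smallest integer dtype that can hold all the values (e.g. int8, int16). If the values cannot be stored as integers (decimals or NaN are present), no downcasting is done and the column stays float64.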

In [None]:
# check the data types of imdb1 after converting Rate
imdb1.dtypes

In [None]:
# Convert Votes and Duration of imdb1 from object to numeric.
# The values are read from imdb1 itself (not from imdb), so this cell does not
# depend on another dataframe having been cleaned first.
# Votes: the thousands separators are stripped here as well; astype(str) keeps
# the cell re-runnable when the column is already numeric.
votes_text = imdb1["Votes"].astype(str).str.replace(",", "", regex=False)
imdb1["Votes"] = pd.to_numeric(votes_text, errors="coerce", downcast="integer")
# errors="coerce" turns text values (e.g. "No Votes") into NaN
imdb1["Duration"] = pd.to_numeric(imdb1["Duration"], errors="coerce", downcast="integer")

In [None]:
# check the non-null counts and data types again after the conversion
imdb1.info()

In the Rate, Votes, and Duration columns, the number of non-null values has decreased. This is because values that are not numbers (e.g. "No Votes") were converted to NaN.

In [None]:
# boolean frame: True where a value of imdb1 is null
imdb1.isnull()

In [None]:
# number of null values in each column
# general form: df.isnull().sum()

imdb1.isnull().sum()

In [None]:
imdb1.isna().sum()   # isna() gives the same result as isnull()

In [None]:
# after the conversion, describe() also covers the Rate, Votes and Duration columns
imdb1.describe()

In [None]:
# imdb1.isna().any() is True for every column that holds at least one NaN;
# used as the column selector of .loc, it keeps all rows (:) but only
# those columns, whatever their data type

imdb1.loc[:, imdb1.isna().any()]

In [None]:
# show the rows where one particular column is NaN:
# returns the full rows of the dataset whose Duration value is NaN

imdb1[imdb1.Duration.isna()]

In [None]:
# axis=0 acts on all ROWS within each COLUMN
# axis=1 acts on all COLUMNS within each ROW

imdb1[imdb1.isnull().any(axis=1)]

# shows the rows in which at least one column contains a NaN value

### Handling Missing Values

#### dropna() removes the rows that contain NaN values
If a row has a NaN value in any column, the entire row gets removed

##### Replace NaN value by 0 (for numeric columns)  using .replace(np.nan, 0)

In [None]:
# swap every NaN in the three converted columns for 0, one column at a time
for numeric_column in ("Rate", "Votes", "Duration"):
    imdb1[numeric_column] = imdb1[numeric_column].replace(np.nan, 0)

#### fillna() detects NaN values and replaces with desired value

In [None]:
# Demonstrate fillna() on a fresh copy of the original dataframe.
# The raw dataframe has no NaN and stores Rate/Votes/Duration as text, so the
# columns are converted to numeric first: errors="coerce" turns text values
# into NaN, and those NaN are what fillna(0) then replaces.
imdb2 = imdb.copy()
# strip thousands separators so that values like "107,163" parse as numbers
imdb2["Votes"] = imdb2["Votes"].astype(str).str.replace(",", "", regex=False)
for column in ("Rate", "Votes", "Duration"):
    imdb2[column] = pd.to_numeric(imdb2[column], errors="coerce")
imdb2 = imdb2.fillna(0)

In [None]:
imdb2.describe()

# NOTE(review): a minimum of 0 for Votes is expected only if imdb2 holds Votes as a
# numeric column whose NaN values were replaced with zero — confirm against the output

## Data Visualization

##### import seaborn library of python

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline    
import seaborn as sns

#inline enables matplotlib to display or visualize internally in notebook itself (not in another tab)

#### Year wise movies and web series count

In [None]:
# figsize is (width, height): 20 wide and 5 tall
plt.figure(figsize=(20, 5))

# To get the year-wise count, keep the 'Date' column on the x-axis
sns.countplot(x=imdb1["Date"].sort_values(ascending=False))

# rotate the x-axis labels so that the years do not overlap
plt.xticks(rotation=90)
plt.title("Year-wise count of films and series")

plt.show()

In [None]:
# set the seaborn theme before creating the figure so that it applies to all of it
sns.set(style="dark")
plt.figure(figsize=(10, 7))
# distribution of Rate in bins of width 1
# (rows whose Rate was NaN were set to 0 earlier, so they fall into the lowest bin)
sns.histplot(data=imdb1, x="Rate", color="brown", binwidth=1)
plt.title("Distribution of Rate")
plt.show()

In [None]:
# scatter plot of Rate (x-axis) against Votes (y-axis)

plt.figure(figsize=(20, 6))
sns.scatterplot(data=imdb1, x="Rate", y="Votes")
plt.show()

#### Q1. Top 10 Films and  Series as per rating

In [None]:
# the 10 rows of imdb1 with the largest Rate (films and series together)
top_10 = imdb1.nlargest(10, "Rate")
top_10

In [None]:
plt.figure(figsize = (5,6))
# count how many of the top 10 rows belong to each Type
plot = sns.countplot(x="Type", data=top_10)

In [None]:
# rows of the top 10 whose Type is "Film"
top_10.loc[top_10.Type=="Film"]

In [None]:
# rows of the top 10 whose Type is "Series"
top_10.loc[top_10.Type=="Series"]

#### Q2. Top 10 Films as per rating and voting

In [None]:
# keep only the rows of imdb1 whose Type is "Film"
films = imdb1.loc[imdb1.Type=="Film"]

In [None]:
# the 10 films with the largest Rate; show only their Name and Rate
top_films = films.nlargest(10, "Rate")
top_films[["Name", "Rate"]]

In [None]:
# the 10 films with the largest Votes; show only their Name and Votes
# note: this reuses (overwrites) the name top_films from the rating cell
top_films = films.nlargest(10, "Votes")
top_films[["Name", "Votes"]]

#### Q3. Get the detail of the third movie of the dataframe

In [None]:
# iloc selects by position, counting from 0, so position 2 is the third row
imdb1.iloc[2]

#### Q4 Sort movies by duration, with the longest movie at the top

In [None]:
# ascending=False puts the largest Duration first; the result is only displayed, not assigned back to imdb1
imdb1.sort_values("Duration", ascending=False)

#### Q5. List the movies belonging to each genre

In [None]:
imdb2 = imdb1.copy()