In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from fbprophet import Prophet
import pycountry
import plotly.express as px
from collections import namedtuple
import pandas_profiling
from IPython.display import display
import collections, numpy

In [None]:
# Read the IMDb India movies CSV into the working frame and preview it.
csv_path = '../input/imdb-india-movies/IMDb Movies India.csv'
df = pd.read_csv(csv_path)
df.head()

# Plotting and distribution of dataset.

### The profiling report below helps to understand the data and gave me the insight needed to analyze the dataset correctly.

In [None]:
# Build a pandas-profiling report for the freshly loaded frame and render it
# inline in the notebook.
report = pandas_profiling.ProfileReport(df)
display(report)

## Cleaning the Data.
1. After looking at the Pandas Profiling Report we can see that there are multiple duplicate values in the data set; these will need to be removed.
2. There are many missing values in the data set. If a movie does not have the required rating, year or director name, it is of no use for visualising the data and will be removed.
3. The year field has brackets which we will need to remove, and the movie duration field has "min" at the end which will also have to be removed, so that we have integers and not strings for our analysis.
4. The Genre field has multiple entries in one column, separated by a ',', and we need to separate these into multiple unique values.

In [None]:
# (rows, columns) of the frame before any cleaning step has been applied.
df.shape

In [None]:
# Rows in which every column at positions 1..8 (iloc slice 1:9) is missing.
all_missing = df.iloc[:, 1:9].isna().all(axis=1)
null_rows = df[all_missing]
print("Below are the Missing values for each column from 1 to 9:")
null_rows.head()

In [None]:
# Keep only rows that have at least one value in the columns at positions 1..8.
has_some_value = ~df.iloc[:, 1:9].isna().all(axis=1)
df = df[has_some_value]
df.shape

In [None]:
# Mark every member of a (Name, Year) duplicate group (keep=False flags all copies).
is_duplicate = df.duplicated(subset=['Name', 'Year'], keep=False)
duplicate = df[is_duplicate]
print("Below are the duplicate rows according to Name and Year:")
duplicate.head()

In [None]:
# Keep the first row of each (Name, Year) group.
# Rebinding `df` instead of using inplace=True avoids mutating a frame that was
# itself produced by a boolean filter, and keeps the step chainable.
df = df.drop_duplicates(subset=['Name', 'Year'])
df.shape

In [None]:
# Rows in which the columns at positions 1, 2, 4 and 5 are all missing.
key_positions = [1, 2, 4, 5]
all_missing = df.iloc[:, key_positions].isna().all(axis=1)
null_rows = df[all_missing]
print("Below are the Missing values for each column from 1 to 5 excluding Genre:")
null_rows.head()

In [None]:
# Keep rows that have a value in at least one of the columns at positions 1, 2, 4, 5.
key_positions = [1, 2, 4, 5]
has_some_value = ~df.iloc[:, key_positions].isna().all(axis=1)
df = df[has_some_value]
df.shape

In [None]:
# Strip the brackets from the Year strings.
# '(' and ')' are regex metacharacters, so the replacement is requested as a
# literal match (regex=False) rather than relying on the pandas default, which
# differs between pandas versions.
# `assign` returns a new frame, so no column is written into a filtered slice.
df = df.assign(
    Year=df['Year']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
df.head()

In [None]:
# Remove the ' min' suffix so Duration holds only the number (still as text).
duration_text = df['Duration'].str.replace(r' min', '')
df['Duration'] = duration_text
df.head()

In [None]:
# Remove every row whose Year string is exactly '2022'.
rows_2022 = df.index[df['Year'] == '2022']
df.drop(rows_2022, inplace=True)
df.shape

## Data cleaning process has been completed without losing much data.

# Plotting my data in charts.

Please open link to view code.

In [None]:
# Number of movies (non-null Name values) per release year.
# A direct groupby count replaces groupby.apply with a per-group lambda.
year_count = df.groupby('Year')['Name'].count().reset_index(name='Count')

fig = px.bar(
    year_count,
    x='Year',
    y='Count',
    text='Count',
    title='Number of movie by year of launch',
)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Axis title font is set through title.font.size; the legacy `titlefont`
# attribute is deprecated.
fig.update_layout(
    uniformtext_mode='show',
    xaxis=dict(
        title=dict(text='Year of Movie Release', font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text='Count of Movies Released', font=dict(size=16)),
        tickfont_size=14,
    ),
)
fig.show()

In [None]:
# One indicator column per genre, split on ', ' from the combined Genre string.
dummies = df['Genre'].str.get_dummies(sep=', ')

# Append the indicators to the movie frame; zeros become NaN across the frame.
df_genre = pd.concat([df, dummies], axis=1).replace(0, np.nan)

# Total number of movies per genre, most frequent first.
non_genre_columns = [
    'Name', 'Year', 'Duration', 'Genre', 'Rating',
    'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3',
]
genre_totals = df_genre.drop(non_genre_columns, axis=1).sum()
genre_totals.sort_values(ascending=False)

In [None]:
# Per-year genre totals: drop everything except Year and the genre indicators,
# sum within each year, then turn Year back into a regular column.
columns_to_drop = [
    'Name', 'Duration', 'Genre', 'Rating', 'Votes',
    'Director', 'Actor 1', 'Actor 2', 'Actor 3',
]
df_genre_count = (
    df_genre
    .drop(columns_to_drop, axis=1)
    .groupby('Year')
    .sum()
    .reset_index(level=0)
)

In [None]:
# Colour assigned to each genre trace; traces are added in this order, which is
# also the legend order.
genre_colors = {
    'Action': '#2E91E5',
    'Adventure': '#E15F99',
    'Animation': '#1CA71C',
    'Biography': '#FB0D0D',
    'Comedy': '#DA16FF',
    'Crime': '#222A2A',
    'Documentary': '#B68100',
    'Drama': '#750D86',
    'Family': '#EB663B',
    'Fantasy': '#511CFB',
    'History': '#00A08B',
    'Horror': '#FB00D1',
    'Music': '#FC0080',
    'Musical': '#B2828D',
    'Mystery': '#6C7C32',
    'News': '#778AAE',
    'Reality-TV': '#862A16',
    'Romance': '#A777F1',
    'Sci-Fi': '#620042',
    'Short': '#1616A7',
    'Sport': '#DA60CA',
    'Thriller': '#6C4516',
    'War': '#0D2A63',
    'Western': '#AF0038',
}

# One line per genre: yearly count of movies tagged with that genre.
# Column lookup by name also covers genres such as 'Sci-Fi' that are not valid
# attribute names.
fig = go.Figure()
for genre, color in genre_colors.items():
    fig.add_trace(go.Scatter(
        x=df_genre_count['Year'],
        y=df_genre_count[genre],
        mode='lines',
        name=genre,
        marker_color=color,
    ))

# Axis title font is set through title.font.size; the legacy `titlefont`
# attribute is deprecated.
fig.update_layout(
    title='Genre through the Years',
    xaxis_tickfont_size=14,
    height=800,
    yaxis=dict(
        title=dict(text='Count', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        y=0,
        x=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
)
fig.show()

In [None]:
# One (Actor, Year) frame per billing slot, with the slot column renamed to 'Actor'.
Actor1, Actor2, Actor3 = (
    df[[slot, 'Year']].rename(columns={slot: 'Actor'})
    for slot in ('Actor 1', 'Actor 2', 'Actor 3')
)

# Stack the three slots into one long frame, drop incomplete rows and tag each
# appearance with a count of 1.
Actor_Year = (
    pd.concat([Actor1, Actor2, Actor3], ignore_index=True)
    .dropna()
    .assign(Count=1)
)

# Number of appearances per actor, most frequent first.
appearances = Actor_Year['Actor'].value_counts()
Actor_Top = appearances.rename_axis('Actor').reset_index(name='Count')

In [None]:
# Bar chart of the 20 actors with the most appearances.
top_20_actors = Actor_Top.head(20)
fig = px.bar(
    top_20_actors,
    x='Actor',
    y='Count',
    title='Top 20 actors by number of Movies made',
)
fig.show()

In [None]:
# Names of the 20 most frequent actors, in rank order.
Top_Actor = Actor_Top['Actor'].head(20)

# Keep only appearances by those actors, group the rows in rank order via the
# index lookup, then order everything by year.
is_top_actor = Actor_Year['Actor'].isin(Top_Actor)
Actor_Year = (
    Actor_Year[is_top_actor]
    .set_index('Actor')
    .loc[Top_Actor]
    .reset_index(level=0)
    .sort_values('Year')
)

# Convert the year strings to datetimes for the time axis.
Actor_Year['Year'] = pd.to_datetime(Actor_Year['Year'])

In [None]:
# Strip plot: one point per appearance, one row (and colour) per top-20 actor.
actor_strip_options = dict(
    x="Year",
    y='Actor',
    color="Actor",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=600,
    title='Top 20 actors by number of Movies made through the Years',
)
fig = px.strip(Actor_Year, **actor_strip_options)
fig.show()

In [None]:
# (Director, Year) pairs with no missing values, each tagged with a count of 1.
Director_Year = df[['Director', 'Year']].dropna().assign(Count=1)

# Number of movies per director, most frequent first.
movies_per_director = Director_Year['Director'].value_counts()
Director_Top = movies_per_director.rename_axis('Director').reset_index(name='Count')

In [None]:
# Bar chart of the 20 directors with the most movies.
top_20_directors = Director_Top.head(20)
fig = px.bar(
    top_20_directors,
    x='Director',
    y='Count',
    title='Top 20 Director by number of Movies made',
)
fig.show()

In [None]:
# Names of the 20 most frequent directors, in rank order.
Top_Director = Director_Top['Director'].head(20)

# Keep only movies by those directors, group the rows in rank order via the
# index lookup, then order everything by year.
is_top_director = Director_Year['Director'].isin(Top_Director)
Director_Year = (
    Director_Year[is_top_director]
    .set_index('Director')
    .loc[Top_Director]
    .reset_index(level=0)
    .sort_values('Year')
)

# Convert the year strings to datetimes for the time axis.
Director_Year['Year'] = pd.to_datetime(Director_Year['Year'])

In [None]:
# Strip plot: one point per movie, one row (and colour) per top-20 director.
director_strip_options = dict(
    x="Year",
    y='Director',
    color="Director",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=600,
    title='Top 20 Director by number of Movies made through the Years',
)
fig = px.strip(Director_Year, **director_strip_options)
fig.show()

In [None]:
# Numeric frame of Duration, Rating and Votes for the plots below.
dur_rat = df[['Duration', 'Rating', 'Votes']].dropna()
dur_rat['Duration'] = dur_rat['Duration'].astype('int64')

# NOTE(review): Votes is assumed to hold text that may contain thousands
# separators (e.g. '1,086') - confirm against the CSV. Without stripping the
# commas, such values are coerced to NaN and every movie with 1,000+ votes
# would vanish from the plots.
votes_text = dur_rat['Votes'].astype(str).str.replace(',', '', regex=False)
votes = pd.to_numeric(votes_text, errors='coerce')

# Unparseable entries are already NaN; zero vote counts are treated as missing
# too. `np.nan` replaces `pd.np.nan`, an alias removed from pandas.
# The former trailing `.dropna()` is omitted: assigning back to the column
# realigns on the index, so it never removed anything.
dur_rat['Votes'] = votes.replace(0, np.nan)

dur_rat = dur_rat.sort_values('Duration', ascending=True)

In [None]:
# 3D scatter of duration against rating and votes, coloured by rating.
fig = px.scatter_3d(
    dur_rat,
    x='Duration',
    y='Rating',
    z='Votes',
    color='Rating',
    title='3D Plot of Duration, Rating and Votes',
)
fig.show()

In [None]:
# Pairwise plot grid over the dur_rat frame (Duration, Rating, Votes).
sns.pairplot(dur_rat)
plt.show()

# 1st draft of dataset.

## Do let me know what you think and what more you would like to see. Please comment and let me know.