## EDA ON IMDB DATA SET

### import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

### import the csv file

In [None]:
# Load the IMDB movies CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/imdb-top-1000-movies/regex_imdb.csv")

### 01. Get the basic information about the data set

In [None]:
# Preview the first rows of the data set.
df.head()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Summary printed by DataFrame.info().
df.info()

In [None]:
# NOTE(review): repeats the dtypes cell above; df is not modified in between.
df.dtypes

### 02. Check the null values

In [None]:
# Share of missing entries per column, expressed as a percentage.
print("Percentage Of Missing Values")
Perc_Of_Missing_Values = df.isna().mean() * 100
# Show only the columns that actually contain missing values.
Perc_Of_Missing_Values.loc[Perc_Of_Missing_Values.ne(0)]

In [None]:
# Bar chart of the missing-value percentages, restricted to affected columns.
has_missing = Perc_Of_Missing_Values != 0
only_missing = Perc_Of_Missing_Values[has_missing]
only_missing.plot.bar()
plt.title("% Age of Missing Values")
plt.show()

#### As we can see, 22% of the Meta_score values and 17% of the Gross values are null. Some values are also missing in the Director column.

#### So we will impute the missing Meta_score and Gross values with their medians, and the missing Director values with the most frequent director

In [None]:
df["Meta_score"]=df["Meta_score"].fillna(df["Meta_score"].median())

In [None]:
import numpy as np

In [None]:
df["Meta_score"]=df["Meta_score"].apply(np.round)

In [None]:
df["Gross"]=df["Gross"].fillna(df["Gross"].median())

In [None]:
#df["Gross"]=df["Gross"].apply(np.round,decimals=2)

In [None]:
df["Director"]=df["Director"].fillna(df["Director"].mode()[0])

In [None]:
df.isna().sum()

#### check the data types and change them

In [None]:
# Inspect the column data types before converting any of them.
df.dtypes

#### The Meta_score data type is float; let's change it to integer

In [None]:
df["Meta_score"]=df["Meta_score"].astype("int")

In [None]:
df.dtypes

##  03.Now let's do some analysis

##  i) Univariate

In [None]:
df.head(2)

In [None]:
plt.figure(figsize=(10,5))
sns.distplot(df['Run_time'],kde=False, color=['red'])
plt.title('Distplot with Normal distribution for run time',fontweight="bold")
plt.show()

In [None]:
# Density histogram of Meta_score with a KDE curve.
# sns.distplot is deprecated; this histplot call is the equivalent given in
# the seaborn migration guide for distplot's default (hist + kde) output.
plt.figure(figsize=(10,5))
sns.set(style="darkgrid")
sns.histplot(df["Meta_score"], kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("DISTPLOT WITH  META_SCORE")
plt.show()

In [None]:
# Histogram of Gross.
# sns.distplot is deprecated; sns.histplot is its documented replacement and,
# like distplot(kde=False), draws raw counts without a density curve
# (bin edges may differ slightly from the old default).
plt.figure(figsize=(10,5))
sns.histplot(df['Gross'], color='red')
# No normal curve is drawn, so the title no longer claims one.
plt.title('Distribution of Gross',fontweight="bold")
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.set(style="darkgrid")
px = sns.countplot(x="Rating", data=df,palette="Set1",  order=df['Rating'].value_counts().index[0:])
plt.title("COUNT OF RATINGS")
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.set(style="darkgrid")
sns.countplot(y="Genre", data=df,  order=df['Genre'].value_counts(ascending=False).index[0:10])
plt.title("COUNT OF GENRES")
plt.show()

In [None]:
# Histogram of release years.
# sns.distplot is deprecated; sns.histplot is its documented replacement and,
# like distplot(kde=False), draws raw counts without a density curve
# (bin edges may differ slightly from the old default).
plt.figure(figsize=(10,5))
sns.set(style="darkgrid")
sns.histplot(df["Year"])
plt.title('DISTPLOT WITH YEAR')
plt.show()

In [None]:
# Number of movies released in each year.
plt.figure(figsize=(20,10))
# Pass the column by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as `x`.
sns.countplot(x="Year", data=df)
plt.title("MOVIES RELEASED PER YEAR",fontsize=20)
plt.xlabel("RELEASED YEAR")
plt.ylabel("NUMBER OF MOVIES")
plt.xticks(rotation=90)
plt.show()


In [None]:
plt.figure(figsize=(10,5))
sns.set(style="darkgrid")
sns.countplot(y="Year", data=df, palette="Set2", order=df['Year'].value_counts(ascending=False).index[0:10])
plt.title('Movies released per year')
plt.show()


In [None]:
plt.figure(figsize=(6,5))
sns.countplot(x="Director",data=df,palette="Set1",  order=df['Director'].value_counts().index[0:10])
plt.xticks(rotation=45)
plt.title("WHICH DIRECTOR DIRECTS MOST NUMBER OF MOVIES")
plt.show()

In [None]:
plt.figure(figsize=(6,5))
sns.set(style="darkgrid")
ax = sns.countplot(y="Gross", data=df, palette="Set2", order=df['Gross'].value_counts(ascending=False).index[0:10])
plt.title('COUNT OF GROSS OF  MOVIES')
plt.show()

##  ii) Bivariate analysis

In [None]:
df.head(2)

In [None]:
# Cross-tabulate Run_time against Year (a table of counts), then take the
# mean of each Year column.
# NOTE(review): this averages counts, not run times - presumably the per-year
# mean run time (computed with groupby further below) was intended; confirm.
pd.crosstab(df["Run_time"],df["Year"]).mean()

In [None]:
avg=df["Run_time"].groupby(df["Year"]).describe()

In [None]:
avg

In [None]:
avg_runtime=avg["mean"]
avg_runtime_min=avg["mean"]-avg["std"]
avg_runtime_max=avg["mean"]+avg["std"]

In [None]:
# Mean run time per year with a shaded +/- one standard deviation band.
fig, ax = plt.subplots(figsize=(10, 5))
for series, line_color in (
    (avg_runtime, 'blue'),
    (avg_runtime_min, 'green'),
    (avg_runtime_max, 'skyblue'),
):
    ax.plot(series, color=line_color)
ax.fill_between(avg.index, avg_runtime_min, avg_runtime_max, color="skyblue")
ax.set_title("AVERAGE MOVIE RUN-TIME OVER THE YEARS", fontsize=20)
ax.set_xlabel("RELEASE YEAR", fontsize=15)
ax.set_ylabel("MINUTES", fontsize=15)
# Only the left x-limit is set, so the plot starts at 1960.
ax.set_xlim(1960)
plt.show()



In [None]:
movies_by_decades=df.copy()

In [None]:
movies_by_decades["Year"]=((movies_by_decades["Year"]//10)*10).astype("int64")

In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(x="Year",y="Run_time",data=movies_by_decades,palette ="colorblind",showfliers=False)
plt.title("MOVIES RUN TIME OVER THE DECADES")
plt.xlabel("DECADES",fontsize=15)
plt.ylabel("AVERAGE RUNTIME",fontsize=15)
plt.show()

In [None]:
movies_by_decades["Year"]

In [None]:
df.groupby(by=df["Year"])["Run_time"].mean()

In [None]:
df.head(2)

In [None]:
v=df.groupby(df["Year"])["Votes"].mean().reset_index()

In [None]:
plt.figure(figsize=(10,5))
sns.lineplot(x="Year",y="Votes",data=v)
plt.title("AVERAGE MOVIE VOTES OVER THE YEARS",fontsize=20)
plt.show()

In [None]:
# Display the Genre column.
df["Genre"]

In [None]:
# Sum of the Year values within each Genre, sorted ascending.
# NOTE(review): a sum of release years has no obvious meaning - presumably a
# count of movies per genre was intended; confirm.
df.groupby(df["Genre"])["Year"].sum().sort_values()

In [None]:
df.head(2)

In [None]:
vot=df.groupby(df["Name"])["Votes"].max().sort_values()[-10:].reset_index()

In [None]:
plt.figure(figsize=(10,5))
sns.barplot(x=vot["Name"],y=vot["Votes"])
plt.xticks(rotation=90)
plt.title("TOP 10 MOVIES WITH HIGHEST VOTES",fontsize=20)
plt.show()

In [None]:
sns.scatterplot(y="Rating",x="Votes",data=df)

In [None]:
c=["Year","Run_time","Rating","Meta_score","Votes"]
#plt.figure(figsize=(15,7))
for a in c:
    sns.scatterplot(y="Gross",x=a,data=df)
   
    
    plt.show()
    

In [None]:
bo=df[["Genre","Gross"]].sort_values("Gross")[-10:]

In [None]:
gross=df.groupby(df["Genre"])["Gross"].sum().sort_values()[-5:].reset_index()

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(x="Genre",y="Gross",data=bo)

In [None]:
# One box plot per numeric column.
c = ["Year", "Run_time", "Rating", "Meta_score", "Votes"]
for column in c:
    sns.boxplot(data=df, x=column)
    plt.show()
    

In [None]:

#df.groupby(df["Genre"])["Rating"].min().sort_values(ascending=False).head(10).plot(kind="line")

#plt.xticks(rotation=90)

In [None]:
# Ten genres whose best-rated movie has the highest Rating.
# Sort first, then take the head: taking head(10) before sorting would keep
# the first ten genres in group (alphabetical) order instead of the top ten.
best_rating_by_genre = df.groupby("Genre")["Rating"].max()
top_genre_ratings = best_rating_by_genre.sort_values(ascending=False).head(10)
top_genre_ratings.plot(kind="bar")
plt.title("TOP 10 GENRES WITH HIGHEST RATING")
plt.show()

In [None]:
df.head(2)

In [None]:
dd = pd.DataFrame()
dd['cast'] = np.hstack([np.array(i.split(',')) for i in df.cast])



In [None]:
dd.head(3)

In [None]:
xx=pd.DataFrame()
xx["gen"]=np.hstack([np.array(x.split(",")) for x in df.Genre])

In [None]:
xx["gen"]=xx["gen"].str.strip()

In [None]:
xx["gen"].value_counts()

In [None]:
sns.countplot(x="gen",data=xx,order=xx["gen"].value_counts().index[0:10])
plt.xticks(rotation=45)
plt.show()

In [None]:
dd

In [None]:
df.head(2)

In [None]:
dd["cast"].value_counts()

In [None]:
sns.countplot(x="cast",data=dd,order=dd["cast"].value_counts().index[0:10])
plt.xticks(rotation=45)
plt.title("NUMBER OF CAST APPEARED IN MOVIES")
plt.show()

In [None]:
import plotly.graph_objects as go


In [None]:
nam=df.groupby(df["Name"])["Gross"].max().sort_values(ascending=False).head(10).reset_index()

In [None]:
nam["Gross"]

In [None]:
# Horizontal bars: Gross of the ten highest-grossing movie names.
plt.figure(figsize=(10,5))

x=df.groupby(df["Name"])["Gross"].max().sort_values(ascending=False).head(10)
# x/y are passed by keyword: recent seaborn releases reject positional
# x and y vectors in barplot.
sns.barplot(x=x.values, y=x.index)
plt.title("Top Ten Highest Grossed Movies Of All Time")
plt.show()

In [None]:
# Horizontal bars: the ten Genre strings whose best movie grossed the most.
plt.figure(figsize=(10,5))
x=df.groupby(df["Genre"])["Gross"].max().sort_values(ascending=False).head(10)
# x/y are passed by keyword: recent seaborn releases reject positional
# x and y vectors in barplot.
sns.barplot(x=x.values, y=x.index)
plt.title("TOP TEN GENRES WITH HIGHEST GROSS")
plt.show()

In [None]:
df[df["Name"]=="Star Wars: Episode VII - The Force Awakens"]

In [None]:
x=df.groupby(by=df["Director"])["Gross"].max().sort_values(ascending=False).head(10)
plt.title("DIRECTORS WHO HAVE DIRECTED THE TOP  GROSS MOVIES")
y=sns.barplot(x.index,x.values)
y.set_xlabel("DIRECTORS")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Highest Gross for each distinct `cast` value; show the five largest.
# NOTE(review): this groups on the whole cast string, not on individual
# cast members - confirm that is intended.
df.groupby(df["cast"])["Gross"].max().sort_values(ascending=False).head(5)
# Casts that appeared in the highest-grossing movies.

##  iii) Multivariate analysis

In [None]:
# Heat map of the pairwise correlations between the numeric columns.
plt.figure(figsize=[8,8])
# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops
# string columns silently in DataFrame.corr() and raises on them instead.
numeric_columns = df.select_dtypes(include="number")
sns.heatmap(numeric_columns.corr(), annot=True, cmap='YlGnBu')

In [None]:
df.head(2)

In [None]:
top_10_movies=df[["Name","Gross"]].sort_values("Gross")[-10:].reset_index()

In [None]:
xx=df[["Name","Gross","Director"]].sort_values("Gross")[-10:].reset_index()

In [None]:
plt.figure(figsize=(10,6))
sns.barplot(x="Name",y="Gross",hue="Director",data=xx)
plt.title("DIRECTORS OF TOP 10 HIGHEST GROSSED MOVIES ",fontsize=20)
plt.xticks(rotation=90)
plt.show()

In [None]:
df.head(2)

In [None]:
vt=df[["Name","Votes","Gross"]].sort_values("Votes")[-10:]

In [None]:
vt

In [None]:
plt.figure(figsize=(10,5))
sns.scatterplot(x="Name",hue="Votes",y="Gross",data=vt)
plt.title("GROSS OF TOP 10 HIGHEST VOTED MOVIES",fontsize=20)
plt.xticks(rotation=90)
plt.show()

In [None]:
bb=df[["Name","Rating","Gross"]].sort_values("Rating").reset_index()[-10:]

In [None]:
bb

In [None]:
plt.figure(figsize=(10,5))
sns.barplot(x="Name",y="Gross",hue="Rating",data=bb)
plt.title("GROSS OF THE TOP 10 HIGHEST RATING MOVIES",fontsize=20)
plt.xticks(rotation=90)
plt.show()

In [None]:
df.head(2)

In [None]:
cc=df[["Name","Meta_score","Gross"]].sort_values("Meta_score").reset_index()[-10:]

In [None]:
cc

In [None]:
plt.figure(figsize=(10,6))
sns.barplot(x="Name",y="Gross",hue="Meta_score",data=cc)
plt.title("GROSS OF TOP 10 HIGHEST META-SCORE MOVIES",fontsize=20)
plt.xticks(rotation=90)
plt.show()

In [None]:
df.head(2)

In [None]:
e=df[["Name","cast","Gross"]].sort_values("Gross").reset_index()[-10:]

In [None]:
e

In [None]:
plt.figure(figsize=(15,8))
sns.barplot(x="Name",y="Gross",hue="cast",data=e)
plt.title("CAST OF THE TOP 10 HIGHEST GROSSED MOVIES",fontsize=20)
plt.xticks(rotation=30)
plt.show()