In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing the libraries
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
%matplotlib inline
# Default size (width, height in inches) for every figure created below.
plt.rcParams['figure.figsize'] = (12, 8)


Reading the data

In [None]:
# Load the movies dataset into a DataFrame and show it.
movies_csv_path = "/kaggle/input/movies/movies.csv"
df = pd.read_csv(movies_csv_path)
df

In [None]:
# Preview the first five rows.
df.head(5)

## Data cleaning

In [None]:
# Count the missing values in each column.
df.isna().sum()

Clearly, there are no missing values in the data.

In [None]:
# Show the data type of each column in the data.
df.dtypes

If we check the values in the "budget" and "gross" columns, we can see that they are stored as floats but have no digits after the decimal point, so we change their data type from float to int.

In [None]:
# Convert the two monetary columns to 64-bit integers.
# NOTE(review): astype('int64') raises if a column contains NaN; this relies
# on the null check above having reported no missing values.
for money_col in ('budget', 'gross'):
    df[money_col] = df[money_col].astype('int64')

In [None]:
# Display the frame after the integer conversion of 'budget' and 'gross'.
df

The year column has incorrect values when we compare it with the released date column.

In [None]:
# Build a corrected year column from the first four characters of 'released'.
# NOTE(review): assumes every 'released' value starts with a four-digit
# year (e.g. "1986-08-22") - confirm against the dataset version in use.
df['yearcorrect'] = df['released'].astype(str).str.slice(0, 4)
df

In [None]:
# Convert 'yearcorrect' from string to a 64-bit integer, then re-check dtypes.
df['yearcorrect'] = df['yearcorrect'].astype(np.int64)
df.dtypes

Viewing the top 5 movies with the highest gross by sorting the values by the "gross" column

In [None]:
# Order the movies from highest to lowest gross and show the top five.
df = df.sort_values('gross', ascending=False)
df.head(5)

Removing duplicate rows

In [None]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

In [None]:
# First five rows after de-duplication.
df.head(5)

## EDA and finding correlations

In [None]:
# Budget vs gross scatter plot.
# The figure handle is bound to `fig` (not `figure`) so that it does not
# overwrite the `figure` function imported from matplotlib.pyplot.
fig = plt.figure(figsize=(15,5))
plt.scatter(x=df['budget'],y=df['gross'],color='red')
plt.title("Budget vs Gross",size=30)
plt.xlabel("Budget",size=20)
plt.ylabel("Gross",size=20)
plt.show()

In [None]:
# Quick look at the first five rows.
df.head(5)

In [None]:
# Scatter of budget against gross (red points) with a fitted regression line (green).
sns.regplot(
    data=df,
    x='budget',
    y='gross',
    scatter_kws={"color": "red"},
    line_kws={"color": "green"},
)

Budget and Gross are positively correlated

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently, and DataFrame.corr() raises when it meets string columns.
df.select_dtypes(include='number').corr()

There is a high correlation between budget and gross

In [None]:
# Heatmap of the correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently, and DataFrame.corr() raises when it meets string columns.
corr_mat = df.select_dtypes(include='number').corr()
sns.heatmap(corr_mat,annot=True)
plt.title("Correlations",size=30)
plt.xlabel("Features")
plt.ylabel("Features")
plt.show()

In [None]:
# First five rows of the working frame.
df.head(5)

In [None]:
# Work on a copy so that `df` keeps its original text values, and replace
# every object-dtype column with the integer codes of its categories.
df_num = df.copy()
object_cols = [name for name in df_num.columns if df_num[name].dtype == 'object']
for name in object_cols:
    df_num[name] = df_num[name].astype('category').cat.codes

df_num.head()

Now we have converted the categorical values into integer category codes

In [None]:
# The original frame still holds the text categories.
df.head(5)

In [None]:
# Heatmap of the correlations between all columns of the integer-encoded frame.
corr_mat = df_num.corr()
ax = sns.heatmap(corr_mat, annot=True)
ax.set_title("Correlations", size=30)
ax.set_xlabel("Features")
ax.set_ylabel("Features")
plt.show()

In [None]:
# Flatten the correlation matrix into a Series indexed by column pairs.
cor_matrix = df_num.corr(method='pearson')
corr_pairs = cor_matrix.unstack()
corr_pairs

In [None]:
# Lift the row limit so long Series/DataFrames are printed in full.
# This is a global setting: it applies to every output displayed after this cell.
pd.options.display.max_rows = None

In [None]:
# Order the column pairs from lowest to highest correlation.
sorted_pairs = corr_pairs.sort_values(ascending=True)
sorted_pairs

## Columns with high correlations

In [None]:
# Keep only the pairs whose correlation exceeds 0.5.
sorted_pairs.loc[sorted_pairs.gt(0.5)]

Votes and budget have the highest correlations with gross, and company has a low correlation

## Exploratory Data Analysis

In [None]:
# First five rows, as a reminder of the available columns.
df.head(5)

### Company

In [None]:
# Total budget and gross per (movie name, company), highest gross first.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple (['budget','gross']) was removed in pandas 2.0.
com = (
    df.groupby(['name', 'company'])[['budget', 'gross']]
    .sum()
    .sort_values(by='gross', ascending=False)
)
com.head(10)

In [None]:
# Move the group keys back into ordinary columns, save the table, and show it.
com = com.reset_index(drop=False)
com.to_csv(path_or_buf="Name and company with budget and gross.csv", index=False)
com

In [None]:
# The first ten rows of `com`.
top10com = com.iloc[:10]
top10com

In [None]:
# Grouped bar chart of budget and gross for the rows in `top10com`.
ax = top10com.plot.bar(x='name', y=['budget', 'gross'], figsize=(12, 5))
ax.set_title("Movies with Budget and gross", size=30)
ax.set_xlabel("Movies", size=15)
ax.set_ylabel("Amount", size=15)
plt.show()

In [None]:
# Total budget and gross per company, highest gross first.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple (['budget','gross']) was removed in pandas 2.0.
com = (
    df.groupby(['company'])[['budget', 'gross']]
    .sum()
    .sort_values(by='gross', ascending=False)
)
com.head(10)

In [None]:
# Turn the 'company' index back into a regular column.
top10c = com.reset_index(drop=False)
top10c.head(5)

In [None]:
# Save the per-company totals without the row index.
top10c.to_csv(path_or_buf='Companies with highest budget and gross.csv', index=False)

In [None]:
# Keep the ten highest-grossing companies. A bare head() keeps only five
# rows, which contradicts the variable name and the top-10 movie chart.
top10c = top10c.head(10)
top10c

In [None]:
# Grouped bar chart of budget and gross for the companies in `top10c`.
ax = top10c.plot.bar(x='company', y=['budget', 'gross'], figsize=(12, 5))
ax.set_title("Companies with Budget and gross", size=30)
ax.set_xlabel("Companies", size=15)
ax.set_ylabel("Amount", size=15)
plt.show()

In [None]:
# First five rows of the movie data.
df.head(5)

### Countries with respect to budget and gross

In [None]:
# Number of movies per country, as a DataFrame with the country moved
# out of the index into a regular column.
country_counts = df['country'].value_counts().to_frame().reset_index()

In [None]:
# Label the two columns by position. The names produced by
# value_counts().reset_index() differ between pandas versions
# ('index'/'country' before 2.0, 'country'/'count' from 2.0 on), so a
# name-based rename would create two 'count' columns on pandas >= 2.0.
# Positional assignment is also safe to re-run.
country_counts.columns = ['country', 'count']

In [None]:
# Save the per-country counts without the row index.
country_counts.to_csv(path_or_buf="Count of country.csv", index=False)

In [None]:
# Ten countries with the most movies. `top10country` was previously used
# here without ever being defined, so the cell failed on a fresh kernel.
# The column labels are pinned to 'index' (country) and 'country' (count),
# which is what older pandas produced by default, because pandas >= 2.0
# names the result of value_counts() differently.
top10country = (
    df['country']
    .value_counts()
    .head(10)
    .rename_axis('index')
    .reset_index(name='country')
)


In [None]:
# Display the current contents of `top10country`.
top10country

In [None]:
# Label the two columns by position: first the country, then its number of
# films. A name-based rename depends on what value_counts().reset_index()
# called the columns, which changed in pandas 2.0 and would leave the frame
# without a 'country' column there. Positional assignment is also safe to
# re-run.
top10country.columns = ['country', 'No. of flims']
top10country

In [None]:
# Pie chart of the number of films per country in `top10country`.
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(top10country['No. of flims'], labels=top10country['country'])
ax.set_title("Country with highest releases", size=30)
plt.show()

In [None]:
# Total budget and gross per country, highest gross first.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple (['budget','gross']) was removed in pandas 2.0.
bgcountry = (
    df.groupby(['country'])[['budget', 'gross']]
    .sum()
    .sort_values(by=['gross'], ascending=False)
)
bgcountry

In [None]:
# Turn the 'country' index back into a regular column.
bgcountry = bgcountry.reset_index()

In [None]:
# Save the per-country totals without the row index.
bgcountry.to_csv(path_or_buf="country and their budget and gross.csv", index=False)

In [None]:
# Ten countries with the highest total gross.
# reset_index(drop=True) returns a new frame with a clean 0..9 index. The
# earlier in-place reset_index() on the head() slice inserted the old row
# numbers as an extra 'index' column, which then leaked into the CSV export.
top10bgcountry = bgcountry.head(10).reset_index(drop=True)

In [None]:
# Save the top-country table without the row index, then show it.
top10bgcountry.to_csv(path_or_buf="Country wise gross and budget.csv", index=False)
top10bgcountry

In [None]:
# Line chart of total budget and total gross per country.
# Each line is drawn with its own label so that plt.legend() has entries to
# show; without labels the legend is empty and matplotlib emits a warning.
plt.plot(top10bgcountry['country'], top10bgcountry['budget'], label='budget')
plt.plot(top10bgcountry['country'], top10bgcountry['gross'], label='gross')
plt.title("Country with Budget and gross",size=30)
plt.xlabel("Country",size=15)
plt.ylabel("Amount",size=15)
plt.legend()
plt.show()

In [None]:
# First five rows of the movie data.
df.head(5)

### Genres vs gross

In [None]:
# Total gross per genre, as a DataFrame with 'genre' as a regular column.
genre_gross = df.groupby(['genre'])['gross'].sum().to_frame().reset_index()

In [None]:
# Save the per-genre totals without the row index, then show them.
genre_gross.to_csv(path_or_buf="Genre vs gross.csv", index=False)
genre_gross

In [None]:
# Bar chart of total gross per genre.
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(genre_gross['genre'], genre_gross['gross'])
ax.set_title("Genre vs Gross", size=30)
ax.set_xlabel("Genres", size=15)
ax.set_ylabel("Amount", size=15)
plt.show()

In [None]:
# Pairwise scatter plots with histograms on the diagonal.
sns.pairplot(data=df, diag_kind='hist')

In [None]:
# Plot the distribution of every non-object column, one figure per column.
# sns.distplot is deprecated and scheduled for removal; histplot with a
# density-normalised histogram and a KDE overlay is the replacement seaborn
# documents for it.
for col in df.columns:
    if df[col].dtype != "object":
        sns.histplot(df[col], bins=20, kde=True, stat="density", kde_kws={"cut": 3})
        plt.show()

In [None]:
# First five rows of the movie data.
df.head(5)

### Counts dataframe

In [None]:
# Row(s) holding the maximum number of votes.
df.loc[df['votes'].eq(df['votes'].max())]

In [None]:
# Name of the movie(s) holding the maximum number of votes.
df.loc[df['votes'].eq(df['votes'].max()), 'name']


In [None]:
# Scratch checks: only the last expression (the total budget) is displayed;
# the director count on the middle line is computed and discarded.
lst = list()
len(df['director'].unique())
df['budget'].sum()

In [None]:
# Headline figures for the dataset, collected in a dict (one value per key).
# The top-voted title is read from the data instead of being hard-coded, so
# the summary stays correct if the dataset changes. If several movies tie
# for the maximum, the first one in the frame's current order is used.
top_voted_name = df.loc[df['votes'] == df['votes'].max(), 'name'].iloc[0]
lst=({'No. of directors':len(df['director'].unique()),
      'No. of movies':len(df['name'].unique()),
      'Total budget':df['budget'].sum(),
      'Total gross':df['gross'].sum(),
      'Movie with higest votes':top_voted_name})
    


In [None]:
# Display the summary dict built in the previous cell.
lst

In [None]:
# One-row DataFrame built from the summary dict; the single row is labelled '0'.
header = pd.DataFrame(lst, index=['0'])

In [None]:
# Show the summary with the row label moved into a column (`header` itself is unchanged).
header.reset_index(drop=False)

In [None]:
# Save the summary row without the row index.
header.to_csv(path_or_buf="Header.csv", index=False)