In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the IMDB movie dataset; the relative path is resolved against the notebook's working directory.
data = pd.read_csv('IMDB-Movie-Data.csv')

### 1. Display top 10 rows of the dataset

In [None]:
data.head(10) # first 10 rows; head() without an argument would show only the first 5

### 2. Display last 10 rows of the dataset

In [None]:
data.tail(10) # last 10 rows; the index of the final rows also hints at the dataset size (1000 entries here, per the original note)

### 3. Find shape of our dataset (number of rows and number of columns)

In [None]:
data.shape #shape is from pandas dataframe

In [None]:
print('Number of Rows available in our dataset:',data.shape[0])
print('Number of Columns available in our dataset:',data.shape[1])

### 4. Getting information about our dataset (like total number rows, total number of columns, datatypes of each column and memory requirement)

In [None]:
data.info()  # prints column names, non-null counts, dtypes and memory usage

### 5. Check missing (null) values in the dataset

In [None]:
print('Any missing value?',data.isnull().values.any()) #answer is true which means that our dataset contains some missing values

In [None]:
data.isnull() #we have to check which columns have missing values

In [None]:
data.isnull().sum() #we want to find sum of true values

In [None]:
sns.heatmap(data.isnull()) #we can also visualize the missing values using seaborn's heatmap, the missing values are with the lighter color

In [None]:
per_missing = data.isnull().sum()*100/len(data)
print(per_missing) #we see that around 13% values are missing in the 'Revenue' column and 6.4% in the 'Metascore' column

### 6. Drop all the missing values

In [None]:
# dropna() returns a NEW DataFrame and leaves `data` untouched, so the result
# must be stored to have any lasting effect. It is kept under its own name so
# the original frame (including rows with missing values) stays available to
# the rest of the notebook.
data_no_na = data.dropna(axis=0)  # axis=0 drops every row that has at least one missing value
data_no_na

### 7. Check for duplicate data

In [None]:
dup_data=data.duplicated().any()  # True if at least one row is flagged as a duplicate

In [None]:
print('There are duplicated values:', dup_data) # False means the dataset has no duplicated rows

In [None]:
# drop_duplicates() returns a new frame, so the result is assigned back to `data`.
data = data.drop_duplicates()
print(data)

### 8. Get overall statistics about the dataframe

In [None]:
data.describe() # summary statistics; by default only the numerical columns are covered (the next cell includes every column)

In [None]:
data.describe(include='all') # include='all' adds the non-numerical columns to the summary

### 9. Display title of the movie having runtime >= 180 minutes

In [None]:
print(data.columns) #or just data.columns

In [None]:
data[data['Runtime (Minutes)'] >= 180] #we don't need to write print

In [None]:
data[data['Runtime (Minutes)'] >= 180] ['Title'] #because we ONLYneed the title of the movie, not all the information! we can also write print(data[data['Runtime (Minutes)'] >= 180] ['Title'])

### 10. In which year there was the highest average voting?

In [None]:
data.columns #no need to write print because we are using jupyter!

In [None]:
data.groupby('Year') ['Votes'].mean()

In [None]:
data.groupby('Year') ['Votes'].mean().sort_values(ascending=False) #because by default it sorts our data in ascending order. (from the output we see that highest voting was in year 2012)

In [None]:
sns.barplot(x='Year', y='Votes', data=data) #we can visualise this using seaborn's barplot! 
plt.title('Votes by Year') #we're decorating the graph with a title
plt.show #we can use barplots to see the relationships between technical data and at least one numerical variable

### 11. In which year there was the highest average revenue? 

In [None]:
data.columns

In [None]:
data.groupby('Year') ['Revenue (Millions)'].mean().sort_values(ascending=False) #similar to step 10

In [None]:
sns.barplot(x='Year', y='Revenue (Millions)', data=data) #we can visualise this using seaborn's barplot! 
plt.title('Revenue (millions) by Year') #we're decorating the graph with a title
plt.show #from the barplot we can see that the highest revenue was in year 2009

### 12. Find the average rating for each director

In [None]:
data.columns #to know which column we need to work with! as we can see, we need to work with 'Director' and 'Rating' columnns!

In [None]:
data.groupby('Director')['Rating'].mean() #we need to find the average so we use mean() from pandas dataframe

In [None]:
data.groupby('Director')['Rating'].mean().sort_values(ascending=False) #if we want to sort in ascending or descending order!

### 13.Display top 10 lengthy movies title and runtime

In [None]:
 data.columns

In [None]:
data.nlargest(10,'Runtime (Minutes)')

In [None]:
data.nlargest(10,'Runtime (Minutes)')[['Title','Runtime (Minutes)']]

In [None]:
top10_len=data.nlargest(10,'Runtime (Minutes)')[['Title','Runtime (Minutes)']]\
.set_index('Title')

In [None]:
top10_len

In [None]:
sns.barplot(x='Runtime (Minutes)', y=top10_len.index, data=top10_len)#we can visualise it by using barplot

### 14. Display number of movies per year

In [None]:
data.columns

In [None]:
data['Year'].value_counts()

In [None]:
sns.countplot(x='Year', data=data) #lets visualize this by using seaborn's countplot

In [None]:
sns.countplot(x='Year', data=data) #lets decorate this graph
plt.title('Number of movies per year')
plt.show

### 15.Find most popular movie title (highest revenue)

In [None]:
data.columns

In [None]:
data['Revenue (Millions)'].max()

In [None]:
data['Revenue (Millions)'].max()==data['Revenue (Millions)'] #we want to compare the maximum value we found with other maximum values from Revenue column

In [None]:
data[data['Revenue (Millions)'].max()==data['Revenue (Millions)']]

In [None]:
data[data['Revenue (Millions)'].max()==data['Revenue (Millions)']] ['Title'] #we only want the movie title

### 16. Display top 10 highest rated movie title and its directors DOESNT WORK!!!

In [None]:
data.columns #first we display the column names (we see that we need to work with the 'Rating' column)

In [None]:
top10_len=data.nlargest(10,'Rating')[['Title','Rating','Director']]\
.set_index('Title') # we add this \ to change line

In [None]:
top10_len

In [None]:
sns.barplot(x='Rating', y=top10_len.index, data=top10_len) #lets visualize it

In [None]:
sns.barplot(x='Rating', y=top10_len.index, data=top10_len, hue='Director') #we are using hue parameter because we need the director as well

In [None]:
print(top10_len.columns)

### 17. Display top 10 highest revenue movie titles

In [None]:
data.columns

In [None]:
data.nlargest(10,'Revenue (Millions)')

In [None]:
data.nlargest(10,'Revenue (Millions)') ['Title']

In [None]:
data.nlargest(10,'Revenue (Millions)') [['Title', 'Revenue (Millions)']]

In [None]:
top_10=data.nlargest(10,'Revenue (Millions)') [['Title', 'Revenue (Millions)']].\
set_index('Title')

In [None]:
top_10

In [None]:
sns.barplot (x='Revenue (Millions)', y=top_10.index, data=top_10)

In [None]:
sns.barplot (x='Revenue (Millions)', y=top_10.index, data=top_10)
plt.title('Top 10 highest revenue movie titles') #lets add title to the graph

### 18. Find average rating of movies year wise

In [None]:
data.columns

In [None]:
# Average rating per release year, sorted so the best-rated year comes first.
data.groupby('Year')['Rating'].mean().sort_values(ascending=False)

### 19. Does rating affect the revenue?

In [None]:
# One point per movie: rating on the x-axis against revenue on the y-axis.
sns.scatterplot (x='Rating', y='Revenue (Millions)', data=data) 

### 20. Classify movies based on ratings (Excellent, good and average)

In [None]:
data.columns

In [None]:
def rating(rating):
    """Map a numeric movie rating to a category label.

    Parameters
    ----------
    rating : float
        Numeric rating value; it is compared against 7.0 and 6.0.

    Returns
    -------
    str
        "Excellent" when the rating is at least 7.0, "Good" when it is at
        least 6.0 (but below 7.0), and "Average" in every other case.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, label in ((7.0, "Excellent"), (6.0, "Good")):
        if rating >= threshold:
            return label
    return "Average"

In [None]:
data['rating_cat']=data['Rating'].apply(rating)  # adds a 'rating_cat' column by calling rating() on every 'Rating' value

In [None]:
data.head()  # the new 'rating_cat' column is now part of the frame

### 21. Count number of action movies

In [None]:
data.columns

In [None]:
data['Genre'].dtype

In [None]:
data['Genre'].str.contains('Action', case=False)

In [None]:
data[data['Genre'].str.contains('Action', case=False)]

In [None]:
len(data[data['Genre'].str.contains('Action', case=False)])

### 22. Find unique values from genre

In [None]:
data.columns

In [None]:
data['Genre']

In [None]:
list1=[]
for value in data['Genre']:
    list1.append(value.split(','))

In [None]:
list1

In [None]:
one_d=[]
for item in list1:
    for item1 in item:
        one_d.append(item1)

In [None]:
one_d

In [None]:
uni_list=[] #we want to find the unique values in the 'Genre' column
for item in one_d:
    if item not in uni_list:
        uni_list.append(item)
    

In [None]:
uni_list

In [None]:
len(uni_list) #we have 20 unique values in Genre column

### 23. How many films of each genre were made?

In [None]:
one_d=[]
for item in list1:
    for item1 in item:
        one_d.append(item1)

In [None]:
one_d

In [None]:
from collections import Counter

In [None]:
Counter(one_d)