In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# The bare 'seaborn-*' style names were deprecated in matplotlib 3.6 and later
# removed in favour of 'seaborn-v0_8-*'. Use whichever name this installation
# provides so the notebook runs on both old and new matplotlib versions.
_DARKGRID_STYLES = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plt.style.use(next((s for s in _DARKGRID_STYLES if s in plt.style.available), 'default'))

# 1. Loading Data & Editing Some Information

In [None]:
# Load the dataset CSV (the path is relative to the notebook's working directory).
df = pd.read_csv("../input/imdb-top-250-lists-1996-2020/imdbTop250.csv")

# Preview the first rows.
df.head()

In [None]:
# Number of distinct IMDB links in the data.
df["IMDBlink"].nunique()

In [None]:
# Number of distinct titles in the data.
df["Title"].nunique()

In [None]:
# Keep one row per IMDB link, then count how often each title occurs;
# a count above 1 means several links share the same title.
one_row_per_link = df.drop_duplicates(subset="IMDBlink")
one_row_per_link["Title"].value_counts()

In [None]:
# Show every row whose title is "Drishyam".
df.loc[df["Title"] == "Drishyam"]

We have two movies with the same name, Drishyam.

In [None]:
# Turn each comma-separated genre string into a list of genre names.
df['Genre'] = [genres.split(',') for genres in df['Genre']]
df

Separating the Genre column

In [None]:
# Expand the list-valued Genre column into three positional columns.
# The helper frame reuses df's own index, so every row gets exactly its own
# genres back. Keying the helper on 'Title' and merging on it would pair each
# row of a title with every other row of that title, and would mix up distinct
# movies that share a title (such as the two "Drishyam" entries); the
# de-duplication needed to undo that could also drop legitimate rows.
gen = pd.DataFrame(df['Genre'].tolist(), columns=['Genre1', 'Genre2', 'Genre3'], index=df.index)
fin = df.drop(columns='Genre').join(gen)

# Order rows by list year, then by position within that year's list.
df = fin.sort_values(['IMDByear', 'Ranking'])
df.head()

In [None]:
# Columns whose string values carry stray leading/trailing whitespace.
remove_whitespaces = ["Genre1", "Genre2", "Genre3", "Cast1", "Cast2", "Cast3", "Cast4"]

# Strip all of them in one assignment.
df[remove_whitespaces] = df[remove_whitespaces].apply(lambda column: column.str.strip())

Getting unique genres and stars,

Editing long strings for better visualization

In [None]:
# Unique values of each genre column. g3 must read 'Genre3': reading 'Genre2'
# twice would silently drop any genre that only ever appears in third position.
g1 = df['Genre1'].unique().tolist()
g2 = df['Genre2'].unique().tolist()
g3 = df['Genre3'].unique().tolist()
# pd.notna filters out both NaN and None placeholders for missing genres.
genre = [x for x in set(g1 + g2 + g3) if pd.notna(x)]

# Unique performer names across the four cast columns.
c1 = df['Cast1'].unique().tolist()
c2 = df['Cast2'].unique().tolist()
c3 = df['Cast3'].unique().tolist()
c4 = df['Cast4'].unique().tolist()
cast = [x for x in set(c1 + c2 + c3 + c4) if pd.notna(x)]

# All patterns below are plain text, so regex=False is passed explicitly; the
# default for `regex` differs between pandas versions.
# Use a single ordering of the two Coen names so both spellings are treated as
# the same Director value.
df['Director'] = df['Director'].str.replace('Ethan Coen,  Joel Coen', 'Joel Coen,  Ethan Coen', regex=False)
# Shorten long titles so they fit better in plots.
df['Title'] = df['Title'].str.replace('The Lord of the Rings', 'LOTR', regex=False)
df["Title"] = df["Title"].str.replace(": Episode", "", regex=False)
df["Title"] = df["Title"].str.replace(" or: How I Learned to Stop Worrying and Love the Bomb", "", regex=False)

df.head()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

# 2. Some Queries

**How many different movies entered the IMDB Top250 lists from 1996 to 2020?**

In [None]:
# Count distinct movies by their IMDB link rather than by title: two different
# movies share the title "Drishyam", so counting titles would merge them.
df.IMDBlink.nunique()

**Which movies were in every IMDB Top250 list from 1996 to 2020?**

In [None]:
# Titles in order of first appearance.
movies = df.Title.unique().tolist()

# A title with 25 rows was present in each of the 25 yearly lists (1996-2020).
appearances = df.Title.value_counts()
all_time = [title for title in movies if appearances.get(title, 0) == 25]

print(len(all_time))
all_time[:10]

In [None]:
# Write one title per line. The context manager closes the file even if a
# write fails, and the explicit encoding keeps non-ASCII titles portable
# across platforms with different default encodings.
with open('all_time_in_list_96-20.txt', 'w', encoding='utf-8') as file:
    for element in all_time:
        file.write(element)
        file.write('\n')

**Movies that are in every IMDB Top250 list of the past decade (2011-2020):**

In [None]:
# Row count per title within the lists after 2010; a count of 10 means the
# title was present in each of those ten lists.
recent_appearances = df.loc[df.IMDByear > 2010, "Title"].value_counts()
last10 = [title for title in movies if recent_appearances.get(title, 0) == 10]

print(len(last10))
last10[-10:]

In [None]:
# Write the decade list (last10) to its file, one title per line; writing
# all_time here would only duplicate the contents of the all-time file.
# The context manager closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII titles portable across platforms.
with open('always_in_list_decade.txt', 'w', encoding='utf-8') as file:
    for element in last10:
        file.write(element)
        file.write('\n')

**Movies that are in every IMDB Top250 list of the last 5 years (2016-2020):**

In [None]:
# Row count per title within the lists after 2015; a count of 5 means the
# title was present in each of those five lists.
latest_appearances = df.loc[df.IMDByear > 2015, "Title"].value_counts()
last5 = [title for title in movies if latest_appearances.get(title, 0) == 5]

print(len(last5))
last5[-10:]

In [None]:
# Write the last-five-years list (last5) to its file, one title per line;
# writing all_time here would only duplicate the contents of the all-time file.
# The context manager closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII titles portable across platforms.
with open('always_in_list_last5_year.txt', 'w', encoding='utf-8') as file:
    for element in last5:
        file.write(element)
        file.write('\n')

**Movies that came out and entered the list in the same year:**

In [None]:
# Rows where the list year equals the value in the 'Date' column, best rank first.
entered_same_year = df['IMDByear'] == df['Date']
vis = df.loc[entered_same_year].sort_values('Ranking')
print(len(vis))

vis.Title.head(10).tolist()

### Directors that have at least 3 movies in every IMDB Top 250 list from 1996 to 2020

In [None]:
directors = df.Director.unique().tolist()

# Titles per (director, list year), then for each director the number of
# years in which that count reaches 3.
titles_per_year = df.groupby(["Director", "IMDByear"]).Title.count()
strong_years = (titles_per_year >= 3).groupby(level="Director").sum()

# Keep directors for whom all 25 list years qualify, in first-appearance order.
direc = [name for name in directors if strong_years.get(name, 0) == 25]

len(direc)

In [None]:
# Show the collected director names.
direc

### Stars that have at least 3 movies in every IMDB Top 250 list from 1996 to 2020

In [None]:
cast_columns = ["Cast1", "Cast2", "Cast3", "Cast4"]
stars = []

for star in cast:
    # Rows where the performer appears in any of the four cast columns.
    is_billed = df[cast_columns].eq(star).any(axis=1)
    titles_per_year = df[is_billed].groupby("IMDByear").Title.count()
    # Keep performers with at least 3 titles in each of the 25 list years.
    if (titles_per_year >= 3).sum() == 25:
        stars.append(star)

len(stars)

In [None]:
# Show the collected performer names.
stars

# 3. Simple Graphs

We can increase the number of examples. These are just basic examples.

In [None]:
fig, ax = plt.subplots(figsize=(14, 8))

by_list_year = df.groupby('IMDByear')
# Divide the mean score by 10 to bring it down to the 1-10 range.
metascore, = ax.plot(by_list_year['Score'].mean() / 10, color='red')
imdbrating, = ax.plot(by_list_year['Rating'].mean(), color='blue')

ax.legend([metascore, imdbrating], ["Metascore", "IMDB Rating"])
ax.set_title("Metascore vs. IMDB Ratings")
ax.set_xlabel('IMDB Top250 Year')
ax.set_ylabel('Rating')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Distinct titles per value of the 'Date' column.
titles_per_release_year = df.groupby('Date')['Title'].nunique()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(titles_per_release_year)
plt.xticks(rotation=90)
ax.set_xlabel('Years')
ax.set_ylabel('Number of Movies')
ax.set_title("Number of Movies by Release Year  \n (for the movies that are in the list from 1996 to 2020)")
plt.show()

In [None]:
# Distinct directors represented in each year's list.
directors_per_list = df.groupby('IMDByear')['Director'].nunique()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(directors_per_list)
plt.xticks(rotation=90)
ax.set_xlabel('Year of IMDB Top250 List')
ax.set_ylabel('# Directors')
ax.set_title("How many Different Director's movie in the IMDB Top250 List Over Years?")
plt.show()

In [None]:
# Mean of the 'Gross' column for each year's list.
mean_gross_per_list = df.groupby('IMDByear')['Gross'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(mean_gross_per_list)
plt.xticks(rotation=90)
ax.set_xlabel('Year of IMDB Top250 List')
ax.set_ylabel('Average Gross (Million $)')
ax.set_title('Average Gross for IMDB Top250 Lists')
plt.show()

# 4. Slightly More Detailed Visualizations

### For Directors

**Directors that have most movies in the IMDB Top250 Lists**

In [None]:
# For every director, count their distinct titles and build a display string
# listing those titles.
num_movies, title_movies = [], []
for director in directors:

    titles = df.loc[df.Director == director, "Title"]
    num_movies.append(titles.nunique())
    # Quote each title and separate titles with a space. Building the string
    # directly avoids stringifying the array and stripping '[' / ']' afterwards,
    # which would also strip brackets that are part of a title.
    title_movies.append(" ".join(repr(title) for title in titles.unique()))

director_movies = pd.DataFrame({"Director": directors, "Number of Movies": num_movies, "Movies": title_movies})

# Directors with the most distinct titles first.
director_movies = director_movies.sort_values("Number of Movies", ascending = False)
director_movies.head(15)

In [None]:
# Reset matplotlib settings, then use one grey tone for figure and axes.
plt.rcdefaults()

background = "#C6C6C6"
plt.rcParams['figure.facecolor'] = background
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_facecolor(background)

top_directors = director_movies.head(30)
sns.barplot(x='Number of Movies', y='Director', data=top_directors, palette="GnBu_r", ax=ax)

ax.set_title("Directors that have most movies in the IMDB Top250 Lists \n (for the lists 1996-2020)")
ax.set_xlabel("Number of Movies in the Top250")

# Write each director's titles inside the matching bar; zip stops at the
# last bar, so only the plotted rows are labelled.
for bar, movie_titles in zip(ax.patches, director_movies.Movies):
    left, bottom = bar.get_xy()
    ax.text(
        left + 0.005 * bar.get_width(),
        bottom + 0.60 * bar.get_height(),
        movie_titles,
        ha='left', va='baseline', rotation='horizontal', fontsize=7,
    )

sns.despine(bottom=True, left=True)
plt.show()

### For Stars

**Stars that have most movies in the IMDB Top250 Lists:**

In [None]:
# For every performer, count their distinct titles and build a display string
# listing those titles.
num_movies, title_movies = [], []
for star in cast:

    # Rows where the performer appears in any of the four cast columns.
    appears = (df.Cast1 == star)|(df.Cast2 == star)|(df.Cast3 == star)|(df.Cast4 == star)
    titles = df.loc[appears, "Title"]
    num_movies.append(titles.nunique())
    # Quote each title and separate titles with a space. Building the string
    # directly avoids stringifying the array and stripping '[' / ']' afterwards,
    # which would also strip brackets that are part of a title.
    title_movies.append(" ".join(repr(title) for title in titles.unique()))

performer_movies = pd.DataFrame({"Star": cast, "Number of Movies": num_movies, "Movies": title_movies})

# Performers with the most distinct titles first.
performer_movies = performer_movies.sort_values("Number of Movies", ascending = False)
performer_movies.head(15)

In [None]:
# Reset matplotlib settings, then use one dark tone for figure and axes.
plt.rcdefaults()

background = "#353535"
plt.rcParams['figure.facecolor'] = background
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_facecolor(background)

top_performers = performer_movies.head(30)
sns.barplot(x='Number of Movies', y='Star', data=top_performers, palette="YlOrRd_r", ax=ax)

ax.set_title("Stars that have most movies in the IMDB Top250 Lists \n (for the lists 1996 - 2020)")
ax.set_xlabel("Number of Movies in the Top250 List")

# Write each performer's titles inside the matching bar; zip stops at the
# last bar, so only the plotted rows are labelled.
for bar, movie_titles in zip(ax.patches, performer_movies.Movies):
    left, bottom = bar.get_xy()
    ax.text(
        left + 0.005 * bar.get_width(),
        bottom + 0.60 * bar.get_height(),
        movie_titles,
        ha='left', va='baseline', rotation='horizontal', fontsize=7,
    )

sns.despine(bottom=True, left=True)

In [None]:
# Recap: the number of titles in all_time and its first ten entries.
print(len(all_time))
all_time[:10]

###  For Movies
**Rank changes for the movies that are always in the IMDB Top250 List**

In [None]:
ranks = []
differences = []

for title in all_time:
    # Rankings of this title in row order; the first row minus the last row
    # is positive when the rank number went down between them.
    rankings = df.loc[df.Title == title, "Ranking"]
    differences.append(rankings.iloc[0] - rankings.iloc[-1])

# One row per always-listed movie, alphabetically by title.
changes = pd.DataFrame({"Movie": all_time, "Change": differences}).sort_values("Movie")
changes

In [None]:
# One small line chart per always-listed movie, laid out on a grid.
total_figure = len(all_time)
total_cols = 8

# NOTE(review): this adds the full remainder rather than a single extra row, so
# the grid usually has more rows than the plots need. The fig.text coordinates
# below appear to sit in the resulting empty area, so the row count is left
# untouched - confirm the layout before switching to ceiling division.
total_rows = total_figure // total_cols
total_rows += total_figure % total_cols

# 1-based subplot positions, one per movie.
position = range(1, total_figure + 1)

years = df.IMDByear.unique().tolist()

plt.style.use("dark_background")

fig = plt.figure(1, figsize=(30, 25))

fig.suptitle('Rank Changings for the Movies that are always in the IMBD Top250 \n', fontsize = 24)
fig.text(0.76, 0.33,
         "*These are the movies that are always in IMDB Top 250 Lists from 1996 to 2020." +
         "\n\n*For a movie,\n     If its rank in 2020 is better than the rank in 1996, color of its graph is blue i.e. 12 Angry Men" +
         "\n     Else, its color is orange i.e. 2001: A Space Odyssey" +
         "\n\n*Movies were listed in alphabetical order.",
         style = 'italic',
         color = "#8A8C8B")

fig.text(0.91, 0.33, "Graph: kaggle.com/mustafacicek", color = "#444544")

# Iterate title and change together. This avoids filtering `changes` for every
# movie and calling int() on a one-element Series, which newer pandas deprecates.
for k, (movie, change) in enumerate(zip(changes.Movie, changes.Change)):

    temp = df[df.Title == movie]

    ax = fig.add_subplot(total_rows, total_cols, position[k])

    # Orange when the change is negative, blue otherwise.
    line_color = '#F1920E' if change < 0 else '#0E6DF1'

    # Plot against the movie's own list years so x and y always have equal length.
    ax.plot(temp.IMDByear, temp.Ranking, color = line_color)

    ax.set_title(movie)

plt.tight_layout()
plt.show()