In [1]:
import pandas as pd
import operator 

In [2]:
# Read data.csv into the DataFrame that the cells below query
# (they use columns such as revenue, budget and release_date).
data = pd.read_csv('data.csv')

# Data preparation

In [3]:
def get_season(month):
    """Return the season name for a month number.

    12, 1 and 2 give 'winter'; 3-5 give 'spring'; 6-8 give 'summer';
    any other value gives 'fall'.
    """
    if month in (12, 1, 2):
        return 'winter'
    if month in (3, 4, 5):
        return 'spring'
    if month in (6, 7, 8):
        return 'summer'
    return 'fall'

# Profit is revenue minus budget.
data["profit"] = data["revenue"] - data["budget"]
# The month is the first '/'-separated field of release_date.
data["release_month"] = data['release_date'].map(lambda date_text: int(date_text.split('/')[0]))
data["release_season"] = data['release_month'].map(get_season)
# A movie is flagged profitable only when its profit is strictly positive.
data['profitable'] = data['profit'] > 0

# 1. What movie has the biggest budget?

In [4]:
# Title(s) of the row(s) whose budget equals the maximum budget.
data.loc[data["budget"] == data["budget"].max(), "original_title"]

491    The Warrior's Way
Name: original_title, dtype: object

# 2. Which movie is the longest (in minutes)?

In [5]:
# Title(s) of the row(s) whose runtime equals the maximum runtime.
data.loc[data["runtime"] == data["runtime"].max(), "original_title"]

1158    Gods and Generals
Name: original_title, dtype: object

# 3. Which movie is the shortest (in minutes)?

In [6]:
# Title(s) of the row(s) whose runtime equals the minimum runtime.
data.loc[data["runtime"] == data["runtime"].min(), "original_title"]

769    Winnie the Pooh
Name: original_title, dtype: object

# 4. What is the mean runtime of the movies?

In [7]:
# Arithmetic mean of the runtime column.
data["runtime"].mean()

109.65343915343915

# 5. What is the median runtime of the movies?




In [8]:
# Median of the runtime column.
data["runtime"].median()

106.5

# 6. Which movie is the most profitable?

In [9]:
# Rows whose profit equals the overall maximum, reduced to their titles.
data.query('profit==profit.max()').loc[:, "original_title"]

239    Avatar
Name: original_title, dtype: object

# 7. Which movie is the biggest box office bomb?

In [10]:
# Rows whose profit equals the overall minimum, reduced to their titles.
data.query('profit==profit.min()').loc[:, "original_title"]

491    The Warrior's Way
Name: original_title, dtype: object

# 8. How many movies are profitable?


In [11]:
# Number of rows where the boolean 'profitable' column is True.
data.query('profitable').shape[0]

1478

# 9. Which movie was the most profitable in 2008?

In [12]:
# Restrict to 2008 releases, then show the title(s) with the highest
# profit within that subset.
mov_2008 = data.query('release_year==2008')
mov_2008.query('profit==profit.max()').loc[:, "original_title"]

600    The Dark Knight
Name: original_title, dtype: object

# 10. Which movie was the biggest box office bomb of the years 2012-2014?

In [13]:
# Restrict to releases from 2012 through 2014 (inclusive), then show the
# title(s) with the lowest profit within that subset.
mov_12_14 = data.query('2012<=release_year<=2014')
mov_12_14.query('profit==profit.min()').loc[:, 'original_title']

1246    The Lone Ranger
Name: original_title, dtype: object

# 11. Movies of which genre are the most frequent in the dataset?

In [14]:
# Count how many movies list each genre. The genres column holds several
# genres per movie in one '|'-separated string.
genres = {}
for genre_list in data["genres"]:
    for genre in genre_list.split('|'):
        genres[genre] = genres.get(genre, 0) + 1

sorted_genres = sorted(genres.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five genres cannot raise IndexError.
for entry in sorted_genres[:5]:
    print(entry)

('Drama', 782)
('Comedy', 683)
('Thriller', 597)
('Action', 583)
('Adventure', 416)


# 12. Which genre is the most frequent among the profitable movies?

In [15]:
# Same genre tally as for the full dataset, restricted to rows flagged profitable.
profit_movies = data.query('profitable')

genres = {}
for genre_list in profit_movies["genres"]:
    for genre in genre_list.split('|'):
        genres[genre] = genres.get(genre, 0) + 1

sorted_genres = sorted(genres.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five genres cannot raise IndexError.
for entry in sorted_genres[:5]:
    print(entry)

('Drama', 560)
('Comedy', 551)
('Thriller', 446)
('Action', 444)
('Adventure', 337)


# 13. Which director has filmed the most movies?

In [16]:
# Count movies per director; co-directors share one '|'-separated string
# and each of them is credited with the movie.
directors = {}
for director_list in data["director"]:
    for director in director_list.split('|'):
        directors[director] = directors.get(director, 0) + 1

sorted_directors = sorted(directors.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five directors cannot raise IndexError.
for entry in sorted_directors[:5]:
    print(entry)

('Steven Soderbergh', 13)
('Ridley Scott', 12)
('Clint Eastwood', 12)
('Robert Rodriguez', 11)
('Steven Spielberg', 10)


# 14. Which director has filmed the most profitable movies?

In [17]:
# Count, per director, the movies flagged profitable.
profit_movies = data.query('profitable')

directors = {}
for director_list in profit_movies["director"]:
    for director in director_list.split('|'):
        directors[director] = directors.get(director, 0) + 1

sorted_directors = sorted(directors.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five directors cannot raise IndexError.
for entry in sorted_directors[:5]:
    print(entry)

('Ridley Scott', 12)
('Steven Spielberg', 10)
('Clint Eastwood', 10)
('Steven Soderbergh', 10)
('Shawn Levy', 9)


# 15. Which director has brought the most profit?

In [18]:
# Add each movie's profit to every director credited on it.
directors = {}

# zip pairs the two columns row by row. Looking the profit up with
# data.loc[<enumerate position>] is only correct while the index labels
# happen to equal row positions, and it builds a whole row per lookup.
for director_list, profit in zip(data["director"], data["profit"]):
    for director in director_list.split('|'):
        directors[director] = directors.get(director, 0) + profit

sorted_directors = sorted(directors.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five directors cannot raise IndexError.
for entry in sorted_directors[:5]:
    print(entry)

('Peter Jackson', 5202593685)
('David Yates', 3379295625)
('Christopher Nolan', 3162548502)
('J.J. Abrams', 2839169916)
('Michael Bay', 2760938960)


# 16. Which actor has brought the most profit?

In [19]:
# Add each movie's profit to every actor in its '|'-separated cast.
actors = {}

# zip pairs the two columns row by row. Looking the profit up with
# data.loc[<enumerate position>] is only correct while the index labels
# happen to equal row positions, and it builds a whole row per lookup.
for cast, profit in zip(data['cast'], data['profit']):
    for actor in cast.split('|'):
        actors[actor] = actors.get(actor, 0) + profit

sorted_actors = sorted(actors.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five actors cannot raise IndexError.
for entry in sorted_actors[:5]:
    print(entry)

('Emma Watson', 6666245597)
('Daniel Radcliffe', 6514990281)
('Rupert Grint', 6408638290)
('Ian McKellen', 6087375777)
('Robert Downey Jr.', 5316030161)


# 17. Which actor has brought the least profit in 2012?

In [22]:
# Total profit per actor over 2012 releases, listed from lowest upwards.
mov_12 = data.query('release_year==2012')
actors = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for cast, profit in zip(mov_12['cast'], mov_12['profit']):
    for actor in cast.split('|'):
        actors[actor] = actors.get(actor, 0) + profit

# Ascending sort: the least profitable actors come first.
sorted_actors = sorted(actors.items(), key=operator.itemgetter(1))
# Slice rather than index 0..4 so fewer than five actors cannot raise IndexError.
for entry in sorted_actors[:5]:
    print(entry)

('Kirsten Dunst', -68109207)
('Timothy Spall', -51893525)
('James Kidnie', -51893525)
('Heidi Hawkins', -51893525)
('Malin Ã…kerman', -43733509)


# 18. Which actor was in the cast of the most movies with a budget above the mean?

In [23]:
# Count appearances per actor among movies whose budget exceeds the mean budget.
above_avg_budget = data.query('budget>budget.mean()')
actors = {}

for cast in above_avg_budget['cast']:
    for actor in cast.split('|'):
        actors[actor] = actors.get(actor, 0) + 1

sorted_actors = sorted(actors.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five actors cannot raise IndexError.
for entry in sorted_actors[:5]:
    print(entry)

('Matt Damon', 18)
('Adam Sandler', 17)
('Angelina Jolie', 16)
('Tom Cruise', 15)
('Samuel L. Jackson', 15)


# 19. Movies of which genre star Nicolas Cage most frequently?

In [24]:
# Genre tally over the movies whose cast string contains 'Nicolas Cage'.
nick_cage = data[data.cast.str.contains('Nicolas Cage')]
genres = {}

for genre_list in nick_cage["genres"]:
    for genre in genre_list.split('|'):
        genres[genre] = genres.get(genre, 0) + 1

sorted_genres = sorted(genres.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4: a single actor's filmography may cover
# fewer than five genres, which would raise IndexError.
for entry in sorted_genres[:5]:
    print(entry)

('Action', 17)
('Thriller', 15)
('Drama', 12)
('Crime', 10)
('Fantasy', 8)


# 20. Which production company has filmed the largest number of movies?

In [25]:
# Count movies per production company; co-producers share one
# '|'-separated string and each is credited with the movie.
studios = {}

for studio_list in data['production_companies']:
    for studio in studio_list.split('|'):
        studios[studio] = studios.get(studio, 0) + 1

sorted_studios = sorted(studios.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five studios cannot raise IndexError.
for entry in sorted_studios[:5]:
    print(entry)

('Universal Pictures', 173)
('Warner Bros.', 168)
('Paramount Pictures', 122)
('Columbia Pictures', 117)
('Twentieth Century Fox Film Corporation', 109)


# 21. Which production company has filmed the largest number of movies in the year 2015?

In [26]:
# Count movies per production company among 2015 releases.
mov_2015 = data.query('release_year==2015')
studios = {}

for studio_list in mov_2015['production_companies']:
    for studio in studio_list.split('|'):
        studios[studio] = studios.get(studio, 0) + 1

sorted_studios = sorted(studios.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five studios cannot raise IndexError.
for entry in sorted_studios[:5]:
    print(entry)

('Warner Bros.', 12)
('Universal Pictures', 10)
('Twentieth Century Fox Film Corporation', 8)
('Paramount Pictures', 7)
('Columbia Pictures', 7)


# 22. Which production company had the largest profit from comedies?

In [27]:
# Total profit per production company over movies whose genres string
# contains 'Comedy'.
comedies = data[data.genres.str.contains('Comedy')]
studios = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for studio_list, profit in zip(comedies['production_companies'], comedies['profit']):
    for studio in studio_list.split('|'):
        studios[studio] = studios.get(studio, 0) + profit

sorted_studios = sorted(studios.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five studios cannot raise IndexError.
for entry in sorted_studios[:5]:
    print(entry)

('Universal Pictures', 8961545581)
('Walt Disney Pictures', 7669710326)
('Twentieth Century Fox Film Corporation', 5686960294)
('Columbia Pictures', 5646343696)
('DreamWorks Animation', 4789049764)


# 23. Which production company was the most profitable in the year 2012?

In [28]:
# Total profit per production company over 2012 releases.
mov_12 = data.query('release_year==2012')
studios = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for studio_list, profit in zip(mov_12['production_companies'], mov_12['profit']):
    for studio in studio_list.split('|'):
        studios[studio] = studios.get(studio, 0) + profit

sorted_studios = sorted(studios.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five studios cannot raise IndexError.
for entry in sorted_studios[:5]:
    print(entry)

('Columbia Pictures', 2501406608)
('Universal Pictures', 1981011579)
('Marvel Studios', 1299557910)
('Warner Bros.', 1258020056)
('Relativity Media', 1032593938)


# 24. Which movie produced by "Paramount Pictures" was the biggest box office bomb?

In [29]:
# Keep rows whose production_companies string contains 'Paramount Pictures',
# then show the title(s) with the lowest profit among them.
is_paramount = data["production_companies"].str.contains('Paramount Pictures')
paramount = data[is_paramount]
paramount.query('profit==profit.min()').loc[:, 'original_title']

926    K-19: The Widowmaker
Name: original_title, dtype: object

# 25. Which year was the most profitable overall?

In [30]:
# Total profit per release year.
years = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for year, profit in zip(data['release_year'], data['profit']):
    years[year] = years.get(year, 0) + profit

sorted_years = sorted(years.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five years cannot raise IndexError.
for entry in sorted_years[:5]:
    print(entry)

(2015, 18668572378)
(2014, 16397812953)
(2012, 16077001687)
(2013, 15243179791)
(2011, 14730241341)


# 26. Which year was the most profitable for the production company "Warner Bros."?

In [31]:
# Total profit per release year over movies whose production_companies
# string contains 'Warner Bros'.
# NOTE(review): this is a substring match, so any company whose name
# contains 'Warner Bros' is included - confirm that is intended.
warner = data[data.production_companies.str.contains('Warner Bros')]
years = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for year, profit in zip(warner['release_year'], warner['profit']):
    years[year] = years.get(year, 0) + profit

sorted_years = sorted(years.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five years cannot raise IndexError.
for entry in sorted_years[:5]:
    print(entry)

(2014, 2295464519)
(2007, 2201675217)
(2008, 2134595031)
(2010, 1974712985)
(2011, 1871393682)


# 27. During which month has the largest number of movies come out?

In [32]:
# Number of movies released in each month number.
months = {}

for month in data['release_month']:
    months[month] = months.get(month, 0) + 1

sorted_months = sorted(months.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five months cannot raise IndexError.
for entry in sorted_months[:5]:
    print(entry)

(9, 227)
(12, 191)
(10, 186)
(8, 161)
(3, 156)


# 28. How many movies have come out during the summer?

In [33]:
# Number of rows whose release_season is "summer".
data.query('release_season=="summer"').shape[0]

450

# 29. Which director releases the largest number of movies during the winter?

In [34]:
# Count movies per director among winter releases.
winter = data.query('release_season=="winter"')
directors = {}

for director_list in winter["director"]:
    for director in director_list.split('|'):
        directors[director] = directors.get(director, 0) + 1

sorted_directors = sorted(directors.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five directors cannot raise IndexError.
for entry in sorted_directors[:5]:
    print(entry)

('Peter Jackson', 7)
('Clint Eastwood', 6)
('Steven Soderbergh', 6)
('Shawn Levy', 4)
('Nancy Meyers', 4)


# 30. Which month is most often the most profitable month of its year?

In [35]:
# For every release year find the month with the highest total profit,
# then count how many years each month "won".
months_total = {}

# Distinct release years in first-seen order, without a list-membership
# scan per row.
years = list(dict.fromkeys(data['release_year']))
# One sub-frame per year.
by_year = {year: data.query(f'release_year=={year}') for year in years}

for df in by_year.values():
    # Total profit per release month within this year. The two columns are
    # iterated together instead of rebuilding a row with .iloc per movie.
    months_temp = {}
    for month, profit in zip(df['release_month'], df['profit']):
        months_temp[month] = months_temp.get(month, 0) + profit

    # max() keeps the first month that reaches the highest total.
    most_profitable_month = max(months_temp.items(), key=operator.itemgetter(1))[0]
    months_total[most_profitable_month] = months_total.get(most_profitable_month, 0) + 1

sorted_months = sorted(months_total.items(), key=operator.itemgetter(1), reverse=True)
for entry in sorted_months:
    print(entry)
    

(6, 7)
(5, 4)
(12, 4)
(7, 1)


# 31. Which production company has the longest movie names (counting symbols)?

In [36]:
# Per studio: [total characters across its titles, number of titles].
studios = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for studio_list, title in zip(data['production_companies'], data['original_title']):
    title_length = len(title)
    for studio in studio_list.split('|'):
        totals = studios.setdefault(studio, [0, 0])
        totals[0] += title_length
        totals[1] += 1

# Mean title length (in characters) per studio.
studio_avg = {studio: total / count for studio, (total, count) in studios.items()}

sorted_studios = sorted(studio_avg.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five studios cannot raise IndexError.
for entry in sorted_studios[:5]:
    print(entry)

('Four By Two Productions', 83.0)
('Jim Henson Company, The', 59.0)
('Dos Corazones', 47.0)
('Museum Canada Productions', 46.0)
('Polsky Films', 46.0)


# 32. Which production company has the longest movie names (counting words)?

In [37]:
# Per studio: [total whitespace-separated words across its titles, number of titles].
studios = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for studio_list, title in zip(data['production_companies'], data['original_title']):
    word_count = len(title.split())
    for studio in studio_list.split('|'):
        totals = studios.setdefault(studio, [0, 0])
        totals[0] += word_count
        totals[1] += 1

# Mean title length (in words) per studio.
studio_avg = {studio: total / count for studio, (total, count) in studios.items()}

sorted_studios = sorted(studio_avg.items(), key=operator.itemgetter(1), reverse=True)
# Slice rather than index 0..4 so fewer than five studios cannot raise IndexError.
for entry in sorted_studios[:5]:
    print(entry)

('Four By Two Productions', 12.0)
('Jim Henson Company, The', 10.0)
('Polsky Films', 9.0)
('The Saul Zaentz Company', 9.0)
('Dos Corazones', 9.0)


# 33. How many different words have been used in movie names?

In [38]:
# Distinct lower-cased, whitespace-separated words over all titles.
# dict.fromkeys de-duplicates in first-seen order with hash lookups,
# avoiding a scan of the growing list for every word.
words = list(dict.fromkeys(
    word
    for title in data['original_title']
    for word in title.lower().split()
))

print(len(words))

2461


# 35. Which actors star together most frequently?

In [39]:
# actors[a][b] = number of movies in which b appears alongside a.
actors = {}

# One pass over the casts. Matching names by exact list membership avoids
# treating actor names as regular expressions (as Series.str.contains does
# by default) and avoids rescanning the whole frame once per actor.
for cast in data['cast']:
    members = cast.split('|')
    # dict.fromkeys drops repeated names while keeping cast order, so each
    # actor is handled once per movie.
    for actor in dict.fromkeys(members):
        coactor_counts = actors.setdefault(actor, {})
        # Everyone in this cast except the first occurrence of the actor.
        position = members.index(actor)
        for coactor in members[:position] + members[position + 1:]:
            coactor_counts[coactor] = coactor_counts.get(coactor, 0) + 1

# Scan for the highest pair count; the strict '>' keeps the first pair
# that reaches it.
max_coactorship = 0
actor_pair = ('', '')

for actor, coactor_counts in actors.items():
    for coactor, shared_movies in coactor_counts.items():
        if shared_movies > max_coactorship:
            max_coactorship = shared_movies
            actor_pair = actor, coactor

print(max_coactorship)
actor_pair

8


('Daniel Radcliffe', 'Rupert Grint')

# 36. Which director (of the ones listed below) has the biggest chance to release a profitable movie?
1. Quentin Tarantino
2. Steven Soderbergh
3. Robert Rodriguez
4. Christopher Nolan
5. Clint Eastwood

In [40]:
# Per director: [movies with profit > 0, movies with profit <= 0].
directors = {}

# Iterate the two columns together instead of rebuilding a row with
# .iloc for every movie.
for director_list, profit in zip(data['director'], data['profit']):
    for director in director_list.split('|'):
        outcome = directors.setdefault(director, [0, 0])
        # Slot 0 counts strictly positive profit, slot 1 everything else.
        outcome[0 if profit > 0 else 1] += 1

# Fraction of each director's movies that made a profit.
directors_avg = {}
for director, (wins, losses) in directors.items():
    directors_avg[director] = wins / (wins + losses)

# Report the five directors named in the question, in the listed order.
for name in ('Quentin Tarantino', 'Steven Soderbergh', 'Robert Rodriguez',
             'Christopher Nolan', 'Clint Eastwood'):
    print(f'{name}: {directors_avg[name]*100}%')

Quentin Tarantino: 85.71428571428571%
Steven Soderbergh: 76.92307692307693%
Robert Rodriguez: 72.72727272727273%
Christopher Nolan: 100.0%
Clint Eastwood: 83.33333333333334%
