In [20]:
import pandas as pd

In [21]:
df = pd.read_csv('data/input/IMDB-Movie-Data.csv')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [23]:
df.describe()

Unnamed: 0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
count,1000.0,1000.0,1000.0,1000.0,1000.0,872.0,936.0
mean,500.5,2012.783,113.172,6.7232,169808.3,82.956376,58.985043
std,288.819436,3.205962,18.810908,0.945429,188762.6,103.25354,17.194757
min,1.0,2006.0,66.0,1.9,61.0,0.0,11.0
25%,250.75,2010.0,100.0,6.2,36309.0,13.27,47.0
50%,500.5,2014.0,111.0,6.8,110799.0,47.985,59.5
75%,750.25,2016.0,123.0,7.4,239909.8,113.715,72.0
max,1000.0,2016.0,191.0,9.0,1791916.0,936.63,100.0


In [24]:
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


### Challenge 1. Using a single argument
We want to create bins of movies according to the number of votes they've received. For that matter, we will create a new column named 'bin' which will tag every movie as follow:

- From 0 to 999 ==> 'cat_1'
- From 1000 to 9999 ==> 'cat_2'
- From 10000 to 99999 ==> 'cat_3'
- From 100000 to 999999 ==> 'cat_4'
- More than 1000000 ==> 'cat_5'

In [25]:
def warm_up(x):
    if x >= 0 and x <= 999:
        return "Cat 1"
    elif x >= 1000 and x <= 9999:
        return "Cat 2"
    elif x >= 10000 and x <= 99999:
        return "Cat 3"
    elif x >= 100000 and x <= 999999:
        return "Cat 4"
    elif x >= 1000000:
        return "Cat 5"

In [26]:
df['bin'] = df.apply(lambda x: warm_up(x['Votes']), axis=1)

In [27]:
df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,Cat 4
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,Cat 4
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,Cat 4
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,Cat 3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,Cat 4


### Challenge 2. Using two arguments
We want to know how much is the revenue per minute for every movie.

In [28]:
df['Revenue per minutes'] = df.apply(lambda row: row['Revenue (Millions)'] / row['Runtime (Minutes)'], axis=1)

In [29]:
df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,Cat 4,2.75314
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,Cat 4,1.019839
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,Cat 4,1.180513
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,Cat 3,2.502963
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,Cat 4,2.642439


### Challenge 3. A bit more complicated
We want to create a new rating where we add 1 point if the genre is thriller but subtract 1 point if the genre is comedy.

In [30]:
def new_rating_genre(x,y): # x para Genre, y para Rank
    if 'Thriller' in x:
        y += 1
    elif 'Comedy' in x:
        y -= 1
    return y

In [31]:
df['New Rating'] = df.apply(lambda row: new_rating_genre(row['Genre'], row['Rank']), axis=1)

In [32]:
df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes,New Rating
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,Cat 4,2.75314,1
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,Cat 4,1.019839,2
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,Cat 4,1.180513,4
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,Cat 3,2.502963,3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,Cat 4,2.642439,5


### Challenge 4. A bit too weird...
We want to know whether the integer part of the number resulting from the sum of the ASCII value of every character of the movie title divided by the number of votes, is a prime number (remember that prime numbers are integers).

In [33]:
# Código ASCII: https://elcodigoascii.com.ar/

# Your code here (https://docs.python.org/3/library/functions.html#ord) & (https://foro.elhacker.net/scripting/pythonsumar_valor_numerico_de_cada_caracter_de_una_cadenaascii-t338102.0.html)

# Pseudocode: Ej. 13 se divide entre 1 y así mismo

# Guardians of the Galaxy(sum(ASCII characters) / number_of_votes) = prime_number

 # x for title
def sum_ascii(x):
    sum_char = 0
    for char in x:
        sum_char += ord(char)
    return sum_char
# sum_ascii('abc')

In [39]:
# Weird results

division = df.apply(lambda row: int(sum_ascii(row['Title']) / row['Votes']), axis=1)
print(division)

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Length: 1000, dtype: int64


In [63]:
def prime(number):
    if type(number) == float:
        number = int(number)
    if number < 2:
        return False
    for i in range(2, number):
        if number % i == 0:
            return False
    return True

In [66]:
df['Is prime?'] = division.apply(lambda division_result: 'Yes' if prime(division_result) else 'No')

In [68]:
df[df['Is prime?'] == 'Yes'].head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes,New Rating,Is prime?
44,45,Lowriders,Drama,A young street artist in East Los Angeles is c...,Ricardo de Montreuil,"Gabriel Chavarria, Demián Bichir, Theo Rossi,T...",2016,99,6.3,279,4.21,57.0,Cat 1,0.042525,45,Yes
68,69,Wakefield,Drama,A man's nervous breakdown causes him to leave ...,Robin Swicord,"Bryan Cranston, Jennifer Garner, Beverly D'Ang...",2016,106,7.5,291,0.01,61.0,Cat 1,9.4e-05,69,Yes
112,113,The Bad Batch,"Romance,Sci-Fi",A dystopian love story in a Texas wasteland an...,Ana Lily Amirpour,"Keanu Reeves, Jason Momoa, Jim Carrey, Diego Luna",2016,118,6.1,512,,65.0,Cat 1,,113,Yes
293,294,The Exception,Drama,A German soldier tries to determine if the Dut...,David Leveaux,"Lily James, Jai Courtney, Christopher Plummer,...",2016,107,7.7,96,,,Cat 1,,294,Yes
307,308,Vincent N Roxxy,"Crime,Drama,Thriller",A small town loner and a rebellious punk rocke...,Gary Michael Schultz,"Emile Hirsch, Zoë Kravitz, Zoey Deutch,Emory C...",2016,110,5.5,403,,,Cat 1,,309,Yes


### Challenge 5. And finally some fantasy
Feel free to propose your own ranking based in aggregations of at least 3 columns of the dataset.

In [77]:
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes,New Rating,Is prime?
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,Cat 4,2.75314,1,No
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,Cat 4,1.019839,2,No
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,Cat 4,1.180513,4,No
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,Cat 3,2.502963,3,No
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,Cat 4,2.642439,5,No


In [None]:
'''
Categorías:

1. Oppenheimer: 926 Million Dollars
2. Barbie: 1380 Million Dollars
'''

### Bonus challenge. Freaky bonus
We want to know which movies might have hidden paterns in their description. A way to know that is finding those movies which the sum of all numeric values of the string description hash (SHA256) are between their revenue and their number of votes.

In [None]:
# Your code here