In [158]:
import pandas as pd

In [101]:
df = pd.read_csv('data/input/IMDB-Movie-Data.csv')

In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [103]:
df.describe()

Unnamed: 0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
count,1000.0,1000.0,1000.0,1000.0,1000.0,872.0,936.0
mean,500.5,2012.783,113.172,6.7232,169808.3,82.956376,58.985043
std,288.819436,3.205962,18.810908,0.945429,188762.6,103.25354,17.194757
min,1.0,2006.0,66.0,1.9,61.0,0.0,11.0
25%,250.75,2010.0,100.0,6.2,36309.0,13.27,47.0
50%,500.5,2014.0,111.0,6.8,110799.0,47.985,59.5
75%,750.25,2016.0,123.0,7.4,239909.8,113.715,72.0
max,1000.0,2016.0,191.0,9.0,1791916.0,936.63,100.0


In [104]:
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


### Challenge 1. Using a single argument
We want to create bins of movies according to the number of votes they've received. For that matter, we will create a new column named 'bin' which will tag every movie as follow:

- From 0 to 999 ==> 'cat_1'
- From 1000 to 9999 ==> 'cat_2'
- From 10000 to 99999 ==> 'cat_3'
- From 100000 to 999999 ==> 'cat_4'
- More than 1000000 ==> 'cat_5'

In [105]:
def warm_up(x):
    if x >= 0 and x <= 999:
        return "Cat 1"
    elif x >= 1000 and x <= 9999:
        return "Cat 2"
    elif x >= 10000 and x <= 99999:
        return "Cat 3"
    elif x >= 100000 and x <= 999999:
        return "Cat 4"
    elif x >= 1000000:
        return "Cat 5"

In [106]:
df['bin'] = df.apply(lambda x: warm_up(x['Votes']), axis=1)

In [107]:
df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,Cat 4
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,Cat 4
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,Cat 4
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,Cat 3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,Cat 4


### Challenge 2. Using two arguments
We want to know how much is the revenue per minute for every movie.

In [108]:
df['Revenue per minutes'] = df.apply(lambda row: row['Revenue (Millions)'] / row['Runtime (Minutes)'], axis=1)

In [109]:
df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,Cat 4,2.75314
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,Cat 4,1.019839
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,Cat 4,1.180513
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,Cat 3,2.502963
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,Cat 4,2.642439


### Challenge 3. A bit more complicated
We want to create a new rating where we add 1 point if the genre is thriller but subtract 1 point if the genre is comedy.

In [110]:
def new_rating_genre(x,y): # x para Genre, y para Rank
    if 'Thriller' in x:
        y += 1
    elif 'Comedy' in x:
        y -= 1
    return y

In [111]:
df['New Rating'] = df.apply(lambda row: new_rating_genre(row['Genre'], row['Rank']), axis=1)

In [112]:
df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes,New Rating
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,Cat 4,2.75314,1
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,Cat 4,1.019839,2
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,Cat 4,1.180513,4
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,Cat 3,2.502963,3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,Cat 4,2.642439,5


### Challenge 4. A bit too weird...
We want to know whether the integer part of the number resulting from the sum of the ASCII value of every character of the movie title divided by the number of votes, is a prime number (remember that prime numbers are integers).

In [113]:
# Código ASCII: https://elcodigoascii.com.ar/

# Your code here (https://docs.python.org/3/library/functions.html#ord) & (https://foro.elhacker.net/scripting/pythonsumar_valor_numerico_de_cada_caracter_de_una_cadenaascii-t338102.0.html)

# Pseudocode: Ej. 13 se divide entre 1 y así mismo

# Guardians of the Galaxy(sum(ASCII characters) / number_of_votes) = prime_number

 # x for title
def sum_ascii(x):
    sum_char = 0
    for char in x:
        sum_char += ord(char)
    return sum_char

In [114]:
# Weird results

division = df.apply(lambda row: int(sum_ascii(row['Title']) / row['Votes']), axis=1)
print(division)

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Length: 1000, dtype: int64


In [115]:
def prime(number):
    if type(number) == float:
        number = int(number)
    if number < 2:
        return False
    for i in range(2, number):
        if number % i == 0:
            return False
    return True

In [116]:
df['Is prime?'] = division.apply(lambda division_result: 'Yes' if prime(division_result) else 'No')

In [117]:
df[df['Is prime?'] == 'Yes'].head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes,New Rating,Is prime?
44,45,Lowriders,Drama,A young street artist in East Los Angeles is c...,Ricardo de Montreuil,"Gabriel Chavarria, Demián Bichir, Theo Rossi,T...",2016,99,6.3,279,4.21,57.0,Cat 1,0.042525,45,Yes
68,69,Wakefield,Drama,A man's nervous breakdown causes him to leave ...,Robin Swicord,"Bryan Cranston, Jennifer Garner, Beverly D'Ang...",2016,106,7.5,291,0.01,61.0,Cat 1,9.4e-05,69,Yes
112,113,The Bad Batch,"Romance,Sci-Fi",A dystopian love story in a Texas wasteland an...,Ana Lily Amirpour,"Keanu Reeves, Jason Momoa, Jim Carrey, Diego Luna",2016,118,6.1,512,,65.0,Cat 1,,113,Yes
293,294,The Exception,Drama,A German soldier tries to determine if the Dut...,David Leveaux,"Lily James, Jai Courtney, Christopher Plummer,...",2016,107,7.7,96,,,Cat 1,,294,Yes
307,308,Vincent N Roxxy,"Crime,Drama,Thriller",A small town loner and a rebellious punk rocke...,Gary Michael Schultz,"Emile Hirsch, Zoë Kravitz, Zoey Deutch,Emory C...",2016,110,5.5,403,,,Cat 1,,309,Yes


### Challenge 5. And finally some fantasy
Feel free to propose your own ranking based in aggregations of at least 3 columns of the dataset.

In [118]:
fantasy_ranking.loc[:, 'Is prime?'] = 'Yes'
fantasy_ranking.loc[:, 'bin'] = 'Cat 5'
fantasy_ranking.loc[:, 'New Rating'] = fantasy_ranking['New Rating'].max()
fantasy_ranking.loc[:, 'Revenue (Millions)'] = fantasy_ranking['Revenue (Millions)'].max()

In [119]:
fantasy_ranking.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,bin,Revenue per minutes,New Rating,Is prime?
36,37,Interstellar,"Adventure,Drama,Sci-Fi",A team of explorers travel through a wormhole ...,Christopher Nolan,"Matthew McConaughey, Anne Hathaway, Jessica Ch...",2014,169,8.6,1047747,623.28,74.0,Cat 5,1.112367,145,Yes
54,55,The Dark Knight,"Action,Crime,Drama",When the menace known as the Joker wreaks havo...,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart,Mi...",2008,152,9.0,1791916,623.28,82.0,Cat 5,3.508684,145,Yes
76,77,The Avengers,"Action,Sci-Fi",Earth's mightiest heroes must come together an...,Joss Whedon,"Robert Downey Jr., Chris Evans, Scarlett Johan...",2012,143,8.1,1045588,623.28,69.0,Cat 5,4.358601,145,Yes
80,81,Inception,"Action,Adventure,Sci-Fi","A thief, who steals corporate secrets through ...",Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...",2010,148,8.8,1583625,623.28,74.0,Cat 5,1.976824,145,Yes
124,125,The Dark Knight Rises,"Action,Thriller",Eight years after the Joker's reign of anarchy...,Christopher Nolan,"Christian Bale, Tom Hardy, Anne Hathaway,Gary ...",2012,164,8.5,1222645,623.28,78.0,Cat 5,2.7325,145,Yes


### Bonus challenge. Freaky bonus
We want to know which movies might have hidden paterns in their description. A way to know that is finding those movies which the sum of all numeric values of the string description hash (SHA256) are between their revenue and their number of votes.

In [138]:
df = pd.read_csv('data/input/IMDB-Movie-Data.csv')
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [150]:
# Your code here (https://stackoverflow.com/questions/70711801/how-to-hash-dataframe-column-to-sha256)

# 1. Hashear la descripción de la película
import hashlib

def sha_256(x):
    return hashlib.sha256(x.encode('utf-8')).hexdigest()

In [153]:
# 2. Duplicar columna (Movie description)

df['Description SHA256'] = df['Description'].apply(lambda x: sha_256(x))
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Description SHA256
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,89e55ad0e7d96003c037ac4fc2a4ebb0717338fdb9ce20...
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1d8d06e5f9ee801692a1c8dc80f71fc0cd268afeb13115...
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,f4149191d2a3f8a6effb0e5812a15cf84d82e70dd566e9...
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,8ace2c87f3f06d9d0b1a8cae5f2d049e86ff7e52528e83...
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,75fc4955bd5d81e20cd79b1be17f07d5571e4d07a07492...


In [154]:
# 3. Reemplazar los datos de la columna 'Description SHA256' con el resultado de la función def sum_ascii(x):

def sum_ascii(x):
    sum_char = 0
    for char in x:
        sum_char += ord(char)
    return sum_char

df['Description SHA256'] = df['Description SHA256'].apply(lambda x: sum_ascii(x))
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Description SHA256
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,4531
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,4580
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,4367
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,4645
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4408


In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
 12  Description SHA256  1000 non-null   int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 101.7+ KB


In [167]:
# Transformar df['Revenue (Millions)'] a enteros:

df['Revenue (Millions)'] = df['Revenue (Millions)'].fillna(0).astype('int64')

In [168]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  1000 non-null   int64  
 11  Metascore           936 non-null    float64
 12  Description SHA256  1000 non-null   int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 101.7+ KB


In [172]:
# 4. Revisar con Booleanos, si 'Description SHA256' está entre los votos y los revenues (Millones)
df['Freaky Movie'] = df.apply(lambda row: row['Description SHA256'] >= row['Revenue (Millions)'] and row['Description SHA256'] <= row['Votes'], axis=1)
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Description SHA256,Freaky Movie
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333,76.0,4531,True
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126,65.0,4580,True
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138,62.0,4367,True
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270,59.0,4645,True
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325,40.0,4408,True


In [176]:
if df['Freaky Movie'].any() == False:
    print("There is at least a freaky movie in the DataFrame.")
else:
    print("All titles are freaky movies in the DataFrame.")

All titles are freaky movies in the DataFrame.
