In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
import random

# Load the movie metadata. drop_duplicates() returns a new frame instead of
# modifying the original, so its result is assigned to keep the de-duplicated rows.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column; if that is a
# per-row index, whole-row duplicates can never match - confirm which columns
# should define a duplicate.
data = pd.read_csv(r'dados/movie_metadata.csv').drop_duplicates()
# Privacy budgets handed to the randomised queries.
budgets = [0.1, 1, 10]

data

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


# Consultas

In [2]:
# Query 1: movie with the highest gross
def consulta1(data):
    """Return the row (title and gross) of the movie with the largest gross.

    Rows missing either the title or the gross are ignored. The result is the
    ``movie_title``/``gross`` row located by the label of the maximum gross.
    """
    columns = ['movie_title', 'gross']
    complete = data[columns].dropna()
    top_label = complete.gross.idxmax()
    return complete.loc[top_label]

# Query 2: movie with the highest gross for each language
def consulta2(data):
    """Return, for every language, the row of its highest-grossing movie.

    Rows missing the title, the language or the gross are ignored. The result
    keeps the ``movie_title``, ``language`` and ``gross`` columns, one row per
    language.
    """
    columns = ['movie_title', 'language', 'gross']
    complete = data[columns].dropna()
    top_labels = complete.groupby('language')['gross'].idxmax()
    return complete.loc[top_labels]

# Query 3: top 3 countries by number of movies
def consulta3(data):
    """Return the (up to) three countries with the most distinct movie titles.

    Rows missing the country or the title are ignored. The result has the
    columns ``movie_title`` (number of distinct titles) and ``country``,
    ordered from the largest count down; its index holds each country's
    position in the alphabetically sorted country list.

    When fewer than three countries are present, only those are returned
    instead of failing.
    """
    aux = data[['country', 'movie_title']].dropna()
    # reset_index() turns the grouped country index back into a regular column.
    country_list = (
        aux.groupby('country')['movie_title']
        .nunique()
        .reset_index()[['movie_title', 'country']]
    )
    # nlargest keeps the first occurrence on ties, like repeated idxmax would.
    return country_list.nlargest(3, 'movie_title')

# Funções Score

In [3]:
def score1(item, out):
    """Score a candidate output title ``out`` against a single record.

    Returns the record's gross when its title equals ``out``; returns 0 when
    the gross is NaN, the title is the literal string 'None', or the titles
    differ.
    """
    if np.isnan(item.gross) or item.movie_title == 'None':
        return 0
    return item.gross if item.movie_title == out else 0

# o score da segunda consulta é igual ao da primeira

def score3(data, item, out):
    """Score a candidate output country ``out`` against a single record.

    Returns the number of rows in ``data`` whose country equals the record's
    country when that country is ``out``, and 0 otherwise (including when the
    record's country is missing, since NaN never equals ``out``).
    """
    if item.country != out:
        return 0
    # Vectorised count; avoids rebuilding a per-country dict row by row on
    # every call and does not depend on NaN values being usable as dict keys.
    return int((data.country == item.country).sum())

# Funções de sensibilidade

In [4]:
def sense_q1(data):
    """Estimate the sensitivity used for the gross-based queries.

    Each row is scored with ``score1`` against its own title, and the result
    is the largest absolute difference between the scores of consecutive
    rows (0 when there are fewer than two rows).
    """
    titles = data['movie_title']
    scores = [score1(data.iloc[pos], titles.iloc[pos]) for pos in range(len(data))]

    widest = 0
    for current, following in zip(scores, scores[1:]):
        widest = max(widest, abs(current - following))
    return widest

# como os dois scores são iguais, a função de sensibilidade pode ser a mesma

def sense_q3(data):
    """Estimate the sensitivity used for the country-count query.

    Every distinct country (missing values and the literal 'None' excluded)
    is scored by its number of rows, and the result is the largest absolute
    difference between the scores of consecutive countries, taken in order of
    first appearance. Returns 0 when fewer than two countries are present.

    NOTE(review): this mirrors the adjacent-difference heuristic of sense_q1.
    For a pure counting score, adding or removing one record changes a count
    by at most 1 - confirm which definition of sensitivity is intended.
    """
    countries = data.country.dropna()
    countries = countries[countries != 'None']
    if countries.empty:
        return 0

    # pd.unique keeps first-appearance order, so the candidate order is stable.
    ordered = pd.unique(countries)
    scores = countries.value_counts().reindex(ordered).to_numpy()
    if len(scores) < 2:
        return 0
    return int(np.abs(np.diff(scores)).max())

# Randomização de consultas

In [5]:
def rand_q1(budget, sense, data):
    """Randomised query 1: sample a movie title with the exponential mechanism.

    Every row with a title is a candidate. Its score is its gross, or 0 when
    the gross is missing or the title is the literal 'None' (the same rule
    score1 applies). A candidate is drawn with probability proportional to
    ``exp(budget * score / (2 * sense))``.

    Parameters:
        budget: privacy budget (epsilon) of the query.
        sense: sensitivity of the score; must be positive.
        data: DataFrame with ``movie_title`` and ``gross`` columns.

    Returns:
        The title of the sampled row.

    Raises:
        ValueError: if ``sense`` is not positive or no row has a title.
    """
    if not sense > 0:
        raise ValueError('sense must be a positive number')
    candidates = data.dropna(subset=['movie_title'])
    if len(candidates) == 0:
        raise ValueError('data has no movie titles to sample from')

    scores = candidates.gross.fillna(0).to_numpy(dtype=float)
    scores = np.where((candidates.movie_title == 'None').to_numpy(), 0.0, scores)

    logits = budget * scores / (2 * sense)
    # Subtracting the maximum leaves the probabilities unchanged but keeps
    # exp() from overflowing for large scores.
    weights = np.exp(logits - logits.max())
    pk = weights / weights.sum()

    chosen = np.random.choice(len(candidates), p=pk)
    return candidates.movie_title.iloc[chosen]

    
def rand_q2(budget, sense, data):
    """Randomised query 2: one randomised top-grossing title per language.

    The rows are split by language (rows with a missing language are left
    out, languages are visited in sorted order) and rand_q1 is applied to
    each language's sub-frame with the given budget and sensitivity.

    Returns:
        A list with one rand_q1 result per language.
    """
    out = []
    # groupby hands each language's rows over as a DataFrame, which is what
    # rand_q1 receives for the whole data set as well.
    for _, language_rows in data.groupby('language'):
        out.append(rand_q1(budget, sense, language_rows))
    return out


def rand_q3(budget, sense, data):
    """Randomised query 3: sample three distinct countries.

    The budget is split evenly over three draws. In each draw every country
    not yet chosen is a candidate, scored by its number of rows, and one is
    drawn with probability proportional to
    ``exp((budget / 3) * score / (2 * sense))``. Rows with a missing country
    are ignored.

    Parameters:
        budget: total privacy budget (epsilon) of the query.
        sense: sensitivity of the score; must be positive.
        data: DataFrame with a ``country`` column.

    Returns:
        A list of distinct country names in the order drawn; shorter than
        three when the data holds fewer than three countries.

    Raises:
        ValueError: if ``sense`` is not positive.
    """
    if not sense > 0:
        raise ValueError('sense must be a positive number')
    budget = budget / 3

    counts = data.country.value_counts()  # NaN countries are excluded
    remaining = list(counts.index)
    scores = counts.to_numpy(dtype=float)

    rand = []
    for _ in range(min(3, len(remaining))):
        logits = budget * scores / (2 * sense)
        # Recomputing the stabilised weights over the remaining candidates
        # keeps later draws well defined even when the first winner dominates.
        weights = np.exp(logits - logits.max())
        pos = int(np.random.choice(len(remaining), p=weights / weights.sum()))
        rand.append(remaining.pop(pos))
        scores = np.delete(scores, pos)
    return rand

# Execução

In [None]:
# Queries 1 and 2 share the same score function, so one sensitivity value
# serves both.
sense1 = sense_q1(data)

# Exact queries, their randomised counterparts and the matching sensitivities,
# each listed in the order q1, q2, q3.
# NOTE(review): `queries` is not referenced by the code visible in this notebook.
queries = [consulta1, consulta2, consulta3]
rand_queries = [rand_q1, rand_q2, rand_q3]
senses = [sense1, sense1, sense_q3(data)]

def dp_exponential(budgets, res, senses):
    """Write one CSV row per budget with its query results and sensitivities.

    Parameters:
        budgets: sequence of budgets, one per output row.
        res: ``res[i]`` holds the three query results obtained with ``budgets[i]``.
        senses: the three sensitivities, repeated on every row.

    The table is saved to ``saida/result.csv`` without the index column.
    """
    header = ['budget', 'result_q1', 'result_q2', 'result_q3', 'sens_q1', 'sens_q2', 'sens_q3']
    rows = [[budget, *res[pos], *senses] for pos, budget in enumerate(budgets)]
    table = pd.DataFrame(rows, columns=header)
    table.to_csv(r'saida/result.csv', index=False)
    
def responses(data, senses, budgets, rand_list):
    """Run every randomised query once for each budget.

    ``rand_list[j]`` is called as ``rand_list[j](budget, senses[j], data)``.
    Returns a list with one entry per budget, each entry being the list of
    that budget's query results in ``rand_list`` order.
    """
    return [
        [query(budget, senses[pos], data) for pos, query in enumerate(rand_list)]
        for budget in budgets
    ]
            
# One list of randomised query results per budget, in the order of `budgets`.
total_results = responses(data, senses, budgets, rand_queries)  

In [None]:
# Persist the budgets, randomised results and sensitivities to saida/result.csv.
dp_exponential(budgets, total_results, senses)