In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
def transform_data(data):
    """Flatten one Turk submission into a list of per-tag row dicts.

    ``data`` is read as a mapping with ``assignment_id``, ``worker_id``,
    ``hit_id`` and an ``output`` entry that holds ``movieId`` and
    ``clusters``. Every entry of a cluster's ``tags`` list yields one row;
    the cluster's position inside ``clusters`` becomes the row's ``cluster``.

    The three flags are stored as 0/1 integers:
      * ``not_fit``       -- truthiness of the tag's 'Does not fit' entry
      * ``inappropriate`` -- truthiness of the tag's 'Inappropriate' entry
      * ``best``          -- 1 when the cluster's 'Best tag' equals the tag name
    """
    movie_id = data['output']['movieId']
    return [
        {
            'movieId': movie_id,
            'cluster': position,
            'assignment_id': data['assignment_id'],
            'worker_id': data['worker_id'],
            'hit_id': data['hit_id'],
            'tag': entry['name'],
            'not_fit': 1 if entry.get('Does not fit') else 0,
            'inappropriate': 1 if entry.get('Inappropriate') else 0,
            'best': 1 if group.get('Best tag') == entry['name'] else 0,
        }
        for position, group in enumerate(data['output']['clusters'])
        for entry in group['tags']
    ]
    

In [4]:
# Parse the Turk results file (one JSON object per line) and gather the
# flattened per-tag rows of every submission into a single list.
trans_output = []
with open('./pilot_3_result.txt') as results_file:
    for raw_line in results_file:
        turk_output = dict(json.loads(raw_line))
        trans_output.extend(transform_data(turk_output))

In [5]:
def pick_best_tag(x):
    """Label every row of one cluster group with the group's top-voted tag.

    Parameters
    ----------
    x : pandas.DataFrame
        One group of the aggregated table. It is indexed with
        ``x['best']['mean']`` and ``row['tag']``, i.e. it is expected to have
        two-level columns containing ``('best', 'mean')`` and a ``tag``
        column, with a unique row index.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with an added ``cluster_label`` column holding, on
        every row, the tag of the row whose ``best`` mean is largest (the
        first such row when several tie).

    Notes
    -----
    The lookup is label based (``idxmax`` + ``.loc``). The removed ``.ix``
    indexer and ``np.argmax`` on a Series (which now returns a position, not
    a label) are no longer used. The group is copied rather than modified in
    place because mutating the object handed to ``groupby.apply`` is not
    supported by pandas.
    """
    labelled = x.copy()
    best_row = x['best']['mean'].idxmax()
    # Selecting 'tag' from a row with two-level labels yields a length-1
    # Series, hence the .values[0] to unwrap the scalar.
    labelled['cluster_label'] = x.loc[best_row]['tag'].values[0]
    return labelled


In [6]:
# One row per (movie, cluster, tag) with the mean and sum of each 0/1 flag
# over all collected rows. String aggregator names give the same 'mean' /
# 'sum' column labels as passing np.mean / np.sum, without the deprecation
# warning newer pandas emits for the callables.
result = pd.DataFrame(trans_output)
result_agg = (
    result
    .groupby(['movieId', 'cluster', 'tag'])[['best', 'not_fit', 'inappropriate']]
    .agg(['mean', 'sum'])
)

# Drop tags that were mostly judged as not fitting or inappropriate.
# NOTE(review): the two cut-offs differ (strict `<` for not_fit, `<=` for
# inappropriate); kept as-is -- confirm this asymmetry is intended.
keep = (result_agg[('not_fit', 'mean')] < 0.5) & (result_agg[('inappropriate', 'mean')] <= 0.5)
result_agg = result_agg[keep].reset_index()

# group_keys=False keeps the flat row index on every pandas version; without
# it pandas >= 2.0 prepends movieId/cluster as index levels, which then clash
# with the columns of the same name in later groupby calls.
result_agg = result_agg.groupby(['movieId', 'cluster'], group_keys=False).apply(pick_best_tag)

In [7]:
# Display the filtered per-tag statistics with each cluster's chosen label.
result_agg

Unnamed: 0_level_0,movieId,cluster,tag,best,best,not_fit,not_fit,inappropriate,inappropriate,cluster_label
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,sum,mean,sum,mean,sum,Unnamed: 10_level_1
0,300,0,business,0.0,0,0.2,1,0.0,0,cheating
1,300,0,cheating,0.4,2,0.0,0,0.2,1,cheating
2,300,0,corruption,0.4,2,0.0,0,0.2,1,cheating
3,300,0,crime,0.0,0,0.0,0,0.4,2,cheating
4,300,0,greed,0.0,0,0.0,0,0.0,0,cheating
5,300,0,investigation,0.0,0,0.0,0,0.0,0,cheating
6,300,0,manipulation,0.0,0,0.0,0,0.0,0,cheating
7,300,0,morality,0.2,1,0.0,0,0.0,0,cheating
8,300,0,obsession,0.0,0,0.2,1,0.0,0,cheating
9,300,1,1950s,0.8,4,0.0,0,0.0,0,1950s


In [109]:
# Collect one JSON-encoded {"movieId", "clusters"} record per movie.
# Rows of the same movie are assumed to be contiguous in result_agg
# (NOTE(review): true when it comes straight from a groupby -- confirm).
prev_mid = None  # no movie seen yet

output = []
movie_dict = {}
# .tolist() converts numpy scalars to plain Python values so that
# json.dumps can serialise the ids.
for mid in result_agg['movieId'].values.ravel().tolist():
    if mid != prev_mid:
        # A new movie starts: emit the finished record before replacing it.
        if movie_dict:
            output.append(json.dumps(movie_dict))
        # NOTE(review): 'clusters' is never filled in by this cell; the
        # groupby-based `tmp` cell further down builds the populated records.
        movie_dict = {'movieId': mid, 'clusters': []}
        # Remember the current movie so its remaining rows do not start a
        # new record.
        prev_mid = mid

# The last movie has no following row to trigger the flush above.
if movie_dict:
    output.append(json.dumps(movie_dict))

In [199]:
# Step 1: collapse every (movie, cluster label) pair into a tuple of its tags.
tmp = (
    result_agg
    .groupby(['movieId', 'cluster_label'])['tag']
    .agg(lambda tags: tuple(tags))
    .reset_index()
)
# Step 2: gather each movie's rows into a list of record dicts; downstream
# cells read that list back through the label 0.
tmp = (
    tmp
    .groupby('movieId')
    .apply(lambda movie_rows: movie_rows.to_dict(orient='records'))
    .reset_index()
)

In [169]:
# Write one JSON object per line: the movie id plus its list of cluster records.
# NOTE(review): the raw results are read from pilot_3_result.txt but this
# writes a *_pilot_2.txt file -- confirm the output name is the intended one.
with open('processed_results_pilot_2.txt', 'w') as out_file:
    for _, movie_row in tmp.iterrows():
        record = {'movieId': movie_row['movieId'], 'clusters': movie_row[0]}
        out_file.write(json.dumps(record) + '\n')


In [207]:
# Per-movie records (movie id plus its list of cluster dicts) as plain dicts.
output_movies = [
    {'movieId': movie_row['movieId'], 'clusters': movie_row[0]}
    for _, movie_row in tmp.iterrows()
]

In [208]:
# Show the assembled per-movie cluster records.
output_movies

[{'clusters': [{'cluster_label': u'drama',
    'movieId': 1213,
    'tag': (u'dialogue', u'drama', u'masterpiece', u'storytelling')},
   {'cluster_label': u'mafia',
    'movieId': 1213,
    'tag': (u'crime', u'gangster', u'gangsters', u'mafia', u'mob')},
   {'cluster_label': u'original',
    'movieId': 1213,
    'tag': (u'interesting', u'original')},
   {'cluster_label': u'stylish',
    'movieId': 1213,
    'tag': (u'stylish', u'visceral')}],
  'movieId': 1213},
 {'clusters': [{'cluster_label': u'foreign',
    'movieId': 2324,
    'tag': (u'foreign',)},
   {'cluster_label': u'holocaust',
    'movieId': 2324,
    'tag': (u'camp', u'holocaust', u'jews', u'nazi', u'nazis', u'wwii')},
   {'cluster_label': u'storytelling',
    'movieId': 2324,
    'tag': (u'masterpiece', u'storytelling')},
   {'cluster_label': u'survival',
    'movieId': 2324,
    'tag': (u'love', u'survival', u'tragedy')},
   {'cluster_label': u'touching',
    'movieId': 2324,
    'tag': (u'bittersweet',
     u'dramatic',


In [12]:
# Index the Turk input movies by their movieId; the file holds one JSON
# object per line.
movie_info = {}
with open('turk_input_movies_pilot_2.txt', 'r') as movies_file:
    for raw_line in movies_file:
        movie = dict(json.loads(raw_line))
        movie_info[movie['movieId']] = movie

In [15]:
# Show the loaded input movies, keyed by movieId.
movie_info

{1213: {u'clusters': [{u'tags': [{u'name': u'mafia'},
     {u'name': u'gangster'},
     {u'name': u'gangsters'},
     {u'name': u'mob'},
     {u'name': u'crime'},
     {u'name': u'mentor'}]},
   {u'tags': [{u'name': u'violent'},
     {u'name': u'narrated'},
     {u'name': u'violence'},
     {u'name': u'stylish'},
     {u'name': u'visceral'},
     {u'name': u'stylized'},
     {u'name': u'bloody'},
     {u'name': u'brutality'}]},
   {u'tags': [{u'name': u'masterpiece'},
     {u'name': u'storytelling'},
     {u'name': u'drama'},
     {u'name': u'dialogue'}]},
   {u'tags': [{u'name': u'interesting'}, {u'name': u'original'}]}],
  u'movieDescription': u'Henry Hill is a small time gangster, who takes part in a robbery with Jimmy Conway and Tommy De Vito, two other gangsters who have set their sights a bit higher. His two partners kill off everyone else involved in the robbery, and slowly start to climb up through the hierarchy of the Mob. Henry, however, is badly affected by his partners succ