# import packages

In [1]:
import pickle
import re
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans

In [2]:
from dataReader import *



# read data

In [3]:
with open('../data/processed/movies.pkl', 'rb') as movies_pkl:
    movies = pickle.load(movies_pkl)

# clustering

In [4]:
def returnGenre(s):
    p = re.finditer(r'name', s)
    position = [m.start(0) for m in p]
    allGenres = set()
    for pos in position:
        genre = ''
        for i in range(pos+8, len(s)):
            if s[i] == '\'':
                break
            genre += s[i]
        allGenres.add(genre)
    return allGenres

In [5]:
uniqueGenres = set()
uniqueLanguages = set()
for movie in movies:
    uniqueGenres.update(returnGenre(movie.genres))
    if isinstance(movie.lang, str):
        uniqueLanguages.add(movie.lang)

In [6]:
uniqueGenresDict = {k: v for v, k in enumerate(list(uniqueGenres))}
uniqueLanguagesDict = {k: v for v, k in enumerate(list(uniqueLanguages))}

In [7]:
X = []
for movie in movies:
    if not isinstance(movie.lang, str):
        language = len(uniqueLanguagesDict)
    else:
        language = uniqueLanguagesDict[movie.lang]
    revenue = movie.revenue
    vote = movie.vote_average
    num_female = len(movie.cast.get_female_cast())
    num_male = len(movie.cast.get_male_cast())
    genres = returnGenre(movie.genres)
    for g in genres:
        genre = uniqueGenresDict[g]
        line = [language, revenue, vote, num_female, num_male, genre]
    X.append(line)
X = np.array(X)

In [8]:
# clustering = SpectralClustering(n_clusters=5).fit(X)
clustering = KMeans(n_clusters=20).fit(X)

In [9]:
allData = np.c_[X,clustering.labels_]

In [10]:
labelGenre = {}
for row in allData:
    label = int(row[6])
    if label in labelGenre:
        labelGenre[label].append(int(row[5]))
    else:
        labelGenre[label] = [int(row[5])]
for item in labelGenre:
    lst = labelGenre[item]
    dom_genre = max(set(lst), key=lst.count)
    labelGenre[item] = list(uniqueGenresDict.keys())[list(uniqueGenresDict.values()).index(dom_genre)]

In [11]:
labelFemale = {}
labelMale = {}
for row in allData:
    label = int(row[6])
    if label in labelFemale:
        labelFemale[label].append(int(row[3]))
    else:
        labelFemale[label] = [int(row[3])]
    if label in labelMale:
        labelMale[label].append(int(row[4]))
    else:
        labelMale[label] = [int(row[4])]
for label in labelFemale:
    lst = labelFemale[label]
    labelFemale[label] = sum(lst)/len(lst)
for label in labelMale:
    lst = labelMale[label]
    labelMale[label] = sum(lst)/len(lst)

In [12]:
for label in labelGenre:
    genre = labelGenre[label]
    percFemale = round(float(labelFemale[label])/(labelFemale[label]+labelMale[label]), 3)
    percMale = round(float(labelMale[label])/(labelFemale[label]+labelMale[label]), 3)
    print (label, genre, percFemale, percMale)

0 Comedy 0.332 0.668
17 Comedy 0.327 0.673
10 Science Fiction 0.3 0.7
14 Fantasy 0.324 0.676
8 Science Fiction 0.327 0.673
13 Science Fiction 0.312 0.688
4 Comedy 0.334 0.666
7 Comedy 0.335 0.665
3 Comedy 0.325 0.675
6 Comedy 0.348 0.652
18 Science Fiction 0.237 0.763
16 Comedy 0.332 0.668
2 Fantasy 0.233 0.767
12 Comedy 0.32 0.68
1 Science Fiction 0.277 0.723
19 Comedy 0.342 0.658
11 Science Fiction 0.346 0.654
15 Animation 0.378 0.622
9 Science Fiction 0.375 0.625
5 Science Fiction 0.314 0.686


Interesting things to note:

- In most genres, the distribution is 0.3, 0.7
- Fantasy and Science Fiction movies have higher percentage of female cast than other genres
- Family movies have the highest percentage of male cast than other genres (weird?)

# similarity

In [13]:
setX = []
for movie in movies:
    index = movie.id
    if not isinstance(movie.lang, str):
        language = ''
    else:
        language = movie.lang
    genres = returnGenre(movie.genres)
    female = []
    for cast in movie.cast.get_female_cast():
        female.append(cast['name'])
    male = []
    for cast in movie.cast.get_male_cast():
        male.append(cast['name'])
    movie_data = [index, language] + list(genres) + female + male
    setX.append(movie_data)

In [24]:
%%time
t = [set(x) for x in setX]

CPU times: user 462 ms, sys: 12 ms, total: 474 ms
Wall time: 476 ms


In [25]:
labelSets[0]

[[2, 'fi', 'Drama', 'Crime', 'Matti Pellonpää'],
 [3,
  'fi',
  'Drama',
  'Comedy',
  'Kati Outinen',
  'Matti Pellonpää',
  'Sakari Kuosmanen',
  'Esko Nikkari',
  'Pekka Laiho'],
 [5,
  'en',
  'Comedy',
  'Crime',
  'Jennifer Beals',
  'Madonna',
  'Marisa Tomei',
  'Sammi Davis',
  'Amanda de Cadenet',
  'Valeria Golino',
  'Lili Taylor',
  'Ione Skye',
  'Alicia Witt',
  'Lana McKissack',
  'Tamlyn Tomita',
  'Kathy Griffin',
  'Salma Hayek',
  'Patricia Vonne',
  'Tim Roth',
  'Antonio Banderas',
  'Bruce Willis',
  'Quentin Tarantino',
  'Lawrence Bender',
  'David Proval',
  'Paul Calderon',
  'Marc Lawrence'],
 [17,
  'en',
  'Mystery',
  'Thriller',
  'Horror',
  'Maria Bello',
  'Abigail Stone',
  'Sophie Stuckey',
  'Sean Bean',
  'Richard Elfyn',
  'Maurice Roeves'],
 [19,
  'de',
  'Drama',
  'Science Fiction',
  'Brigitte Helm',
  'Margarete Lanner',
  'Grete Berger',
  'Helen von Münchofen',
  'Alfred Abel',
  'Gustav Fröhlich',
  'Rudolf Klein-Rogge',
  'Fritz Rasp',


In [14]:
labelSets = {}
for i in range(len(allData)):
    label = clustering.labels_[i]
    labelset = setX[i]
    if label in labelSets:
        labelSets[label].append(labelset)
    else:
        labelSets[label] = [labelset]

In [54]:
def jsimilarity(a, b):
    c = a.intersection(b)
    return len(c)/ (len(a) + len(b) - len(c))

for label in tqdm(labelSets):
    max_similarity = 0
    sets = labelSets[label]
    for a in tqdm(sets):
        for b in sets:
            a = set(a)
            b = set(b)
            # if a == b:
            #     continue
            similarity = jsimilarity(a, b)
            if similarity >= max_similarity:
                max_similarity = similarity
                most_similar = (a,b)
                if max_similarity == 1:
                    break
    print (label, labelGenre[label], most_similar, round(max_similarity,3))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=41855), HTML(value='')))

KeyboardInterrupt: 

In [32]:
from scipy.spatial.distance import pdist

In [52]:
def get_sim_matrix(X):
    sim_mat = []
    for i, x in tqdm(enumerate(X), total=len(X)):
        for j, y in enumerate(X[i:], start=i):
            sim_mat.append(jsimilarity(set(x), set(y)))