# import packages

In [1]:
import pickle
import re
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans

In [2]:
from dataReader import *



# read data

In [4]:
with open('../data/processed/movies.pkl', 'rb') as movies_pkl:
    movies = pickle.load(movies_pkl)

# clustering

In [5]:
def returnGenre(s):
    p = re.finditer(r'name', s)
    position = [m.start(0) for m in p]
    allGenres = set()
    for pos in position:
        genre = ''
        for i in range(pos+8, len(s)):
            if s[i] == '\'':
                break
            genre += s[i]
        allGenres.add(genre)
    return allGenres

In [6]:
uniqueGenres = set()
uniqueLanguages = set()
for movie in movies:
    uniqueGenres.update(returnGenre(movie.genres))
    if isinstance(movie.lang, str):
        uniqueLanguages.add(movie.lang)

In [7]:
uniqueGenresDict = {k: v for v, k in enumerate(list(uniqueGenres))}
uniqueLanguagesDict = {k: v for v, k in enumerate(list(uniqueLanguages))}

In [74]:
X = []
for movie in movies:
    if not isinstance(movie.lang, str):
        language = len(uniqueLanguagesDict)
    else:
        language = uniqueLanguagesDict[movie.lang]
    revenue = movie.revenue
    vote = movie.vote_average
    num_female = len(movie.cast.get_female_cast())
    num_male = len(movie.cast.get_male_cast())
    genres = returnGenre(movie.genres)
    for g in genres:
        genre = uniqueGenresDict[g]
        line = [language, revenue, vote, num_female, num_male, genre]
    X.append(line)
X = np.array(X)

In [102]:
# clustering = SpectralClustering(n_clusters=5).fit(X)
clustering = KMeans(n_clusters=20).fit(X)

In [104]:
allData = np.c_[X,clustering.labels_]

In [123]:
labelGenre = {}
for row in allData:
    label = int(row[6])
    if label in labelGenre:
        labelGenre[label].append(int(row[5]))
    else:
        labelGenre[label] = [int(row[5])]
for item in labelGenre:
    lst = labelGenre[item]
    dom_genre = max(set(lst), key=lst.count)
    labelGenre[item] = list(uniqueGenresDict.keys())[list(uniqueGenresDict.values()).index(dom_genre)]

In [124]:
labelFemale = {}
labelMale = {}
for row in allData:
    label = int(row[6])
    if label in labelFemale:
        labelFemale[label].append(int(row[3]))
    else:
        labelFemale[label] = [int(row[3])]
    if label in labelMale:
        labelMale[label].append(int(row[4]))
    else:
        labelMale[label] = [int(row[4])]
for label in labelFemale:
    lst = labelFemale[label]
    labelFemale[label] = sum(lst)/len(lst)
for label in labelMale:
    lst = labelMale[label]
    labelMale[label] = sum(lst)/len(lst)

In [125]:
for label in labelGenre:
    genre = labelGenre[label]
    percFemale = round(float(labelFemale[label])/(labelFemale[label]+labelMale[label]), 3)
    percMale = round(float(labelMale[label])/(labelFemale[label]+labelMale[label]), 3)
    print (label, genre, percFemale, percMale)

(0, 'Comedy', 0.333, 0.667)
(1, 'Family', 0.238, 0.762)
(2, 'Comedy', 0.3, 0.7)
(3, 'Adventure', 0.323, 0.677)
(4, 'Comedy', 0.313, 0.688)
(5, 'Fantasy', 0.349, 0.651)
(6, 'Family', 0.318, 0.682)
(7, 'Family', 0.308, 0.692)
(8, 'Comedy', 0.313, 0.688)
(9, 'Adventure', 0.31, 0.69)
(10, 'Adventure', 0.327, 0.673)
(11, 'Comedy', 0.333, 0.667)
(12, 'Crime', 0.316, 0.684)
(13, 'Thriller', 0.25, 0.75)
(14, 'Science Fiction', 0.375, 0.625)
(15, 'Comedy', 0.333, 0.667)
(16, 'Family', 0.35, 0.65)
(17, 'Comedy', 0.353, 0.647)
(18, 'Family', 0.229, 0.771)
(19, 'Adventure', 0.303, 0.697)


Interesting things to note:

- In most genres, the distribution is 0.3, 0.7
- Fantasy and Science Fiction movies have higher percentage of female cast than other genres
- Family movies have the highest percentage of male cast than other genres (weird?)

# similarity