In [1]:
# IMPORTANT : si l'execution déclenche une erreur et demande nltk.download, 
# faire executer ces 2 lignes : 
# nltk.download('punkt')
# nltk.download("stopwords")

In [2]:
import requests
import nltk
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from sklearn.cluster import KMeans
from string import punctuation
#Initiatlisation du troncateur de mot
st=LancasterStemmer()
#fabrication de ma liste de stopwords
custo_stopwords = set(stopwords.words('english')+list(punctuation)+["film","director","born"])

#---------------------------------------------
# FONCTION
# input : director name (case insensitive)
# output : a dictionary of (word;frequency) based on wikipedia summary
#---------------------------------------------
def GetWikipediaTokenizedSummary(director):
    # Récupération de la page exacte wikipedia (après redirection) à partir du nom de l'auteur non normalisé
    director.replace(" ","+")
    mysearchURL = "https://en.wikipedia.org/w/index.php?search="+director
    mysummaryURL = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&redirects=true&exintro&explaintext&titles="
    http = requests.get(mysearchURL)
    #On vire les 30 premiers caractères pour trouver le nom normalisé de l'auteur
    exactURL = mysummaryURL + http.url[30:]
    #Récupération du résumé
    fullresults= requests.get(exactURL).text
    summaryindex = fullresults.find("extract")
    summary =fullresults[summaryindex+10:-5]

    #on vire les caractères spéciaux
    summary.encode('ascii', 'replace')

    #Tokenisation
    sents = sent_tokenize(summary)
    words=[word_tokenize(sent) for sent in sents]

    #Nettoyage de la liste des mots selon certains critères, et décompte
    final_words = dict()
    for sent in words:
        for word in sent:
            word = word.lower()

            # On ignore le nom de l'auteur
            found=False
            for director_part in director.split(" "):
                if word.find(director_part.lower())!=-1:
                    found=True
            if found:
                continue

            #Finalement je ne tronque pas les mots, ça gêne l'interpretation
            #word = st.stem(word)

            # On ignore les mots avec des digits ou antislash dans la liste des stopwords
            if word in custo_stopwords or re.search("[0-9\\\\]+", word):
                continue
            # On ignore les mots de 2 lettres
            if len(word)<3 :
                continue

            #Si on arrive là alors on peut compter le mot    
            if word in final_words:
                final_words[word] += 1
            else:
                final_words[word] = 1
    return final_words
#Si on veut afficher le résultat en triant par occurences décroissantes
#for key in sorted(final_words, key=final_words.get, reverse=True):
#    print(key+":"+str(final_words[key]))

In [10]:
# Le plan
# 1. Récupérer pour les 250 réalisateurs les mieux notés en moyenne
# 2. Créer une matrice géante, index = réalisateur, colonnes = tous les mots
# 3. Appliquer une pondération IDF (pondérer chaque mot par l'inverse de la fréquence)
# 4. Lancer un algo de clustering

#Step 1
df = pd.read_csv("top250_acclaimed_directors.csv", sep=";", low_memory=False, names = ["id", "director_name"])
df.set_index("director_name",inplace=True)
df.drop("id",axis=1,inplace=True)

In [43]:
x=0
#Step 2
for x in range(len(df)):
    dict_tokens = GetWikipediaTokenizedSummary(df.iloc[x].name)
    print("Retrieving ("+str(x)+ "/" + str(len(df)) +") summary for : "+df.iloc[x].name+","+str(len(dict_tokens))+" distinct words")
    for word in dict_tokens:
        label = df.iloc[x].name
        df.loc[label,word]=dict_tokens[word]
    x+=1
# defaulting des valeurs NaN
df.fillna(0, inplace=True)
df_backup = df.copy()

Retrieving (0/250) summary for : Todd Kessler,17 distinct words
Retrieving (1/250) summary for : John Krokidas,12 distinct words
Retrieving (2/250) summary for : Josh Boone,7 distinct words
Retrieving (3/250) summary for : David Foenkinos,22 distinct words
Retrieving (4/250) summary for : Michael Grandage,11 distinct words
Retrieving (5/250) summary for : Lake Bell,45 distinct words
Retrieving (6/250) summary for : Bruce Timm,19 distinct words
Retrieving (7/250) summary for : Charlie Kaufman,64 distinct words
Retrieving (8/250) summary for : Houda Benyamina,29 distinct words
Retrieving (9/250) summary for : Chris McKay,38 distinct words
Retrieving (10/250) summary for : Jonathan Dayton,33 distinct words
Retrieving (11/250) summary for : Louis D'Esposito,76 distinct words
Retrieving (12/250) summary for : Marco Berger,46 distinct words
Retrieving (13/250) summary for : Jan Komasa,7 distinct words
Retrieving (14/250) summary for : John Kahrs,50 distinct words
Retrieving (15/250) summary 

Retrieving (124/250) summary for : James Parrott,9 distinct words
Retrieving (125/250) summary for : Bill Paxton,37 distinct words
Retrieving (126/250) summary for : Mark Romanek,48 distinct words
Retrieving (127/250) summary for : Gabriel Axel,14 distinct words
Retrieving (128/250) summary for : Nick Park,76 distinct words
Retrieving (129/250) summary for : James W. Horne,19 distinct words
Retrieving (130/250) summary for : David Fincher,52 distinct words
Retrieving (131/250) summary for : Jon M. Chu,45 distinct words
Retrieving (132/250) summary for : Gary Hustwit,57 distinct words
Retrieving (133/250) summary for : Eli Craig,31 distinct words
Retrieving (134/250) summary for : Jorge R. Gutierrez,28 distinct words
Retrieving (135/250) summary for : Tsuneo Kobayashi,14 distinct words
Retrieving (136/250) summary for : Ryan Coogler,24 distinct words
Retrieving (137/250) summary for : Bill Pohlad,22 distinct words
Retrieving (138/250) summary for : Giulio Ricciarelli,5 distinct words
Re

Retrieving (247/250) summary for : Martin Scorsese,162 distinct words
Retrieving (248/250) summary for : Sanjay Leela Bhansali,108 distinct words
Retrieving (249/250) summary for : Tom Shankland,32 distinct words


In [41]:
df.describe()

Unnamed: 0,award,winning,american,television,writer,producer,among,credits,showrunner,co-creator,...,goliyon,box-office,collections,kom,bajirao,mastani,otto,fades,ripper,bait
count,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,...,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0
mean,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,...,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004,0.004
std,0.008057,0.019696,0.006758,0.009512,0.009715,0.006831,0.013975,0.025559,0.044631,0.036368,...,0.063246,0.063246,0.063246,0.063246,0.063246,0.063246,0.063246,0.063246,0.063246,0.063246
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.006369,0.0,0.007194,0.0,0.0,0.010204,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.057325,0.153846,0.064748,0.057143,0.05,0.05102,0.052632,0.166667,0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [60]:
#On vire tous les mots qui apparaissent une fois (probablement des titres de film)
serie = df.sum(axis=0)
for key in df:
    if serie[key]==1:
        df.drop(key,axis=1,inplace=True)
        print("dropped : "+key)


dropped : preschool
dropped : clues
dropped : kill
dropped : darlings
dropped : soccer
dropped : bestseller
dropped : audrey
dropped : tautou
dropped : renaudot
dropped : donmar
dropped : warehouse
dropped : siegel
dropped : boston
dropped : legal
dropped : surface
dropped : childrens
dropped : complicated
dropped : dollar
dropped : arm
dropped : escape
dropped : worst
dropped : ...
dropped : stuart
dropped : synecdoche
dropped : well-received
dropped : celebrated
dropped : sister
dropped : oulaya
dropped : paradise
dropped : divines
dropped : taylor
dropped : moral
dropped : orel
dropped : phil
dropped : nightwing
dropped : jersey
dropped : sign
dropped : constitution
dropped : speaker
dropped : senate
dropped : arrested
dropped : treason
dropped : connection
dropped : aaron
dropped : burr
dropped : conspiracy
dropped : recovered
dropped : llc
dropped : burbank
dropped : wholly
dropped : division
dropped : feige
dropped : reorganized
dropped : marvel-character
dropped : exceeded
dropp

dropped : telecom
dropped : cagliostro
dropped : valley
dropped : neighbor
dropped : totoro
dropped : porco
dropped : rosso
dropped : mononoke
dropped : spirited
dropped : away
dropped : ponyo
dropped : rises
dropped : announced
dropped : recurrence
dropped : wholesomeness
dropped : patterns
dropped : importance
dropped : craftsmanship
dropped : difficulty
dropped : maintaining
dropped : pacifist
dropped : ethic
dropped : violent
dropped : present
dropped : antagonists
dropped : redeeming
dropped : qualities
dropped : impact
dropped : suggested
dropped : praising
dropped : depth
dropped : artistry
dropped : world-renowned
dropped : jack-jack
dropped : voices
dropped : newsreel
dropped : skinner
dropped : 'steward
dropped : robots
dropped : chatter
dropped : telephone
dropped : mini
dropped : toons
dropped : untitled
dropped : derek
dropped : connolly
dropped : inventor
dropped : film-making
dropped : juxtaposing
dropped : extreme
dropped : close-up
dropped : sword
dropped : sandal
drop

dropped : offered
dropped : rocko
dropped : green
dropped : beret
dropped : wormholes
dropped : finale
dropped : resigned
dropped : blvd.
dropped : usa
dropped : sponge
dropped : accolade
dropped : heal
dropped : bay
dropped : elevating
dropped : cartoonists
dropped : despite
dropped : controversies
dropped : speculation
dropped : sexual
dropped : orientation
dropped : lawsuit
dropped : filed
dropped : diagnosed
dropped : amyotrophic
dropped : lateral
dropped : sclerosis
dropped : als
dropped : stated
dropped : continue
dropped : spiegel
dropped : kaufman
dropped : daft
dropped : punk
dropped : fatboy
dropped : slim
dropped : weezer
dropped : beastie
dropped : kanye
dropped : jackass
dropped : inc..
dropped : skateboard
dropped : skateboards
dropped : riders
dropped : rick
dropped : howard
dropped : mike
dropped : cunningham
dropped : michel
dropped : gondry
dropped : palm
dropped : brownsburg
dropped : herald
dropped : tribune
dropped : join
dropped : newspaper
dropped : sunday
droppe

dropped : scripted
dropped : dangal
dropped : beijing
dropped : brics
dropped : fifth
dropped : non-english
dropped : collected
dropped : highest-grossers
dropped : telstra
dropped : choice
dropped : caldwell
dropped : polish
dropped : officer
dropped : adventurer
dropped : squadron
dropped : prisoner
dropped : start
dropped : explorers
dropped : documenting
dropped : board
dropped : pan
dropped : airways
dropped : always
dropped : priority
dropped : rko
dropped : co-inventor
dropped : cinerama
dropped : projection
dropped : robo
dropped : ova
dropped : lunar
dropped : idolm
dropped : ster
dropped : swiss
dropped : sierre
dropped : illustration
dropped : emile
dropped : cohl
dropped : ecal
dropped : cantonale
dropped : lausanne
dropped : freelance
dropped : ravioli
dropped : courgette
dropped : oficial
dropped : iranian
dropped : separation
dropped : syrian
dropped : halloween
dropped : mohammad
dropped : messenger
dropped : daughter
dropped : rima
dropped : monla
dropped : amman
dropp

dropped : tron
dropped : sculptor
dropped : trained
dropped : taught
dropped : hound
dropped : supervisors
dropped : cutting-edge
dropped : adapting
dropped : resulting
dropped : teamed
dropped : calarts
dropped : alum
dropped : co-write
dropped : featurettes
dropped : doom
dropped : simon-penned
dropped : marrying
dropped : jam
dropped : neverland
dropped : cronkite
dropped : disneyland
dropped : educational
dropped : tourist
dropped : hell
dropped : backstage
dropped : leiva
dropped : seventy-five
dropped : percent
dropped : storyboarded
dropped : weeks
dropped : recording
dropped : mgm
dropped : switched
dropped : chiefs
dropped : tentatively
dropped : record-setting
dropped : multimedia
dropped : parks
dropped : dangerous
dropped : sequences
dropped : cranium
dropped : command
dropped : epcot
dropped : spectacular
dropped : roller
dropped : coaster
dropped : extra-terrorestrial
dropped : encounter
dropped : magic
dropped : cinemagique
dropped : pre-ride
dropped : ride
dropped : ran

dropped : mental
dropped : deconstruction
dropped : moyoko
dropped : animage
dropped : nadia
dropped : avant-garde
dropped : face
dropped : accomplishing
dropped : trabzon
dropped : istanbul
dropped : turkey
dropped : arzu
dropped : plumpe
dropped : schopenhauer
dropped : nietzsche
dropped : ibsen
dropped : seen
dropped : reinhardt
dropped : commander
dropped : eastern
dropped : front
dropped : crashes
dropped : without
dropped : severe
dropped : nosferatu
dropped : bram
dropped : dracula
dropped : copyright
dropped : expressionist
dropped : laugh
dropped : interpretation
dropped : goethe
dropped : faust
dropped : emigrated
dropped : sunrise
dropped : devils
dropped : travelled
dropped : flaherty
dropped : disputes
dropped : finish
dropped : injuries
dropped : automobile
dropped : occurred
dropped : pacific
dropped : coast
dropped : rincon
dropped : southeast
dropped : reel
dropped : marizza
dropped : genannt
dropped : schmuggler-madonna
dropped : survives
dropped : leaves
dropped : en

In [59]:
df = df_backup.copy()

In [62]:
# On pondère les mots par leur nb d'occurence (en fait je suis juste en train de recoder TfidfVectorizer lol)
# pour renforcer l'importance des mots qui apparaissent peu
for key in df:
    df[key] = df[key]/df.sum(axis=0)[key]

In [64]:
#On clusterise
km = KMeans(n_clusters = 10,init="k-means++",max_iter=100,n_init=1,verbose=True)
X = df.loc[:,:]
km.fit(X)

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 554.746970605
start iteration
done sorting
end inner loop
Iteration 1, inertia 552.980226953
start iteration
done sorting
end inner loop
Iteration 2, inertia 551.279111963
start iteration
done sorting
end inner loop
Iteration 3, inertia 551.279111963
center shift 0.000000e+00 within tolerance 1.715990e-07


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=10, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [65]:
km.labels_

array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 4,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 7,
       7, 7, 7, 7, 7, 7, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])

In [67]:
#Ecriture dans un fichier
df_backup.to_csv("temp.csv",encoding='utf-8', sep=";")