# Clasificación de películas usando las tramas

## Clasificación de géneros

In [0]:
import os
import random

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression

from IPython.display import HTML, display
from tabulate import tabulate

plt.style.use('ggplot')

In [0]:
wiki_plots = pd.read_csv("wiki_movie_plots_deduped.csv")
wiki_plots.rename(columns={'Origin/Ethnicity':'Origin'}, inplace=True) # para mayor comodidad
wiki_plots = wiki_plots.drop_duplicates(subset='Plot', keep='first')

### Reducción de los géneros

Se eliminan todos los géneros repetidos y se cambia el formato para multiples géneros a uno donde se escriben los nombres de los géneros separados por un "|"

In [0]:
"""
This code is taken from:https://www.kaggle.com/aminejallouli/genre-classification-based-on-wiki-movies-plots
and it's a little bit modified.
"""
wiki_plots['GenreCorrected'] =wiki_plots['Genre'] 
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.strip()
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' - ', '|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' / ', '|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('/', '|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' & ', '|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(', ', '|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('; ', '|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('bio-pic', 'biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biopic', 'biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biographical', 'biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biodrama', 'biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('bio-drama', 'biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biographic', 'biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(film genre\)', '')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('animated','animation')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('anime','animation')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('children\'s','children')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('comedey','comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\[not in citation given\]','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' set 4,000 years ago in the canadian arctic','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('historical','history')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('romantic','romance')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('3-d','animation')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('3d','animation')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('viacom 18 motion pictures','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('sci-fi','science_fiction')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('ttriller','thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('.','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('based on radio serial','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' on the early years of hitler','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('sci fi','science_fiction')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('science fiction','science_fiction')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' (30min)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('16 mm film','short')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\[140\]','drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\[144\]','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' for ','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('adventures','adventure')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('kung fu','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('kung-fu','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('martial arts','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('world war ii','war')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('world war i','war')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biography about montreal canadiens star|maurice richard','biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('bholenath movies|cinekorn entertainment','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(volleyball\)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('spy film','spy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('anthology film','anthology')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biography fim','biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('avant-garde','avant_garde')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biker film','biker')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('buddy cop','buddy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('buddy film','buddy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('comedy 2-reeler','comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('films','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('film','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biography of pioneering american photographer eadweard muybridge','biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('british-german co-production','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('bruceploitation','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('comedy-drama adaptation of the mordecai richler novel','comedy-drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('movies by the mob\|knkspl','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('movies','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('movie','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('coming of age','coming_of_age')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('coming-of-age','coming_of_age')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('drama about child soldiers','drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('(( based).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('(( co-produced).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('(( adapted).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('(( about).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('musical b','musical')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('animationchildren','animation|children')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' period','period')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('drama loosely','drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(aquatics|swimming\)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(aquatics|swimming\)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace("yogesh dattatraya gosavi's directorial debut \[9\]",'')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace("war-time","war")
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace("wartime","war")
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace("ww1","war")
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('unknown','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace("wwii","war")
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('psychological','psycho')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('rom-coms','romance')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('true crime','crime')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|007','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('slice of life','slice_of_life')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('computer animation','animation')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('gun fu','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('j-horror','horror')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(shogi|chess\)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('afghan war drama','war drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|6 separate stories','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(30min\)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' (road bicycle racing)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' v-cinema','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('tv miniseries','tv_miniseries')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|docudrama','\|documentary|drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' in animation','|animation')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('((adaptation).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('((adaptated).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('((adapted).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('(( on ).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('american football','sports')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dev\|nusrat jahan','sports')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('television miniseries','tv_miniseries')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(artistic\)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \|direct-to-dvd','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('history dram','history drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('martial art','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('psycho thriller,','psycho thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|1 girl\|3 suitors','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' \(road bicycle racing\)','')
filterE = wiki_plots['GenreCorrected']=="ero"
wiki_plots.loc[filterE,'GenreCorrected']="adult"
filterE = wiki_plots['GenreCorrected']=="music"
wiki_plots.loc[filterE,'GenreCorrected']="musical"
filterE = wiki_plots['GenreCorrected']=="-"
wiki_plots.loc[filterE,'GenreCorrected']=''
filterE = wiki_plots['GenreCorrected']=="comedy–drama"
wiki_plots.loc[filterE,'GenreCorrected'] = "comedy|drama"
filterE = wiki_plots['GenreCorrected']=="comedy–horror"
wiki_plots.loc[filterE,'GenreCorrected'] = "comedy|horror"
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(' ','|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace(',','|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('-','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actionadventure','action|adventure')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actioncomedy','action|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actiondrama','action|drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actionlove','action|love')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actionmasala','action|masala')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actionchildren','action|children')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('fantasychildren\|','fantasy|children')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('fantasycomedy','fantasy|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('fantasyperiod','fantasy|period')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('cbctv_miniseries','tv_miniseries')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dramacomedy','drama|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dramacomedysocial','drama|comedy|social')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dramathriller','drama|thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('comedydrama','comedy|drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dramathriller','drama|thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('comedyhorror','comedy|horror')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('sciencefiction','science_fiction')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('adventurecomedy','adventure|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('animationdrama','animation|drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|\|','|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('muslim','religious')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('thriler','thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('crimethriller','crime|thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('fantay','fantasy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actionthriller','action|thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('comedysocial','comedy|social')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('martialarts','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|\(children\|poker\|karuta\)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('epichistory','epic|history')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('erotica','adult')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('erotic','adult')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('((\|produced\|).+)','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('chanbara','chambara')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('comedythriller','comedy|thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biblical','religious')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biblical','religious')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('colour\|yellow\|productions\|eros\|international','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|directtodvd','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('liveaction','live|action')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('melodrama','drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('superheroes','superheroe')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('gangsterthriller','gangster|thriller')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('heistcomedy','comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('heist','action')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('historic','history')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('historydisaster','history|disaster')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('warcomedy','war|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('westerncomedy','western|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('ancientcostume','costume')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('computeranimation','animation')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dramatic','drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('familya','family')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('familya','family')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dramedy','drama|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('dramaa','drama')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('famil\|','family')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('superheroe','superhero')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('biogtaphy','biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('devotionalbiography','devotional|biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('docufiction','documentary|fiction')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('familydrama','family|drama')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('espionage','spy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('supeheroes','superhero')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('romancefiction','romance|fiction')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('horrorthriller','horror|thriller')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('suspensethriller','suspense|thriller')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('musicaliography','musical|biography')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('triller','thriller')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|\(fiction\)','|fiction')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('romanceaction','romance|action')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('romancecomedy','romance|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('romancehorror','romance|horror')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('romcom','romance|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('rom\|com','romance|comedy')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('satirical','satire')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('science_fictionchildren','science_fiction|children')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('homosexual','adult')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('sexual','adult')

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('mockumentary','documentary')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('periodic','period')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('romanctic','romantic')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('politics','political')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('samurai','martial_arts')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('tv_miniseries','series')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('serial','series')

filterE = wiki_plots['GenreCorrected']=="musical–comedy"
wiki_plots.loc[filterE,'GenreCorrected'] = "musical|comedy"

filterE = wiki_plots['GenreCorrected']=="roman|porno"
wiki_plots.loc[filterE,'GenreCorrected'] = "adult"


filterE = wiki_plots['GenreCorrected']=="action—masala"
wiki_plots.loc[filterE,'GenreCorrected'] = "action|masala"


filterE = wiki_plots['GenreCorrected']=="horror–thriller"
wiki_plots.loc[filterE,'GenreCorrected'] = "horror|thriller"

#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('family','children')
#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('martial_arts','action')
#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('horror','thriller')
#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('war','action')
#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('adventure','action')
#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('science_fiction','action')
#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('western','action')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('noir','black')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('spy','action')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('superhero','action')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('social','')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('suspense','thriller')


filterE = wiki_plots['GenreCorrected']=="drama|romance|adult|children"
wiki_plots.loc[filterE,'GenreCorrected'] = "drama|romance|adult"

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('\|–\|','|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.strip(to_strip='\|')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('actionner','action')
wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.strip()

def merge_similar(gn):
    gn_list = gn.split('|')
    gn_list = sorted(list(set(gn_list)))
    return '|'.join(gn_list)

wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].apply(merge_similar)
#wiki_plots['GenreCorrected']=wiki_plots['GenreCorrected'].str.replace('^$','unknown')

In [0]:
wiki_plots['GenreSplit'] = wiki_plots['GenreCorrected'].str.split('|')
wiki_plots['GenreSplit'] = wiki_plots['GenreSplit'].apply(np.array)

Eliminamos los generos que sean poco frecuentes, solo conservamos los 20 que mas ocurrencias tengan

In [5]:
genres = [genre for sublist in wiki_plots['GenreSplit'].values for genre in sublist]
genres = pd.DataFrame({'Counts':genres})['Counts'].value_counts().to_frame().reset_index().rename(columns={'index':'Genre'})
genres = genres[genres['Genre'] != ''].head(23)
genres = np.array(genres['Genre'])
genres

array(['drama', 'comedy', 'romance', 'action', 'thriller', 'crime',
       'horror', 'western', 'musical', 'science_fiction', 'animation',
       'adventure', 'family', 'war', 'fantasy', 'mystery', 'biography',
       'black', 'history', 'short', 'martial_arts', 'documentary',
       'sports'], dtype=object)

In [0]:
wiki_plots['GenreSplit'] = wiki_plots['GenreSplit'].apply(lambda x: x[np.in1d(x, genres)])
wiki_plots['GenreCorrected'] = wiki_plots['GenreSplit'].apply(lambda x: '|'.join(x))

Obtenemos los vectores de salida de una manera parecida al one hot encoding pero teniendo multiples unos en
cada vector

In [7]:
dummy_clases = wiki_plots.query('GenreCorrected != ""')['GenreSplit'].apply(lambda x: '|'.join(x)).str.get_dummies()
dummy_clases['drama'].values

array([0, 0, 0, ..., 0, 0, 0])

### Limpieza de las tramas

Ahora definimos una función para limpiar los textos de las tramas, primero se reemplazan todas las contracciones
en ingles por la forma completa y se eliminan los simbolos raros

In [0]:
def clean_text(text):
    """
    This function is taken from:
    https://www.kaggle.com/aminejallouli/genre-classification-based-on-wiki-movies-plots
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", "", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\[\d+\]", "", text)
    text = re.sub('[%s\d]' % re.escape("""!"#$%&'()*+,-.:;<=>?@[\]^`{|}~"""), "", text)
    text = re.sub(r"[-\n]", " ", text)
    text = text.strip(' ')
    return text

In [0]:
wiki_plots['FormattedPlot'] = wiki_plots['Plot'].apply(clean_text)

### Codificacion de los textos

Primero se divide el dataset en training y test en un 80/20, luego se tokenizan los textos y se lematizan
despues se usa una codificación de tfidf para poder usar los datos en un algoritmo de aprendizaje

In [0]:
data = wiki_plots.query('GenreCorrected != ""').reset_index()

X_train, X_test, y_train, y_test = train_test_split(data[['Title','FormattedPlot']].values, 
                                                    dummy_clases)

stemmer = WordNetLemmatizer()
for i in range(len(X_train)):
    X_train[i][1] = [stemmer.lemmatize(word) for word in X_train[i][1].split(' ')]
    X_train[i][1] = ' '.join(X_train[i][1])
    
for i in range(len(X_test)):
    X_test[i][1] = [stemmer.lemmatize(word) for word in X_test[i][1].split(' ')]
    X_test[i][1] = ' '.join(X_test[i][1])

tfidf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, stop_words='english')
X_train_vectorized = tfidf.fit_transform(X_train[:,1])
X_test_vectorized = tfidf.transform(X_test[:,1]).toarray()

### Definición de los clasificadores

Tenemos multiples posibles valores en la salida, en otras palabras una pélicula puede pertenecer a varios 
generos, por lo que no vamos a entrenar una softmax que solo prediga una clase, si no que que vamos a 
hacer un one vs rest para cada clase, de modo que para cada genero vamos a calcular la probabilidad de 
que una pélicula pertenesca a este, teniendo asi un clasificador para cada clase.

Se usaran SVM lineales por que son buenos clasificando textos y generalizando

In [0]:
def train_clasifiers(x, y, clases):
    clasifiers = []
    for c in clases:
        clf = OneVsRestClassifier(LinearSVC(random_state=0, max_iter=25000))
        clf.fit(x, y[c].values)
        clasifiers.append(clf)
    return clasifiers

def predict_genres(clasifiers, x, clases):
    pred_clases = []
    for i, clf in enumerate(clasifiers):
        if clf.predict([x])[0] == 1:
            pred_clases.append(clases[i])
    return pred_clases

def predict_vector(clasifiers, x):
    pred = np.zeros((len(clasifiers)))
    for i, c in enumerate(clasifiers):
        pred[i] = c.predict([x])[0]
    return pred

def evaluate_models(clasifiers, X_test, y_test, clases):
    score = np.zeros((len(clases)))
    for i, c in enumerate(clases):
        score[i] = accuracy_score(clasifiers[i].predict(X_test), y_test[c].values)
    return score

def evalueate_model_strict(clasifiers, X_test, y_test):
    acc = 0
    for x, y in zip(X_test, y_test):
        pred_vec =  predict_vector(clasifiers, x)
        if np.array_equal(pred_vec, y):
            acc += 1
    return acc/len(X_test)

In [0]:
clasifiers = train_clasifiers(X_train_vectorized, y_train, genres)

Hacemos una pequeña prueba con algunas pélicula

In [23]:
print("Algunas péliculas predichas:")
table_names = ["Pélicula", "Predicción", "Real"]
table = []
for i in random.choices(list(range(0,1000)), k=25):
    row = [X_test[i][0]]
    row.append('|'.join(predict_genres(clasifiers, X_test_vectorized[i], genres)))
    row.append( data.loc[i, 'GenreCorrected'])
    table.append(row)
display_table = pd.DataFrame(table, columns=table_names)
display_table

Algunas péliculas predichas:


Unnamed: 0,Pélicula,Predicción,Real
0,Kitty Kornered,animation,comedy
1,Head in the Clouds,thriller,drama
2,Brown Sugar,,comedy
3,Delavine Affair,crime,comedy|romance
4,Red Salute,comedy,comedy
5,Take the Lead,drama,drama
6,Thalapathi,,drama
7,Vertigo,drama,comedy
8,Hochchheta ki,,crime
9,Guns of Darkness,,horror


Calculamos la precisón en el test set

In [22]:
acc = evaluate_models(clasifiers, X_test_vectorized, y_test, genres)
acc_table = pd.DataFrame(list(zip(genres, acc)), columns=['Género', 'Precisión'])
acc_table

Unnamed: 0,Género,Precisión
0,drama,0.719324
1,comedy,0.796415
2,romance,0.912416
3,action,0.923055
4,thriller,0.929612
5,crime,0.939959
6,horror,0.960799
7,western,0.984844
8,musical,0.966045
9,science_fiction,0.976537


Precisión media

In [15]:
print(np.mean(acc))

0.9528594781594922


Calculamos la precisión de que prediga exactamente todos los géneros de una pelicula

In [35]:
print(evalueate_model_strict(clasifiers, X_test_vectorized, y_test.values))

0.00816088603905567


## Regresión lineal para predecir el año de salida de una pélicula

Primero preparamos los datos

In [0]:
X_train, X_test, y_train, y_test = train_test_split(wiki_plots[['Title','FormattedPlot']].values, 
                                                    wiki_plots['Release Year'].values)

stemmer = WordNetLemmatizer()
for i in range(len(X_train)):
    X_train[i][1] = [stemmer.lemmatize(word) for word in X_train[i][1].split(' ')]
    X_train[i][1] = ' '.join(X_train[i][1])
    
for i in range(len(X_test)):
    X_test[i][1] = [stemmer.lemmatize(word) for word in X_test[i][1].split(' ')]
    X_test[i][1] = ' '.join(X_test[i][1])

tfidf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, stop_words='english')
X_train_vectorized = tfidf.fit_transform(X_train[:,1])
X_test_vectorized = tfidf.transform(X_test[:,1]).toarray()

Entrenando el modelo

In [11]:
model = LinearRegression()
model.fit(X_train_vectorized, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Haciendo algunas predicciones

In [16]:
print("Algunas predicciones:")
table_names = ["Pélicula", "Predicción", "Real"]
table = []
for i in random.choices(list(range(0,1000)), k=15):
    row = [X_test[i][0]]
    row.append(int(model.predict([X_test_vectorized[i]])[0]))
    row.append(y_test[i])
    table.append(row)
pd.DataFrame(table, columns=table_names)

Algunas predicciones:


Unnamed: 0,Pélicula,Predicción,Real
0,A Doll's House,1976,1973
1,Adventure of the King,2020,2010
2,The Shoes of the Fisherman,2031,1968
3,There's a Girl in My Soup,1949,1970
4,La Bohème,1946,1926
5,A Simple Plan,1990,1998
6,Chicken Run,1989,2000
7,"Murder, Inc.",1951,1960
8,Home to Stay,1947,1978
9,The Number 23,1995,2007


In [17]:
pred = model.predict(X_test_vectorized)
print(mean_squared_error(y_test, pred))

526.668596160757
