#                           Предсказание кассовых сборов фильмов

Для предсказания будем использовать датасет с Каггла: TMDB Box Office Prediction - https://www.kaggle.com/c/tmdb-box-office-prediction

In [1]:
%config IPCompleter.greedy=True

In [2]:
# !pip install -r requirements

Загрузим все необходимые библиотеки

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
import re

In [4]:
# Load the training table from the local data/ directory into a DataFrame.
df = pd.read_csv("data/train.csv")

In [5]:
# Preview the first rows of the freshly loaded, unprocessed table.
df.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


## Часть 1: предобработка данных и подготовка к обучению

In [6]:
def is_int(s):
    """Report whether ``int(s)`` succeeds.

    Returns True when the conversion works and False when it raises
    ValueError. Any other exception raised by ``int`` (for example
    TypeError for ``None``) is not caught and propagates to the caller.
    """
    try:
        int(s)
        return True
    except ValueError:
        return False

In [7]:
def parse_string_line_list(string):
    """Collect every run of consecutive digit characters in *string* as ints.

    The text is scanned left to right; each maximal run of characters for
    which ``str.isdigit()`` is true is converted with ``int`` and appended
    to the result in order of appearance.

    NOTE: every digit run is captured, not only the values of ``'id'`` keys,
    so digits that occur inside other fields of the text are returned too.

    Args:
        string: Text to scan (e.g. the stringified list of dicts of a column).

    Returns:
        List of ints, one per convertible digit run; empty if there is none.
    """
    result = []

    def _append_run(run):
        # str.isdigit() accepts characters such as superscript digits that
        # int() rejects; such runs are skipped instead of raising.
        if not run:
            return
        try:
            result.append(int(run))
        except ValueError:
            pass

    current_number = ""
    for x in string:
        if x.isdigit():
            current_number += x
        else:
            _append_run(current_number)
            current_number = ""
    # A run that reaches the end of the string goes through the same guarded
    # conversion as runs terminated by a non-digit character.
    _append_run(current_number)
    return result

In [8]:
def first_actual_number_in_string(string):
    """Return the first run of consecutive digits in *string* as an int.

    Args:
        string: Text to scan from left to right.

    Returns:
        The integer value of the first digit run, or 0 when the text
        contains no digit at all.
    """
    current_number = ""
    for x in string:
        if x.isdigit():
            current_number += x
        elif current_number:
            # The first non-digit after a digit run ends the number.
            break
    # Also covers a number that runs to the very end of the string, which
    # would otherwise be dropped and reported as 0.
    return int(current_number) if current_number else 0

Заменим все значения NaN на нули и применим к колонкам разного типа соответствующий парсер.

In [9]:
# Missing entries (NaN / -inf) become the text "0" so that both parsers always
# receive a string; genres turn into a list of numbers, the collection into a
# single number.
df["genres"] = (
    df["genres"]
    .replace([np.nan, -np.inf], "0")
    .apply(parse_string_line_list)
)
df["belongs_to_collection"] = (
    df["belongs_to_collection"]
    .replace([np.nan, -np.inf], "0")
    .apply(first_actual_number_in_string)
)

Удалим ненужные колонки (пока что считаем, что название фильма нерелевантно)

In [10]:
# Remove the columns that are not used as features, in the listed order.
for unused_column in ('homepage', 'imdb_id', 'original_title',
                      'title', 'overview', 'poster_path'):
    del df[unused_column]

In [11]:
def tagline_transform(tagline):
    """Map a tagline cell to a presence flag.

    Returns 0 when the value compares equal to 0 (the placeholder used for
    a missing tagline) and 1 for every other value.
    """
    return 0 if tagline == 0 else 1

Пока что добавим индикатор наличия tagline*, но, скорее всего, нам это не пригодится

*\* tagline — рекламный слоган, сопровождающий картину*

In [12]:
# Replace missing taglines (NaN / -inf) with 0, then reduce the column to a
# 0/1 flag via tagline_transform.
df["tagline"] = (
    df["tagline"]
    .replace([np.nan, -np.inf], 0)
    .apply(tagline_transform)
)

Все фильмы уже выпущены, так что статус тоже можно удалить

In [13]:
# Drop the 'status' column from df.
del df['status']

Ключевые слова и кинокомпании-создатели можно обработать тем же парсером, что и жанры

In [14]:
# Same treatment as for genres: missing entries (NaN / -inf) become the text
# "0", then every cell is turned into a list of numbers.
df["Keywords"] = (
    df["Keywords"]
    .replace([np.nan, -np.inf], "0")
    .apply(parse_string_line_list)
)
df["production_companies"] = (
    df["production_companies"]
    .replace([np.nan, -np.inf], "0")
    .apply(parse_string_line_list)
)

Языки как категориальный признак заменим на их id

In [15]:
# Maps each language seen so far to its integer id. The reserved key 'value'
# holds the next id to hand out (ids start at 1).
language_code_dict = {'value': 1}
def decategorise_language(lang):
    """Return the integer id of *lang*, assigning the next free id on first use.

    Repeated calls with the same language return the same id. The counter is
    stored in the same dict under the key 'value', so the literal input
    'value' returns the counter instead of receiving an id of its own.
    """
    if lang not in language_code_dict:
        language_code_dict[lang] = language_code_dict['value']
        language_code_dict['value'] += 1
    return language_code_dict[lang]

In [16]:
# Replace every original-language value with the integer id returned by
# decategorise_language.
df["original_language"] = df["original_language"].apply(decategorise_language)

Из даты выпуска оставим только год, так как остальное будет пренебрежимо слабо влиять на предсказание

In [17]:
def date_release_to_year(date):
    """Convert a date string ending in a two-digit year to a four-digit year.

    Only the last two characters of *date* are read. Values above 20 are
    placed in the 1900s, values from 0 to 20 in the 2000s.
    """
    two_digit_year = int(date[-2:])
    century = 1900 if two_digit_year > 20 else 2000
    return century + two_digit_year

In [18]:
# Overwrite release_date with the four-digit year computed by
# date_release_to_year.
df["release_date"] = df["release_date"].apply(date_release_to_year)

In [19]:
def parse_language_list(language_string):
    """Return the integer ids of all language codes found in *language_string*.

    Every occurrence of the key text iso_639_1 is located, the two
    characters of its value are cut out and passed to decategorise_language.
    Ids are returned in order of appearance.
    """
    prefix_len = len("'iso_639_1': '")
    codes = []
    for match in re.finditer('iso_639_1', language_string):
        # The pattern has no opening quote, so the match starts one character
        # after the quote counted in the prefix; hence the -1 shift.
        begin = match.start() + prefix_len - 1
        codes.append(decategorise_language(language_string[begin:begin + 2]))
    return codes

In [20]:
# Missing entries (NaN / -inf) are replaced by a stub carrying the placeholder
# code '00', so the parser yields a one-element list for them.
df["spoken_languages"] = (
    df["spoken_languages"]
    .replace([np.nan, -np.inf], "'iso_639_1': '00' ")
    .apply(parse_language_list)
)

Похожим на обработку языков способом обработаем остальные списковые данные

In [21]:
# Maps each country seen so far to its integer id. The reserved key 'value'
# holds the next id to hand out (ids start at 0).
country_code_dict = {'value': 0}
def decategorise_country(country):
    """Return the integer id of *country*, assigning the next free id on first use.

    Repeated calls with the same country return the same id. The counter is
    stored in the same dict under the key 'value', so the literal input
    'value' returns the counter instead of receiving an id of its own.
    """
    if country not in country_code_dict:
        country_code_dict[country] = country_code_dict['value']
        country_code_dict['value'] += 1
    return country_code_dict[country]

In [22]:
def parse_country_list(country_string):
    """Return the integer ids of all country codes found in *country_string*.

    For every occurrence of the quoted key 'iso_3166_1', the two characters
    that follow the key prefix are cut out and passed to
    decategorise_country. Ids are returned in order of appearance.
    """
    prefix_len = len("'iso_3166_1': '")
    return [
        decategorise_country(
            country_string[found.start() + prefix_len:found.start() + prefix_len + 2]
        )
        for found in re.finditer("'iso_3166_1'", country_string)
    ]

In [23]:
# Missing entries (NaN / -inf) are replaced by a stub carrying the placeholder
# code '00', so the parser yields a one-element list for them.
df["production_countries"] = (
    df["production_countries"]
    .replace([np.nan, -np.inf], "'iso_3166_1': '00' ")
    .apply(parse_country_list)
)

In [24]:
def parse_cast_list(cast_string):
    """Return the numbers that follow each quoted 'id' key in *cast_string*.

    The pattern includes the opening quote, so keys that merely end in
    ``_id`` are not matched. For every match, the first number in the
    remainder of the text after ``'id':`` is taken, in order of appearance.
    """
    skip = len("'id':")
    return [
        first_actual_number_in_string(cast_string[found.start() + skip:])
        for found in re.finditer("'id'", cast_string)
    ]

In [25]:
# Missing cast entries (NaN / -inf) are replaced by a stub with id 0, then
# every cell is reduced to its list of ids.
df["cast"] = (
    df["cast"]
    .replace([np.nan, -np.inf], "'id': 0 ")
    .apply(parse_cast_list)
)

In [26]:
# Crew cells have the same layout as cast cells, so the same stub and parser
# are reused.
df["crew"] = (
    df["crew"]
    .replace([np.nan, -np.inf], "'id': 0 ")
    .apply(parse_cast_list)
)

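Заметим, что отсутствие исходных данных кодируется в разных колонках по-разному: в колонках с числовыми id (belongs_to_collection, genres, Keywords, production_companies, cast, crew) — значением 0, а в колонках языков и стран — кодом, который получает заглушка '00'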

In [27]:
# Preview the first rows again to inspect the result of the preprocessing.
df.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,original_language,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,tagline,Keywords,cast,crew,revenue
0,1,313576,14000000,[35],1,6.575393,"[4, 60, 8411]",[0],2015,93.0,[1],1,"[4379, 9663, 11830, 179431]","[52997, 64342, 54729, 36801, 54812, 94098, 115...","[1449071, 3227, 347335, 347335, 57822, 1451395...",12314651
1,2,107674,40000000,"[35, 18, 10751, 10749]",1,8.248895,[2],[0],2004,113.0,[1],1,"[2505, 4263, 6038, 13072]","[1813, 5823, 1210, 655, 33656, 62064, 68287, 1...","[1201, 1214, 4500, 8851, 12970, 38415, 38416, ...",95149435
2,3,0,3300000,[18],1,64.29999,"[2266, 3172, 32157]",[0],2014,105.0,[1],1,"[1416, 1523, 1640, 2176, 14512, 14819, 33896, ...","[996701, 18999, 129104, 970216, 223012, 159366...","[494, 6745, 9250, 23783, 23786, 53181, 52443, ...",13092000
3,4,0,1200000,"[53, 18]",2,3.174936,[0],[1],2012,122.0,"[1, 2]",0,"[10092, 10540, 11734, 14536, 14636, 208364, 22...","[35068, 85047, 1021524, 1093644, 86033, 92686,...","[95510, 95510, 95510]",16000000
4,5,0,0,"[28, 53]",3,1.14807,[0],[2],2009,118.0,[3],0,[0],"[84751, 64453, 84752, 1130534]","[84749, 84749]",3923970


Колонка revenue является нашим таргетом

In [28]:
# Separate the prediction target (revenue) from the feature columns; df itself
# is left unchanged.
target = df["revenue"]
data = df.drop(columns="revenue")