#                           Предсказание кассовых сборов фильмов

## Часть 3: подбор параметров и обучение текстовой модели

In [1]:
%config IPCompleter.greedy=True

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle
import os

from utils import preprocess_text
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Считываем данные предобработанные в предыдущей части и выделим таргет, а также остальные фичи

In [3]:
df = pd.read_csv(os.path.join("data", "preprocessed_train.csv"))

In [4]:
df

Unnamed: 0.1,Unnamed: 0,belongs_to_collection,budget,popularity,runtime,revenue,release_year,description,Action,Adventure,...,original_language_ro,original_language_ru,original_language_sr,original_language_sv,original_language_ta,original_language_te,original_language_tr,original_language_ur,original_language_vi,original_language_zh
0,0,1,14000000,6.575393,93,4,2015,"When Lou, who has become the ""father of the In...",0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,40000000,8.248895,113,5,2004,Mia Thermopolis is now a college graduate and ...,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,3300000,64.299990,105,4,2014,"Under the direction of a ruthless instructor, ...",0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,1200000,3.174936,122,4,2012,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,1.148070,118,3,2009,Marine Boy is the story of a former national s...,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,0,0,9.853270,102,3,1994,Military men Rock Reilly and Eddie Devane are ...,0,0,...,0,0,0,0,0,0,0,0,0,0
2996,2996,0,0,3.727996,102,2,2013,Three girls in 1980s Stockholm decide to form ...,0,0,...,0,0,0,1,0,0,0,0,0,0
2997,2997,0,65000000,14.482345,120,5,1996,"Samantha Caine, suburban homemaker, is the ide...",1,0,...,0,0,0,0,0,0,0,0,0,0
2998,2998,0,42000000,15.725542,90,6,2004,Reuben Feffer is a guy who's spent his entire ...,0,0,...,0,0,0,0,0,0,0,0,0,0


Добавим обработку текстовых данных

In [5]:
df['description'] = df['description'].apply(preprocess_text)
df.drop(columns=['Unnamed: 0'], inplace=True)

Разделим сразу все данные на train и test

In [6]:
TEST_SIZE = 0.1
RANDOM_STATE = 42

train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)

Обучим текстовую модель на части train выборки, чтобы при обучении основной модели ей попадались и те элементы, которые видела текстовая модель при обучении, и те, которые она не видела

In [7]:
text_train_df, _ = train_test_split(train_df, test_size=0.5, random_state=69)

bow = CountVectorizer()
text_x_train = bow.fit_transform(text_train_df['description'])
text_y_train = text_train_df['revenue']

text_model = SVC(kernel='linear')
text_model.fit(text_x_train, text_y_train)

Добавим предсказания текстовой модели в качестве фичи для основной модели

In [8]:
main_x = df.drop(columns=['revenue', 'description'])
main_x['text_prediction'] = text_model.predict(bow.transform(df['description']))

main_y = df['revenue']

Добавим MinMax scaler

In [9]:
scaler = MinMaxScaler()
scaler.fit(main_x)
main_x = scaler.transform(main_x)

In [10]:
main_x_train, main_x_test, main_y_train, main_y_test = train_test_split(main_x, main_y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [11]:
_ , main_x_train, _, main_y_train = train_test_split(main_x_train, main_y_train, test_size=0.5, random_state=69) 

Обучим несколько различных моделей классификации с разными параметрами и будем считать ROC_AUC Score, чтобы определить их качество

In [12]:
model_list = [LogisticRegression(max_iter=2000), KNeighborsClassifier(n_neighbors=1), KNeighborsClassifier(n_neighbors=5), MultinomialNB(), DecisionTreeClassifier()]

In [13]:
def train_model_and_show_roc_aus_score(model, to_print = True):
    model.fit(main_x_train, main_y_train)
    y_pred = model.predict_proba(main_x_test)
    score = roc_auc_score(main_y_test, y_pred, average='weighted', multi_class='ovo')
    if to_print:
        print(f"{type(model).__name__} ROC_AUC Score is: {score:.4f}")
    return score

In [14]:
for model in model_list:
    train_model_and_show_roc_aus_score(model)

LogisticRegression ROC_AUC Score is: 0.7266
KNeighborsClassifier ROC_AUC Score is: 0.5784
KNeighborsClassifier ROC_AUC Score is: 0.6248
MultinomialNB ROC_AUC Score is: 0.6753
DecisionTreeClassifier ROC_AUC Score is: 0.5994


Также обучим Random_Forest_Classifier, подбирая параметры max_depth и n_estimators

In [15]:
best_model = None
best_score = 0
for m_depth in range(40,400,20):
    for n_est in range(200,1000,20):
        model_score = train_model_and_show_roc_aus_score(RandomForestClassifier(max_depth=m_depth, n_estimators=n_est), to_print=False)
        if best_score < model_score:
            best_score = model_score
            print(f"Random Forest with depth = {m_depth} and n_estimators = {n_est} has ROC_AUC score of: {model_score:.4f}")
            best_model = RandomForestClassifier(max_depth=m_depth,n_estimators=n_est)

Random Forest with depth = 40 and n_estimators = 200 has ROC_AUC score of: 0.8057
Random Forest with depth = 40 and n_estimators = 220 has ROC_AUC score of: 0.8109
Random Forest with depth = 40 and n_estimators = 280 has ROC_AUC score of: 0.8132
Random Forest with depth = 40 and n_estimators = 320 has ROC_AUC score of: 0.8144
Random Forest with depth = 40 and n_estimators = 400 has ROC_AUC score of: 0.8209
Random Forest with depth = 60 and n_estimators = 780 has ROC_AUC score of: 0.8211


По итогам побдора, наиболее эффективной оказалась указанная модель

In [26]:
best_model = RandomForestClassifier(max_depth=60, n_estimators=780)
train_model_and_show_roc_aus_score(best_model)
pickle.dump(best_model, open(os.path.join('models', 'prediction_model_text.pkl'), 'wb'))
pickle.dump(scaler, open(os.path.join('models', 'scaler_model_text.pkl'), 'wb'))
pickle.dump(text_model, open(os.path.join('models', 'text_model.pkl'), 'wb'))
pickle.dump(bow, open(os.path.join('models', 'bow.pkl'), 'wb'))

RandomForestClassifier ROC_AUC Score is: 0.8145
