#                           Предсказание кассовых сборов фильмов

## Часть 2: Подбор и обучение моделей

In [1]:
# Interactive convenience only: switch IPython's tab-completer to greedy mode.
%config IPCompleter.greedy=True

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

Считаем данные, предобработанные в предыдущей части, и выделим целевую переменную (таргет) и остальные признаки.

In [3]:
# Load the dataset prepared in the previous (preprocessing) part of the project.
df = pd.read_csv(filepath_or_buffer="data/preprocessed_train.csv")

In [4]:
# Feature matrix: every column of df except the target `revenue`.
x_features = df.drop(columns=['revenue'])

In [5]:
# Feature matrix without the target and without `description` (left out for now only).
# Built from `df` rather than from `x_features` so the cell can be re-run safely:
# dropping `description` from `x_features` a second time raises KeyError.
x_features = df.drop(columns=['revenue', 'description'])

In [6]:
# Min-max scale every feature column; `scaler` keeps the fitted parameters.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed further down, so test-set min/max values leak into training.
# Consider fitting it on the training rows only.
scaler = MinMaxScaler()
x_features = scaler.fit_transform(x_features)

In [7]:
# Target vector: the `revenue` column, used as class labels by the classifiers below.
y_df = df.loc[:, 'revenue']

In [8]:
# Keep 90% of the rows for training and hold out the rest for evaluation.
# A fixed random_state makes the split, and therefore every score reported
# below, reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    x_features, y_df, train_size=0.9, random_state=42
)

Обучим несколько моделей классификации с разными параметрами и для оценки их качества будем считать взвешенный (weighted) F1 Score.

In [9]:
# Candidate classifiers to compare; each one is fitted and scored in turn below.
model_list = [
    LogisticRegression(max_iter=2000),
    KNeighborsClassifier(n_neighbors=1),
    KNeighborsClassifier(n_neighbors=5),
    SVC(kernel='linear'),
    SVC(kernel='poly'),
    SVC(kernel='rbf'),
    MultinomialNB(),
    # Seeded so that repeated runs build the same tree and report the same score.
    DecisionTreeClassifier(random_state=42),
]

In [10]:
def learn_model_and_show_f1_score(model):
    """Fit ``model`` on the training split and print its weighted F1 score on the test split.

    The function reads the notebook-level variables ``x_train``, ``y_train``,
    ``x_test`` and ``y_test``; they must be defined before it is called.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is
            always called on it, including when it has been fitted before.

    Returns:
        None. The score is printed with four decimals, labelled with the
        estimator's repr rather than only its class name, so that several
        configurations of the same class can be told apart in the output.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    score = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
    print(f"{model!r} F1 Score is: {score:.4f}")

In [11]:
# Fit and score every candidate from model_list; one line is printed per model,
# in the same order as the list.
for model in model_list:
    learn_model_and_show_f1_score(model)

LogisticRegression F1 Score is: 0.3136
KNeighborsClassifier F1 Score is: 0.2293
KNeighborsClassifier F1 Score is: 0.2416
SVC F1 Score is: 0.3202
SVC F1 Score is: 0.2528
SVC F1 Score is: 0.2493
MultinomialNB F1 Score is: 0.2383
DecisionTreeClassifier F1 Score is: 0.2908


Также обучим RandomForestClassifier, подбирая параметры с помощью GridSearchCV.

In [12]:
# Hyper-parameter grid for the random forest search: every combination is
# tried, i.e. 5 * 2 * 8 = 80 candidate settings.
random_forest_classifier_params = dict(
    n_estimators=[5, 100, 500, 1000, 10000],
    criterion=['gini', 'entropy'],
    max_depth=[10, 100, 150, 200, 250, 300, 400, 500],
)

С помощью GridSearchCV подберём наилучшие параметры и обучим модель на них.

In [13]:
%%time
search = GridSearchCV(RandomForestClassifier(), random_forest_classifier_params)
search.fit(x_train, y_train)
search.best_estimator_

CPU times: total: 2h 20min 53s
Wall time: 2h 22min 21s


In [14]:
# Score the grid-search winner on the held-out test split.
# Note: the helper calls fit() again, so the forest is trained on x_train once more here.
best_forest = search.best_estimator_
learn_model_and_show_f1_score(best_forest)


RandomForestClassifier F1 Score is: 0.3465


In [26]:
# Manually chosen configuration (max_depth=50 is not part of the grid searched above).
# random_state is fixed so that the reported score can be reproduced.
# NOTE(review): these values appear to have been picked by comparing test-set
# scores, which makes the reported figure optimistic; confirm with cross-validation.
learn_model_and_show_f1_score(
    RandomForestClassifier(max_depth=50, n_estimators=1000, random_state=42)
)

RandomForestClassifier F1 Score is: 0.3654
