#                           Предсказание кассовых сборов фильмов

## Часть 2: Подбор и обучение моделей

In [1]:
%config IPCompleter.greedy=True

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle


import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer

Считываем данные предобработанные в предыдущей части и выделим таргет, а также остальные фичи

In [3]:
df = pd.read_csv("data/preprocessed_train.csv")

In [4]:
df

Unnamed: 0.1,Unnamed: 0,belongs_to_collection,budget,popularity,runtime,revenue,release_year,description,Action,Adventure,...,original_language_ro,original_language_ru,original_language_sr,original_language_sv,original_language_ta,original_language_te,original_language_tr,original_language_ur,original_language_vi,original_language_zh
0,0,1,14000000,6.575393,93,4,2015,"When Lou, who has become the ""father of the In...",0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,40000000,8.248895,113,5,2004,Mia Thermopolis is now a college graduate and ...,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,3300000,64.299990,105,4,2014,"Under the direction of a ruthless instructor, ...",0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,1200000,3.174936,122,4,2012,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,1.148070,118,3,2009,Marine Boy is the story of a former national s...,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,0,0,9.853270,102,3,1994,Military men Rock Reilly and Eddie Devane are ...,0,0,...,0,0,0,0,0,0,0,0,0,0
2996,2996,0,0,3.727996,102,2,2013,Three girls in 1980s Stockholm decide to form ...,0,0,...,0,0,0,1,0,0,0,0,0,0
2997,2997,0,65000000,14.482345,120,5,1996,"Samantha Caine, suburban homemaker, is the ide...",1,0,...,0,0,0,0,0,0,0,0,0,0
2998,2998,0,42000000,15.725542,90,6,2004,Reuben Feffer is a guy who's spent his entire ...,0,0,...,0,0,0,0,0,0,0,0,0,0


Добавим текстовую обработку данных

In [5]:
nltk.download('stopwords')

def preprocess_text(text: str) -> str:
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    remove_extra_symb: list[str] = re.sub(r'[^\w^\s]+', '', str(text)).lower().split()
    return ' '.join([stemmer.stem(w) for w in remove_extra_symb if w not in stop_words])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df['description'] = df['description'].apply(preprocess_text)
df = df.drop(columns=['Unnamed: 0'])

In [7]:
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

bow = CountVectorizer()
x_train = bow.fit_transform(train_df['description'])
y_train = train_df['revenue']
x_test = bow.transform(test_df['description'])
y_test = test_df['revenue']

text_model = SVC(kernel='linear')
text_model.fit(x_train, y_train)

In [8]:
x_features = df.drop(['revenue', 'description'], axis=1)

In [9]:
nx_test = bow.transform(df['description'])
predicted = text_model.predict(nx_test)
x_features['text_prediction'] = predicted

In [10]:
scaler = MinMaxScaler()
scaler.fit(x_features)
x_features = scaler.transform(x_features)

In [11]:
y_df = df['revenue']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_df, train_size=0.9)

Обучим несколько различных моделей классификации с разными параметрами и будем считать ROC_AUC Score, чтобы определить их качество

In [13]:
model_list = [LogisticRegression(max_iter=2000), KNeighborsClassifier(n_neighbors=1), KNeighborsClassifier(n_neighbors=5), MultinomialNB(), DecisionTreeClassifier()]

In [14]:
def learn_model_and_show_roc_aus_score(model, to_print = True):
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_test)
    if to_print:
        print(f"{type(model).__name__} ROC_AUC Score is: {roc_auc_score(y_test,y_pred,average='weighted',multi_class='ovo'):.4f}")
    return roc_auc_score(y_test,y_pred,average='weighted',multi_class='ovo')

In [15]:
for model in model_list:
    learn_model_and_show_roc_aus_score(model)

LogisticRegression ROC_AUC Score is: 0.8957
KNeighborsClassifier ROC_AUC Score is: 0.6483
KNeighborsClassifier ROC_AUC Score is: 0.7917
MultinomialNB ROC_AUC Score is: 0.7036
DecisionTreeClassifier ROC_AUC Score is: 0.9228


Также обучим Random_Forest_Classifier, подбирая параметры max_depth и n_estimators

In [16]:
best_model = RandomForestClassifier(max_depth=60,n_estimators=850)
learn_model_and_show_roc_aus_score(best_model)
pickle.dump(model, open('models/prediction_model_text.pkl', 'wb'))
pickle.dump(scaler, open('models/scaler_model_text.pkl', 'wb'))
pickle.dump(scaler, open('models/bow.pkl', 'wb'))

RandomForestClassifier ROC_AUC Score is: 0.9732


In [None]:
best_model = None
best_score = 0
for m_depth in range(40,400,20):
    for n_est in range(200,1000,20):
        model_score = learn_model_and_show_roc_aus_score(RandomForestClassifier(max_depth=m_depth,n_estimators=n_est),to_print=False)
        if best_score < model_score:
            best_score = model_score
            print(f"Random Forest with depth = {m_depth} and n_estimators = {n_est} has ROC_AUC score of: {model_score:.4f}")

Random Forest with depth = 40 and n_estimators = 200 has ROC_AUC score of: 0.9707
Random Forest with depth = 40 and n_estimators = 220 has ROC_AUC score of: 0.9726
Random Forest with depth = 40 and n_estimators = 240 has ROC_AUC score of: 0.9759
Random Forest with depth = 40 and n_estimators = 560 has ROC_AUC score of: 0.9760
