In [5]:
import pysrt
import pandas as pd
import numpy as np
import os
import nltk
import joblib
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import make_column_transformer
from scipy.sparse import hstack, csr_matrix
from nltk.stem import WordNetLemmatizer

In [6]:
movies_df = pd.read_excel(r'C:\Users\AB54\Desktop\Data\subtitles\movies_labels.xlsx')

subtitles_list = []
for index, row in movies_df.iterrows():
    filename = os.path.join('C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/', row['Movie'] + ".srt")
    if os.path.exists(filename):
        subs = pysrt.open(filename, encoding='windows-1252')
        text = ""
        for sub in subs:
            text += sub.text + " "
        subtitles_list.append(text)
    else:
        print(f"File not found: {filename}")
        movies_df.drop(index, inplace=True)

# Лемматизация
lemmatizer = WordNetLemmatizer()

subtitles_lemmatized = []
for subtitles in subtitles_list:
    words = nltk.word_tokenize(subtitles.lower())
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    subtitles_lemmatized.append(' '.join(lemmatized_words))

y = movies_df['Level']
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(subtitles_lemmatized)

# Преобразование матрицы с помощью TF-IDF
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)

# Разделение данных на тренировочную, валидационную и тестовую выборки
X_trainval, X_test, y_trainval, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=42)

File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/The Secret Life of Pets.en.srt
File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/Up (2009).srt
File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/Glass Onion.srt
File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/Matilda(2022).srt
File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/Bullet train.srt
File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/Thor: love and thunder.srt
File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/Lightyear.srt
File not found: C:/Users/AB54/Desktop/Data/subtitles/Subtitles_all/Subtitles/The Grinch.srt


In [8]:
models = {'Логистическая регрессия': LogisticRegression(),                    'K-ближайших соседей': KNeighborsClassifier(),  
          'Метод опорных векторов': SVC(),                     'Решающее дерево': DecisionTreeClassifier(),  
          'Случайный лес': RandomForestClassifier(),                     'Градиентный бустинг': GradientBoostingClassifier(),  
          'Нейронные сети': MLPClassifier(),                     'Мультиномиальный наивный Байес': MultinomialNB()} 
results = {'model': [], 'cv_mean_accuracy': [], 'cv_std_accuracy': [],             
           'train_accuracy': [], 'val_accuracy': [], 'train_time': [],           'n_features': [], 'n_samples': [], 'precision': [], 'recall': [], 'f1-score': []} 

for name, model in models.items(): 
    # кросс-валидация    
    start_time = time.time() 
    scores = cross_val_score(model, X_trainval, y_trainval, cv=5, scoring='accuracy')    
    end_time = time.time() 
    train_time = end_time - start_time     
    results['model'].append(name)    
    results['cv_mean_accuracy'].append(np.mean(scores)) 
    results['cv_std_accuracy'].append(np.std(scores))     
    # оценка качества модели на тренировочной и валидационной выборках        
    model.fit(X_train, y_train) 
    train_score = model.score(X_train, y_train)        
    val_score = model.score(X_val, y_val) 
    results['train_accuracy'].append(train_score) 
    results['val_accuracy'].append(val_score)     
   # оценка precision, recall, f1-score     
    y_pred = model.predict(X_val)    
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='micro')    
    results['precision'].append(precision) 
    results['recall'].append(recall)
    results['f1-score'].append(f1) 
        # оценка времени обучения, размерности входных данных 
    n_features = X.shape[1]    
    n_samples = X.shape[0] 
    results['train_time'].append(train_time) 
    results['n_features'].append(n_features)    
    results['n_samples'].append(n_samples) 
# создаем таблицу 
df_results = pd.DataFrame.from_dict(results)
df_results.set_index('model', inplace=True) 
df_results = df_results.round(4) 
display(df_results) 
# выводим лучшую модель 
best_model = df_results['val_accuracy'].idxmax()
print(f"\nЛучшая модель на валидационной выборке: {best_model}")

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0_level_0,cv_mean_accuracy,cv_std_accuracy,train_accuracy,val_accuracy,train_time,n_features,n_samples,precision,recall,f1-score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Логистическая регрессия,0.5105,0.0567,0.8129,0.4681,7.0308,28923,233,0.4681,0.4681,0.4681
K-ближайших соседей,0.3603,0.0228,0.4245,0.3404,8.4069,28923,233,0.3404,0.3404,0.3404
Метод опорных векторов,0.5754,0.0417,0.9424,0.5745,10.8393,28923,233,0.5745,0.5745,0.5745
Решающее дерево,0.5057,0.0393,1.0,0.4255,12.1703,28923,233,0.4255,0.4255,0.4255
Случайный лес,0.543,0.0455,1.0,0.4894,14.2689,28923,233,0.4894,0.4894,0.4894
Градиентный бустинг,0.6344,0.0715,1.0,0.5957,181.9885,28923,233,0.5957,0.5957,0.5957
Нейронные сети,0.597,0.0666,1.0,0.4894,457.4831,28923,233,0.4894,0.4894,0.4894
Мультиномиальный наивный Байес,0.4192,0.0485,0.446,0.3404,504.7342,28923,233,0.3404,0.3404,0.3404



Лучшая модель на валидационной выборке: Градиентный бустинг


In [10]:
param_grid = {
    'n_estimators': [30, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# задаем модель
model = GradientBoostingClassifier()

# применяем поиск по сетке с кросс-валидацией
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_trainval, y_trainval)

# выводим лучшие параметры и результаты кросс-валидации
print("Лучшие гиперпараметры: ", grid_search.best_params_)
print("Лучшее значение accuracy на кросс-валидации: ", grid_search.best_score_)




Лучшие гиперпараметры:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Лучшее значение accuracy на кросс-валидации:  0.6341394025604552


In [11]:
# обучаем модель на объединенной тренировочной и валидационной выборках с лучшими параметрами
model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=3)
model.fit(X_trainval, y_trainval)

# оцениваем качество модели на тестовой выборке
test_score = model.score(X_test, y_test)
y_pred = model.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')

# выводим метрики качества модели
print("Accuracy на тестовой выборке: {:.4f}".format(test_score))

Accuracy на тестовой выборке: 0.6596
