In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as skl
from sklearn import tree
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import scipy.sparse as scsp

In [2]:
filename = 'Cleaned_Data.csv'
data = pd.read_csv(filename)

In [3]:
data3 = pd.read_csv('Movie_Data3.csv')

In [4]:
data3 = data3[~((data3['Genre'].isnull()) | (data3['Storyline'].isnull()))]

In [5]:
data3 = data3.reset_index(drop=True)

In [6]:
# Tạo danh sách từ phổ thông
stopwords_list = stopwords.words('english')
stopwords_list.append(" ")
stopwords_list.append("")
stopwords_list.extend([chr(i) for i in range(ord('a'), ord('z') + 1)])

def split_into_words_and_remove_stopwords(text, stopwords):
    # Split text using various separators (space, period, comma)
    words = re.split(r'\s+|[,\.:!?"&]', text)
    # Remove specified stopwords from the list
    words = [word.lower() for word in words if word.lower() not in stopwords]
    return ' '.join(words)

# Tách + xoá các từ phổ thông
data['New Storyline'] = data['Storyline'].apply(lambda x: split_into_words_and_remove_stopwords(x, stopwords_list))

# Làm việc với tập test
data3['New Storyline'] = data3['Storyline'].apply(lambda x: split_into_words_and_remove_stopwords(x, stopwords_list))

In [11]:
def VietAnh(genre):
    # Tạo biến mục tiêu
    data['Target'] = data['Genre'].str.contains(genre)

    # Tách hai phần True và False
    data_true = data[data['Target']].reset_index(drop=True)
    data_false = data[~data['Target']].reset_index(drop=True)

    # Chia Train và Test
    # LÀM VIỆC BÊN NGOÀI
    n_true = int(data_true.shape[0] * 0.7)
    choice_true = np.random.choice(data_true.shape[0], n_true, replace=False)
    n_false = int(data_false.shape[0] * 0.7)
    choice_false = np.random.choice(data_false.shape[0], n_false, replace=False)

    data_true_train = data_true[data_true.index.isin(choice_true)]
    data_false_train = data_false[data_false.index.isin(choice_false)]
    data_full_train = pd.concat([data_true_train, data_false_train]).sample(frac=1).reset_index(drop=True)

    data_true_test = data_true[~data_true.index.isin(choice_true)]
    data_false_test = data_false[~data_false.index.isin(choice_false)]
    data_full_test = pd.concat([data_true_test, data_false_test]).sample(frac=1).reset_index(drop=True)

    data_train = data_full_train[data_full_train.columns[~data_full_train.columns.isin(['Target'])]]
    y_train = data_full_train['Target']

    data_test = data_full_test[data_full_test.columns[~data_full_test.columns.isin(['Target'])]]
    y_test = data3['Genre'].str.contains(genre)

    # Mã hoá Storyline
    tfidf_vectorizer = TfidfVectorizer()
    X_train = tfidf_vectorizer.fit_transform(data_train['New Storyline'])
    X_test = tfidf_vectorizer.transform(data3['New Storyline'])

    # Tách True và False trong bộ Train
    X_true_train = X_train[y_train]
    X_false_train = X_train[~y_train]

    index_true = pd.DataFrame({'True': np.arange(X_true_train.shape[0])})
    index_false = pd.DataFrame({'False': np.arange(X_false_train.shape[0])})

    # Chia bộ Train thành 20 phần
    choices = []
    row_num = int(data_false.shape[0] / data_true.shape[0]) + 1
    list_choice = np.arange(X_false_train.shape[0])
    n = X_false_train.shape[0] // row_num
    for i in range(row_num-1):
        choice = np.random.choice(list_choice, n, replace=False)
        choices.append(choice)
        list_choice = list_choice[~np.isin(list_choice, choice)]
    choices.append(np.array(list_choice))

    # Huấn luyện 20 cái mô hình
    n = X_true_train.shape[0]
    models = []
    for choice in choices:
        choice_true = np.random.choice(n, 3*n//4, replace=False)
        choice_false = choice

        index_true_train = index_true['True'].isin(choice_true)
        index_false_train = index_false['False'].isin(choice_false)

        X_true_train_choice = X_true_train[index_true_train]
        X_false_train_choice = X_false_train[index_false_train]

        X_full_train = scsp.vstack((X_true_train_choice, X_false_train_choice))
        X_train = X_full_train

        y_true = pd.DataFrame({'True': np.array([True] * X_true_train_choice.shape[0])})
        y_false = pd.DataFrame({'True': np.array([False] * X_false_train_choice.shape[0])})
        y_train = pd.concat([y_true, y_false])
        y_train = 2 * y_train['True'] - 1
        
        model = svm.SVC(kernel='linear')
        model.fit(X_train, y_train)
        models.append(model)

    # Chạy thử tập Test
    y_hat = models[0].predict(X_test)
    for i in range(1, row_num):
        y_pre = 2 * (models[i].predict(X_test)) - 1
        y_hat += y_pre
    y_hat = (y_hat >= 0)
    
    return {
        'Accuracy' : skl.metrics.accuracy_score(y_test, y_hat),
        'Recall' : skl.metrics.recall_score(y_test, y_hat),
        'Confusion Matrix' : pd.crosstab(y_hat, y_test, rownames=['Predicted'])
    }

In [12]:
list_genre = np.array(list(set(', '.join(data['Genre']).split(', '))))

In [13]:
for genre in list_genre:
    print(genre, data3['Genre'].str.contains(genre).sum())

History 10
Music 30
Family 63
Sport 13
Action 144
Mystery 37
Drama 321
Fantasy 50
Horror 61
Comedy 241
Biography 20
Thriller 98
Adventure 97
Musical 10
Animation 49
Romance 88
Documentary 33
Western 1
News 1
Sci-Fi 45
War 6
Crime 118


In [None]:
genre_result = {}
for genre in list_genre:
    genre_result[genre] = VietAnh(genre)
    print(genre)

History
Music
Family
Sport
Action
Mystery
Drama
Fantasy


In [None]:
for i in genre_result.items():
    print(i[0], i[1]['Accuracy'], i[1]['Recall'])

In [None]:
data2 = pd.DataFrame(genre_result)

data3 = pd.DataFrame({'Accuracy': data2.iloc[0],
                      'Recall' : data2.iloc[1]})

data_melt = data3.reset_index().melt(id_vars='index',
                       value_vars=['Accuracy', 'Recall'],
                       var_name='Score',
                       value_name='Value')
data_groupby = data_melt.groupby(['index', 'Score']).mean().reset_index()

sns.barplot(data_groupby, x='index', y='Value', hue='Score')
plt.title('Score of Each Genre')
plt.xticks(rotation=90)
figure = plt.gcf() # get current figure
figure.set_size_inches(15, 10)
# when saving, specify the DPI
plt.savefig("Score of Each Genre.jpg", dpi = 200);
# plt.savefig('/kaggle/working/Score of Each Genre.jpg');