In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import gensim
from sklearn.model_selection import train_test_split

### Load Data

In [73]:
# Read the pre-processed corpus and discard every row that has a missing field.
df_data = pd.read_csv('data_processed.csv').dropna()

# Learn the genre -> integer mapping first, then store the encoded label in a
# separate `class` column so the readable `genre` column stays available.
le = preprocessing.LabelEncoder().fit(df_data['genre'])
df_data['class'] = le.transform(df_data['genre'])
df_data

Unnamed: 0,title,genre,summary,clean_text,clean_title,class
0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drown wednesday first trustee among morrow day...,drown wednesday,1
1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book open jason awakens school bus unable reme...,lose hero,1
2,Thendara House,fantasy,The novel concerns the dwelling of the Darkov...,novel concern dwell darkovan order renunciates...,thendara house,1
3,The Thief,fantasy,"Gen is released from prison by the magus, the...",gen release prison magus king scholar magus fi...,thief,1
4,The Sweet Far Thing,fantasy,The prologue begins with two men who are sear...,prologue begin two men search river london thr...,sweet far thing,1
...,...,...,...,...,...,...
2995,The Time Traveler's Wife,fantasy,This is the extraordinary love story of Clare ...,extraordinary love story clare henry meet clar...,time traveler wife,1
2996,Fantastic Beasts and Where to Find Them: The O...,fantasy,J.K. Rowling's screenwriting debut is captured...,j k rowling screenwriting debut capture exciti...,fantastic beast find original screenplay,1
2997,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begin find ...,charlie chocolate factory,1
2998,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rise love dimitri dimitri might love tasha mas...,frostbite,1


In [74]:
# Genre names known to the fitted encoder; a genre's position in this array is
# the integer stored for it in the `class` column.
le.classes_

array(['crime', 'fantasy', 'history', 'horror', 'science', 'thriller'],
      dtype=object)

In [75]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# genre proportions the same in both splits; the fixed seed makes the split
# repeatable.
split_features = df_data[['clean_text', 'clean_title']]
split_labels = df_data['class']
X_train, X_test, y_train, y_test = train_test_split(
    split_features,
    split_labels,
    test_size=0.2,
    random_state=42,
    stratify=split_labels,
)

### CountVectorizer

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts. The vocabulary is learned from the training split only
# and then reused for the test split, so no test-set words leak into the
# feature space.
#
# The matrices are deliberately left in scipy sparse form: densifying a
# documents x vocabulary count matrix wastes memory, and MultinomialNB accepts
# sparse input directly (the TF-IDF features in this notebook are handled the
# same way).
count_vect = CountVectorizer()
X_train_countvec = count_vect.fit_transform(X_train['clean_text'])
X_test_countvec = count_vect.transform(X_test['clean_text'])

In [77]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the raw term counts.
clf = MultinomialNB()
clf.fit(X_train_countvec, y_train)

# Predicted class ids for the held-out documents.
y_pred = clf.predict(X_test_countvec)

In [78]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Overall accuracy of the count-based model on the held-out split, followed by
# the confusion matrix (rows: true class, columns: predicted class).
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy is {}'.format(test_accuracy))
confusion_matrix(y_test, y_pred)

Accuracy is 0.7212020033388982


array([[74,  0,  6,  9,  1, 10],
       [ 2, 61, 13, 15,  5,  4],
       [ 3,  2, 83,  6,  1,  5],
       [ 6,  3,  3, 71,  4, 12],
       [ 0,  1,  5, 11, 80,  3],
       [15,  1,  6, 13,  2, 63]], dtype=int64)

### TF-IDF

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted bag of words. As with the counts, the vocabulary and IDF
# weights are fitted on the training documents only and then applied to the
# test documents.
tfidf_vector = TfidfVectorizer()
train_docs = X_train['clean_text'].tolist()
test_docs = X_test['clean_text'].tolist()
X_train_tfidf = tfidf_vector.fit_transform(train_docs)
X_test_tfidf = tfidf_vector.transform(test_docs)

In [80]:
from sklearn.naive_bayes import MultinomialNB

# Same classifier as before, now trained on the TF-IDF features.
# NOTE: `clf` and `y_pred` are rebound here and replace the count-based results.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

In [81]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Held-out accuracy of the TF-IDF model, followed by the confusion matrix
# (rows: true class, columns: predicted class).
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy is {}'.format(test_accuracy))
confusion_matrix(y_test, y_pred)

Accuracy is 0.7395659432387313


array([[80,  0,  5,  4,  0, 11],
       [ 3, 70, 11,  9,  4,  3],
       [ 2,  3, 86,  2,  1,  6],
       [ 7,  4,  5, 57,  4, 22],
       [ 0,  1,  6,  9, 81,  3],
       [ 9,  1,  6, 14,  1, 69]], dtype=int64)

### Word2Vec

In [167]:
import gensim
from nltk import word_tokenize, sent_tokenize

# Tokenise each training document; every document is fed to Word2Vec as one
# "sentence". Only the training split is used so the embeddings never see
# test text.
sent_words = [word_tokenize(doc) for doc in X_train['clean_text']]

# Skip-gram (sg=1) embeddings with both hierarchical softmax (hs=1) and
# negative sampling (negative=3) enabled.
# seed + workers=1 make training repeatable: with several worker threads the
# update order depends on OS scheduling, so the vectors -- and the accuracy
# reported further down -- would change from run to run. (Repeatability across
# interpreter launches may additionally need PYTHONHASHSEED to be fixed.)
model = gensim.models.Word2Vec(
    sent_words,
    sg=1,
    hs=1,
    negative=3,
    vector_size=100,
    window=3,
    min_count=3,
    sample=0.001,
    epochs=5,
    seed=42,
    workers=1,
)

# Persist only the word vectors, in the plain-text word2vec format.
model.wv.save_word2vec_format('./word2vec_model.txt', binary=False)

In [132]:
def average_word_vectors(text, model, num_features, tokenizer=None):
    """Return the mean embedding of the in-vocabulary words of ``text``.

    Parameters
    ----------
    text : str
        Document to embed.
    model : mapping-like
        Word-vector lookup supporting ``word in model`` and ``model[word]``,
        e.g. gensim ``KeyedVectors`` or a plain ``dict``. The vocabulary check
        is made against this argument, not against any notebook-level global.
    num_features : int
        Length of each word vector (and of the returned vector).
    tokenizer : callable, optional
        Function mapping ``text`` to an iterable of tokens. Defaults to
        ``nltk.word_tokenize``.

    Returns
    -------
    numpy.ndarray
        float64 vector of shape ``(num_features,)``. It is all zeros when no
        token of ``text`` is found in ``model``.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    total = np.zeros((num_features,), dtype='float64')
    n_hits = 0
    for word in tokenizer(text):
        # Ask the model directly instead of materialising a set of the whole
        # vocabulary on every call.
        if word in model:
            total += model[word]
            n_hits += 1

    # Avoid dividing by zero for documents without any known word.
    return total / n_hits if n_hits else total


def text2vector(df, w2v_model, num_features):
    """Embed every document in ``df['clean_text']`` as an averaged word vector.

    Each entry of the ``clean_text`` column is passed to
    ``average_word_vectors`` together with ``w2v_model`` and ``num_features``;
    the per-document vectors are stacked, in row order, into one numpy array.
    Used to build the feature matrices for both the train and the test split.
    """
    return np.array([
        average_word_vectors(doc_text, w2v_model, num_features)
        for doc_text in df['clean_text']
    ])


# Width of one embedding; must agree with the `vector_size` used when the
# Word2Vec model was trained.
num_features = 100

# Reload the word vectors that the training cell exported in text format.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    'word2vec_model.txt',
    binary=False,
)

In [165]:
from sklearn import svm

# Averaged word-vector features for both splits.
X_train_w2v = text2vector(X_train, w2v_model, num_features)
X_test_w2v = text2vector(X_test, w2v_model, num_features)

# Support-vector classifier with scikit-learn's default settings.
# NOTE: `clf` and `y_pred` are rebound here and replace the naive Bayes results.
clf = svm.SVC()
clf.fit(X_train_w2v, y_train)
y_pred = clf.predict(X_test_w2v)

In [166]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Held-out accuracy of the Word2Vec + SVM model, followed by the confusion
# matrix (rows: true class, columns: predicted class).
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy is {}'.format(test_accuracy))
confusion_matrix(y_test, y_pred)

Accuracy is 0.669449081803005


array([[74,  1,  3,  7,  1, 14],
       [ 4, 65, 10, 11,  4,  6],
       [ 5,  8, 77,  2,  5,  3],
       [14, 10,  5, 55,  4, 11],
       [ 0,  5,  9,  6, 75,  5],
       [15,  5, 11, 10,  4, 55]], dtype=int64)