In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# multiclass classification
import xgboost
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report, precision_score, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper, cross_val_score, features_generator, gen_features
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('data_book_clean.csv')

In [3]:
df = df.drop(columns='Unnamed: 0')
df = df[df['normal'].notnull()]
df = df[df['sinopsis'].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6419 entries, 0 to 7182
Data columns (total 21 columns):
autor                  6419 non-null object
cod_autor              6419 non-null float64
cod_sello_editorial    6419 non-null int64
coleccion              6419 non-null object
contratapa             6376 non-null object
digital                2242 non-null object
fecha_nov              6419 non-null object
genero_1               6419 non-null object
idioma                 6419 non-null object
isbn                   6419 non-null object
matnr                  6419 non-null object
medidas                4146 non-null object
paginas                6171 non-null float64
portada                6419 non-null object
pvp                    6419 non-null float64
region                 6419 non-null object
sello_editorial        6419 non-null object
sinopsis               6419 non-null object
titulo                 6419 non-null object
texto                  6419 non-null object
normal     

In [4]:
df['genero_1'] = df.apply(lambda x: re.sub('ENSAYO POLITICO', 'ENSAYOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('ENSAYO RELIGIOSO', 'ENSAYOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS CHILENAS', 'NOVELAS EXTRANJERAS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS LATINOAMERICANAS', 'NOVELAS EXTRANJERAS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS EXTRANJEROS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS LATINOAMERICANOS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS ARGENTINOS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NARRATIVA BREVE', 'NARRATIVA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NARRATIVA FEMENINA', 'NARRATIVA', x['genero_1']), axis=1)

In [5]:
df.genero_1.value_counts()

ENSAYOS                   1109
NOVELAS EXTRANJERAS        967
INFANTILES                 770
NARRATIVA                  565
OBRAS DIVERSAS             484
NOVELAS ARGENTINAS         451
AUTOAYUDA                  379
BIOGRAFIAS                 265
NOVELA                     238
JUVENILES                  201
CUENTOS                    199
FICCION                    175
ENTRETENIMIENTO Y OCIO      89
HISTORIA                    81
SIN DET. ALFAGUARA          48
NO FICCION                  48
LITERATURA                  41
GUIAS                       31
POESIAS                     30
CRONICA                     25
ANTOLOGIAS                  24
HUMOR                       18
ESPIRITUALIDAD              17
AUTOBIOGRAFIA               14
MEMORIAS                    13
NEGOCIOS                    13
PERIODISMO                  13
COCINA                      12
INVESTIGACION               12
DICCIONARIO                 12
EPISTOLAR                   11
OBRAS DE REFERENCIA         11
COMICS  

In [6]:
dft01 = df.groupby('genero_1').filter(lambda x: x['genero_1'].count() > 80)
dft01.genero_1.value_counts()

ENSAYOS                   1109
NOVELAS EXTRANJERAS        967
INFANTILES                 770
NARRATIVA                  565
OBRAS DIVERSAS             484
NOVELAS ARGENTINAS         451
AUTOAYUDA                  379
BIOGRAFIAS                 265
NOVELA                     238
JUVENILES                  201
CUENTOS                    199
FICCION                    175
ENTRETENIMIENTO Y OCIO      89
HISTORIA                    81
Name: genero_1, dtype: int64

In [21]:
dft02 = dft01[['sinopsis', 'titulo', 'texto', 'normal', 'genero_1']]

In [22]:
feature_def = gen_features(
    columns=['sinopsis', 'titulo', 'normal'],
    classes=[TfidfVectorizer]
)

In [23]:
feature_def

[('sinopsis',
  [TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)]),
 ('titulo',
  [TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)]),
 ('normal',
  [TfidfVectorizer(analy

In [24]:
mapper5 = DataFrameMapper(feature_def)

In [25]:
dft05 = dft02[['sinopsis','titulo', 'normal']]

In [26]:
X = mapper5.fit_transform(dft05)

In [27]:
y = dft02['genero_1']
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)

In [28]:
y_cat = y.unique()
y_cat

array(['NOVELAS EXTRANJERAS', 'ENSAYOS', 'NOVELA', 'CUENTOS', 'FICCION',
       'JUVENILES', 'INFANTILES', 'AUTOAYUDA', 'NARRATIVA',
       'NOVELAS ARGENTINAS', 'OBRAS DIVERSAS', 'ENTRETENIMIENTO Y OCIO',
       'BIOGRAFIAS', 'HISTORIA'], dtype=object)

In [29]:
# split data into X and y
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, test_size=test_size, random_state=seed)

In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4778, 93487)
(1195, 93487)
(4778,)
(1195,)


In [31]:
#Model RandomForestClassifier #01     
forest01 = RandomForestClassifier(n_estimators=400, random_state=1, n_jobs=-1)
forest01.fit(X_train, y_train)
predicted01 = forest01.predict(X_test)

In [36]:
precision_score(y_test, predicted01, average='weighted')

0.7736134517298658

In [37]:
report = classification_report(y_test, predicted01)
print(report)

             precision    recall  f1-score   support

          0       0.93      0.78      0.85        67
          1       0.81      0.35      0.49        49
          2       0.80      0.59      0.68        41
          3       0.69      0.93      0.79       213
          4       1.00      0.67      0.80        12
          5       0.71      0.12      0.21        40
          6       1.00      0.37      0.54        19
          7       0.84      0.97      0.90       156
          8       1.00      0.23      0.38        47
          9       0.89      0.36      0.52       129
         10       0.80      0.30      0.43        54
         11       0.93      0.60      0.73        85
         12       0.45      0.92      0.61       195
         13       0.93      0.62      0.75        88

avg / total       0.77      0.69      0.67      1195



In [38]:
print('Accuracy: %.2f%%' % (accuracy_score(y_test, predicted01) * 100))

Accuracy: 68.79%
