In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# multiclass classification
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost.sklearn import XGBClassifier 
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report, precision_score, mean_squared_error, r2_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_validate, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn_pandas import DataFrameMapper, cross_val_score, features_generator, gen_features
from sklearn.pipeline import Pipeline
import seaborn as sns
import scipy.stats as st
import time

In [2]:
df = pd.read_csv('data_book_clean.csv')

In [3]:
df = df.drop(columns='Unnamed: 0')
df = df[df['normal'].notnull()]
df = df[df['sinopsis'].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6419 entries, 0 to 7182
Data columns (total 21 columns):
autor                  6419 non-null object
cod_autor              6419 non-null float64
cod_sello_editorial    6419 non-null int64
coleccion              6419 non-null object
contratapa             6376 non-null object
digital                2242 non-null object
fecha_nov              6419 non-null object
genero_1               6419 non-null object
idioma                 6419 non-null object
isbn                   6419 non-null object
matnr                  6419 non-null object
medidas                4146 non-null object
paginas                6171 non-null float64
portada                6419 non-null object
pvp                    6419 non-null float64
region                 6419 non-null object
sello_editorial        6419 non-null object
sinopsis               6419 non-null object
titulo                 6419 non-null object
texto                  6419 non-null object
normal     

In [4]:
df['genero_1'] = df.apply(lambda x: re.sub('ENSAYO POLITICO', 'ENSAYOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('ENSAYO RELIGIOSO', 'ENSAYOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS CHILENAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS LATINOAMERICANAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS EXTRANJEROS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS LATINOAMERICANOS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS ARGENTINOS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NARRATIVA BREVE', 'NARRATIVA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NARRATIVA FEMENINA', 'NARRATIVA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS ARGENTINAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS EXTRANJERAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('ENTRETENIMIENTO Y OCIO', 'OBRAS DIVERSAS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('HISTORIA', 'OBRAS DIVERSAS', x['genero_1']), axis=1)

In [5]:
df.sample(2)

Unnamed: 0,autor,cod_autor,cod_sello_editorial,coleccion,contratapa,digital,fecha_nov,genero_1,idioma,isbn,...,medidas,paginas,portada,pvp,region,sello_editorial,sinopsis,titulo,texto,normal
5052,John Katzenbach,34820.0,200,LA TRAMA,"«Feliz aniversario, doctor. Bienvenido al prim...",,1970-01-01 00:00:00.020090909,FICCION,ES,9788466642095,...,15.0 X 23.0 X 3.5,528.0,http://static.megustaleer.com.ar/images/libros...,529.0,AR,EDICIONES B,John Katzenbach nos ofrece una novela emblemát...,El Psicoanalista,"Feliz aniversario, doctor. Bienvenido al prime...",feliz aniversario doctor bienvenido primer día...
4281,E.L. James,33995.0,34,FICCION,E.L. James nos ofrece una nueva perspectiva de...,,1970-01-01 00:00:00.020150801,NOVELA,ES,9789502808260,...,15 X 23,648.0,http://static.megustaleer.com.ar/images/libros...,499.0,AR,GRIJALBO,Descubre el mundo de Cincuenta sombras de Grey...,Grey,E.L. James nos ofrece una nueva perspectiva de...,el james ofrece nueva perspectiva historia amo...


In [6]:
# Reemplazamos los null de las paginas por la moda
df.paginas.mode()

0    192.0
dtype: float64

In [7]:
df['paginas'].fillna(192, inplace=True)

In [8]:
# Corregimos las editoriales
df['sello_editorial'].replace('DEBOLS!LLO','DEBOLSILLO', inplace=True)
df['sello_editorial'].replace(['ALFAGUARA I.' ,'ALFAGUARA J.', 'ALFAGUARA INFANTIL JUVENIL', 'CLARIN-ALFAGUARA'], ['ALFAGUARA','ALFAGUARA','ALFAGUARA','ALFAGUARA'], inplace=True)
df['sello_editorial'].replace(['SUDAMERICANA INFANTIL JUVENIL','SUDAMERICANA/ COPPPAL','SUDAMERICANA-EUDEBA'],['SUDAMERICANA','SUDAMERICANA','SUDAMERICANA'], inplace=True)
df['sello_editorial'].replace(['LITERATURA RANDOM HOUSE','ANAGRAMA & LITERATURA RANDOM HOUSE','PENGUIN RANDOM HOUSE','PENGUIN CLÁSICOS'],['RANDOM','RANDOM','RANDOM','RANDOM'], inplace=True)
df['sello_editorial'].replace('ORIGEN KIDS','ORIGEN', inplace = True)

In [9]:
# Preprocesamos el sello para la dummy (notebook Diego)
g = df.groupby('sello_editorial')
df.loc[g['sello_editorial'].transform(lambda x: len(x) < 100).astype(bool), 'sello_editorial'] = 'OTROS SELLOS'

In [10]:
df.genero_1.value_counts()

NOVELA                 1656
ENSAYOS                1109
INFANTILES              770
OBRAS DIVERSAS          654
NARRATIVA               565
AUTOAYUDA               379
BIOGRAFIAS              265
JUVENILES               201
CUENTOS                 199
FICCION                 175
NO FICCION               48
SIN DET. ALFAGUARA       48
LITERATURA               41
GUIAS                    31
POESIAS                  30
CRONICA                  25
ANTOLOGIAS               24
HUMOR                    18
ESPIRITUALIDAD           17
AUTOBIOGRAFIA            14
NEGOCIOS                 13
PERIODISMO               13
MEMORIAS                 13
COCINA                   12
INVESTIGACION            12
DICCIONARIO              12
EPISTOLAR                11
OBRAS DE REFERENCIA      11
COMICS                   10
SIN DETERMINAR            6
LIBROS ILUSTRADOS         5
TEATRO                    5
FOTOGRAFIAS               4
ALBUMES                   3
EDUCACION                 3
ENTREVISTAS         

In [11]:
dft01 = df.groupby('genero_1').filter(lambda x: x['genero_1'].count() > 80)
dft01.genero_1.value_counts()

NOVELA            1656
ENSAYOS           1109
INFANTILES         770
OBRAS DIVERSAS     654
NARRATIVA          565
AUTOAYUDA          379
BIOGRAFIAS         265
JUVENILES          201
CUENTOS            199
FICCION            175
Name: genero_1, dtype: int64

In [12]:
dft01.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 0 to 7182
Data columns (total 21 columns):
autor                  5973 non-null object
cod_autor              5973 non-null float64
cod_sello_editorial    5973 non-null int64
coleccion              5973 non-null object
contratapa             5935 non-null object
digital                2111 non-null object
fecha_nov              5973 non-null object
genero_1               5973 non-null object
idioma                 5973 non-null object
isbn                   5973 non-null object
matnr                  5973 non-null object
medidas                3841 non-null object
paginas                5973 non-null float64
portada                5973 non-null object
pvp                    5973 non-null float64
region                 5973 non-null object
sello_editorial        5973 non-null object
sinopsis               5973 non-null object
titulo                 5973 non-null object
texto                  5973 non-null object
normal     

In [13]:
#dft02 = dft01[['sinopsis', 'titulo', 'texto', 'normal', 'genero_1']]
dft02 = dft01[['sello_editorial', 'paginas', 'titulo', 'normal', 'genero_1']]

In [14]:
dft04 = dft02[['sello_editorial', 'paginas', 'titulo', 'normal']]

In [15]:
dft03 = pd.get_dummies(dft04,columns=['sello_editorial'])

In [16]:
dft03.sample(2)

Unnamed: 0,paginas,titulo,normal,sello_editorial_AGUILAR,sello_editorial_ALFAGUARA,sello_editorial_ALTEA,sello_editorial_DEBATE,sello_editorial_DEBOLSILLO,sello_editorial_GRIJALBO,sello_editorial_LUMEN,sello_editorial_MONDADORI,sello_editorial_MONTENA,sello_editorial_OTROS SELLOS,sello_editorial_PLAZA & JANES,sello_editorial_RANDOM,sello_editorial_ROCA EDITORIAL,sello_editorial_SUDAMERICANA,sello_editorial_SUMA DE LETRAS
475,222.0,Doce cuentos peregrinos,esfuerzo escribir cuento corto tan intenso emp...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1131,272.0,El examen,buenos aire viene ver hueso dijo anoche llegó ...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
dft03.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 0 to 7182
Data columns (total 18 columns):
paginas                           5973 non-null float64
titulo                            5973 non-null object
normal                            5973 non-null object
sello_editorial_AGUILAR           5973 non-null uint8
sello_editorial_ALFAGUARA         5973 non-null uint8
sello_editorial_ALTEA             5973 non-null uint8
sello_editorial_DEBATE            5973 non-null uint8
sello_editorial_DEBOLSILLO        5973 non-null uint8
sello_editorial_GRIJALBO          5973 non-null uint8
sello_editorial_LUMEN             5973 non-null uint8
sello_editorial_MONDADORI         5973 non-null uint8
sello_editorial_MONTENA           5973 non-null uint8
sello_editorial_OTROS SELLOS      5973 non-null uint8
sello_editorial_PLAZA & JANES     5973 non-null uint8
sello_editorial_RANDOM            5973 non-null uint8
sello_editorial_ROCA EDITORIAL    5973 non-null uint8
sello_editorial_SUDAMERIC

In [23]:
dft06 = dft03[['paginas', 'sello_editorial_AGUILAR', 'sello_editorial_ALFAGUARA', 'sello_editorial_ALTEA',
               'sello_editorial_DEBATE','sello_editorial_DEBOLSILLO', 'sello_editorial_GRIJALBO', 
               'sello_editorial_LUMEN', 'sello_editorial_MONDADORI', 'sello_editorial_MONTENA',
               'sello_editorial_OTROS SELLOS', 'sello_editorial_PLAZA & JANES', 'sello_editorial_RANDOM',
               'sello_editorial_ROCA EDITORIAL', 'sello_editorial_SUDAMERICANA', 'sello_editorial_SUMA DE LETRAS']]

In [26]:
X_numbers = dft06.values

In [29]:
X_3 = np.concatenate((X_1, X_numbers), axis=1) 

In [18]:
feature_def = gen_features(
    columns=['titulo', 'normal'],
    classes=[TfidfVectorizer]
)

In [None]:
mapper3 = DataFrameMapper([
    ('titulo', TfidfVectorizer()),
    ('normal', TfidfVectorizer()),
    ('paginas', None),
    ('sello_editorial_AGUILAR', None),
    ('sello_editorial_ALFAGUARA', None),
    ('sello_editorial_ALTEA', None),
    ('sello_editorial_DEBATE', None),
    ('sello_editorial_DEBOLSILLO', None),
    ('sello_editorial_GRIJALBO', None),
    ('sello_editorial_LUMEN', None),
    ('sello_editorial_MONDADORI', None),
    ('sello_editorial_MONTENA', None),
    ('sello_editorial_OTROS SELLOS', None),
    ('sello_editorial_PLAZA & JANES', None),
    ('sello_editorial_RANDOM', None),
    ('sello_editorial_ROCA EDITORIAL', None),
    ('sello_editorial_SUDAMERICANA', None),
    ('sello_editorial_SUMA DE LETRAS', None)
])

In [19]:
feature_def

[('titulo',
  [TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)]),
 ('normal',
  [TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)])]

In [20]:
mapper5 = DataFrameMapper(feature_def)

In [21]:
dft05 = dft03[['titulo', 'normal']]

In [22]:
X_1 = mapper5.fit_transform(dft05)

In [None]:
X_2 = mapper3.fit_transform(dft03)

In [30]:
feature = dft02['normal']
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
f = vectorizer.fit_transform(feature)
X = f

In [31]:
y = dft02['genero_1']
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)

In [32]:
y_cat = y.unique()
y_cat

array(['NOVELA', 'ENSAYOS', 'CUENTOS', 'FICCION', 'JUVENILES',
       'INFANTILES', 'AUTOAYUDA', 'NARRATIVA', 'OBRAS DIVERSAS',
       'BIOGRAFIAS'], dtype=object)

In [None]:
# split data into X and y
seed = 7
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, test_size=test_size, random_state=seed)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [34]:
one_to_left = st.beta(10, 1)

In [35]:
params = {
    "n_estimators": st.randint(100,400), # Number of boosted trees to fit.
#    "n_estimators": st.randint(5,10),
    "max_depth": st.randint(3, 12),     # Maximum tree depth for base learners.
    "learning_rate": st.uniform(0.05, 0.4), #     Boosting learning rate (xgb’s “eta”)
    "colsample_bytree": one_to_left, #      Subsample ratio of columns when constructing each tree.
    "subsample": one_to_left,     # Subsample ratio of the training instance.
    "gamma": st.uniform(0, 10), #     Minimum loss reduction required to make a further partition on a leaf node of the tree.
    'reg_alpha': st.uniform(0.05,10),   # L1 regularization term on weights
    "min_child_weight": st.uniform(1,20), #    Minimum sum of instance weight(hessian) needed in a child.
}

In [None]:
# Model #01 LightGBM_normal
model_01 = lgb.LGBMClassifier()
lgbm = RandomizedSearchCV(model_01, params, n_iter = 25, verbose=1)

In [None]:
start = time.time()
lgbm.fit(X_train, y_train)
end = time.time()
print("La búsqueda con LightGBM llevó: ", end - start, " segundos")

In [None]:
print(lgbm)

In [None]:
opt_lgbm = lgbm.best_estimator_
opt_lgbm

In [None]:
predicted01 = lgbm.predict(X_test)

In [None]:
precision_score(y_test, predicted01, average='weighted')

In [None]:
report = classification_report(y_test, predicted01)
print(report)

In [None]:
print('Accuracy: %.2f%%' % (accuracy_score(y_test, predicted01) * 100))

In [None]:
cats = y.unique().tolist()

In [None]:
print(classification_report(y_test, predicted01, target_names=cats))

In [None]:
mat = confusion_matrix(y_test, predicted01)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=cats,
            yticklabels=cats)
plt.xlabel('true label')
plt.ylabel('predicted label');

In [None]:
# Model #02 LightGBM_normal_titulo
model_02 = lgb.LGBMClassifier()
lgbm_02 = RandomizedSearchCV(model_02, params, n_iter = 25, verbose=1)

In [None]:
# split data into X and y para Model02
seed = 7
test_size = 0.2
X_train02, X_test02, y_train02, y_test02 = train_test_split(X_1, encoded_y, test_size=test_size, random_state=seed)

In [None]:
start = time.time()
lgbm_02.fit(X_train02, y_train02)
end = time.time()
print("La búsqueda con LightGBM llevó: ", end - start, " segundos")

In [None]:
opt_lgbm_02 = lgbm_02.best_estimator_
opt_lgbm_02

In [None]:
predicted02 = lgbm_02.predict(X_test02)

In [None]:
precision_score(y_test02, predicted02, average='weighted')

In [None]:
report_02 = classification_report(y_test02, predicted02)
print(report_02)

In [None]:
print('Accuracy: %.2f%%' % (accuracy_score(y_test02, predicted02) * 100))

In [None]:
print('Accuracy: %.2f%%' % (accuracy_score(y_test02, predicted02) * 100))

In [None]:
print(classification_report(y_test02, predicted02, target_names=cats))

In [None]:
mat = confusion_matrix(y_test02, predicted02)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=cats,
            yticklabels=cats)
plt.xlabel('true label')
plt.ylabel('predicted label');

In [36]:
# Model #03 LightGBM_normal_titulo_features_numbers
model_03 = lgb.LGBMClassifier()
lgbm_03 = RandomizedSearchCV(model_03, params, n_iter = 25, verbose=1)

In [38]:
# split data into X and y para Model02
seed = 7
test_size = 0.2
X_train03, X_test03, y_train03, y_test03 = train_test_split(X_3, encoded_y, test_size=test_size, random_state=seed)

MemoryError: 

In [None]:
print(X_train03.shape)
print(X_test03.shape)
print(y_train03.shape)
print(y_test03.shape)

In [None]:
start = time.time()
lgbm_03.fit(X_train03, y_train03)
end = time.time()
print("La búsqueda con LightGBM llevó: ", end - start, " segundos")