In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# multiclass classification
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost.sklearn import XGBClassifier 
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report, precision_score, mean_squared_error, r2_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_validate, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn_pandas import DataFrameMapper, cross_val_score, features_generator, gen_features
from sklearn.pipeline import Pipeline
import seaborn as sns
import scipy.stats as st
import time

In [2]:
df = pd.read_csv('data_book_clean.csv')

In [3]:
df = df.drop(columns='Unnamed: 0')
df = df[df['normal'].notnull()]
df = df[df['sinopsis'].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6419 entries, 0 to 7182
Data columns (total 21 columns):
autor                  6419 non-null object
cod_autor              6419 non-null float64
cod_sello_editorial    6419 non-null int64
coleccion              6419 non-null object
contratapa             6376 non-null object
digital                2242 non-null object
fecha_nov              6419 non-null object
genero_1               6419 non-null object
idioma                 6419 non-null object
isbn                   6419 non-null object
matnr                  6419 non-null object
medidas                4146 non-null object
paginas                6171 non-null float64
portada                6419 non-null object
pvp                    6419 non-null float64
region                 6419 non-null object
sello_editorial        6419 non-null object
sinopsis               6419 non-null object
titulo                 6419 non-null object
texto                  6419 non-null object
normal     

In [4]:
df['genero_1'] = df.apply(lambda x: re.sub('ENSAYO POLITICO', 'ENSAYOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('ENSAYO RELIGIOSO', 'ENSAYOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS CHILENAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS LATINOAMERICANAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS EXTRANJEROS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS LATINOAMERICANOS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('CUENTOS ARGENTINOS', 'CUENTOS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NARRATIVA BREVE', 'NARRATIVA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NARRATIVA FEMENINA', 'NARRATIVA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS ARGENTINAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('NOVELAS EXTRANJERAS', 'NOVELA', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('ENTRETENIMIENTO Y OCIO', 'OBRAS DIVERSAS', x['genero_1']), axis=1)
df['genero_1'] = df.apply(lambda x: re.sub('HISTORIA', 'OBRAS DIVERSAS', x['genero_1']), axis=1)

In [5]:
df['paginas'].fillna(192, inplace=True)

In [6]:
# Corregimos las editoriales
df['sello_editorial'].replace('DEBOLS!LLO','DEBOLSILLO', inplace=True)
df['sello_editorial'].replace(['ALFAGUARA I.' ,'ALFAGUARA J.', 'ALFAGUARA INFANTIL JUVENIL', 'CLARIN-ALFAGUARA'], ['ALFAGUARA','ALFAGUARA','ALFAGUARA','ALFAGUARA'], inplace=True)
df['sello_editorial'].replace(['SUDAMERICANA INFANTIL JUVENIL','SUDAMERICANA/ COPPPAL','SUDAMERICANA-EUDEBA'],['SUDAMERICANA','SUDAMERICANA','SUDAMERICANA'], inplace=True)
df['sello_editorial'].replace(['LITERATURA RANDOM HOUSE','ANAGRAMA & LITERATURA RANDOM HOUSE','PENGUIN RANDOM HOUSE','PENGUIN CLÁSICOS'],['RANDOM','RANDOM','RANDOM','RANDOM'], inplace=True)
df['sello_editorial'].replace('ORIGEN KIDS','ORIGEN', inplace = True)

In [7]:
# Preprocesamos el sello para la dummy (notebook Diego)
g = df.groupby('sello_editorial')
df.loc[g['sello_editorial'].transform(lambda x: len(x) < 100).astype(bool), 'sello_editorial'] = 'OTROS SELLOS'

In [8]:
dft01 = df.groupby('genero_1').filter(lambda x: x['genero_1'].count() > 80)
dft01.genero_1.value_counts()

NOVELA            1656
ENSAYOS           1109
INFANTILES         770
OBRAS DIVERSAS     654
NARRATIVA          565
AUTOAYUDA          379
BIOGRAFIAS         265
JUVENILES          201
CUENTOS            199
FICCION            175
Name: genero_1, dtype: int64

In [9]:
#dft02 = dft01[['sinopsis', 'titulo', 'texto', 'normal', 'genero_1']]
dft02 = dft01[['sello_editorial', 'paginas', 'titulo', 'normal', 'genero_1']]

In [10]:
dft03 = pd.get_dummies(dft02, columns=['sello_editorial'])

In [11]:
mapper3 = DataFrameMapper([
    ('titulo', TfidfVectorizer()),
    ('normal', TfidfVectorizer()),
    ('paginas', None),
    ('sello_editorial_AGUILAR', None),
    ('sello_editorial_ALFAGUARA', None),
    ('sello_editorial_ALTEA', None),
    ('sello_editorial_DEBATE', None),
    ('sello_editorial_DEBOLSILLO', None),
    ('sello_editorial_GRIJALBO', None),
    ('sello_editorial_LUMEN', None),
    ('sello_editorial_MONDADORI', None),
    ('sello_editorial_MONTENA', None),
    ('sello_editorial_OTROS SELLOS', None),
    ('sello_editorial_PLAZA & JANES', None),
    ('sello_editorial_RANDOM', None),
    ('sello_editorial_ROCA EDITORIAL', None),
    ('sello_editorial_SUDAMERICANA', None),
    ('sello_editorial_SUMA DE LETRAS', None)
])

In [12]:
X_2 = mapper3.fit_transform(dft03)

In [13]:
y = dft03['genero_1']
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)

In [14]:
one_to_left = st.beta(10, 1)

In [15]:
params = {
    "n_estimators": st.randint(100,400), # Number of boosted trees to fit.
#    "n_estimators": st.randint(5,10),
    "max_depth": st.randint(3, 12),     # Maximum tree depth for base learners.
    "learning_rate": st.uniform(0.05, 0.4), #     Boosting learning rate (xgb’s “eta”)
    "colsample_bytree": one_to_left, #      Subsample ratio of columns when constructing each tree.
    "subsample": one_to_left,     # Subsample ratio of the training instance.
    "gamma": st.uniform(0, 10), #     Minimum loss reduction required to make a further partition on a leaf node of the tree.
    'reg_alpha': st.uniform(0.05,10),   # L1 regularization term on weights
    "min_child_weight": st.uniform(1,20), #    Minimum sum of instance weight(hessian) needed in a child.
}

In [16]:
# Model #03 LightGBM_normal_titulo_features_numbers
model_03 = lgb.LGBMClassifier()
lgbm_03 = RandomizedSearchCV(model_03, params, n_iter = 25, verbose=1)

In [17]:
# split data into X and y para Model02
seed = 7
test_size = 0.2
X_train03, X_test03, y_train03, y_test03 = train_test_split(X_2, encoded_y, test_size=test_size, random_state=seed)

In [18]:
print(X_train03.shape)
print(X_test03.shape)
print(y_train03.shape)
print(y_test03.shape)

(4778, 74529)
(1195, 74529)
(4778,)
(1195,)


In [19]:
start = time.time()
lgbm_03.fit(X_train03, y_train03)
end = time.time()
print("La búsqueda con LightGBM llevó: ", end - start, " segundos")

Fitting 3 folds for each of 25 candidates, totalling 75 fits


MemoryError: 