In [41]:
!pip install wget



In [65]:
import pandas as pd
import numpy as np
from scipy.stats import mode

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import wget
import pickle

In [43]:
sns.set()

In [74]:
train_data = pd.read_csv('data/train.csv')

In [45]:
# Peek at one raw genre string (row label 1) to see the '|'-separated format.
train_data.loc[1, "book_genre"]

'Language|Writing|Nonfiction'

In [47]:
train_data.head()

Unnamed: 0,id,book_title,book_image_url,book_desc,book_genre,book_authors,book_format,book_pages,book_review_count,book_rating_count,book_rating
0,0,Forastera,https://images.gr-assets.com/books/1500683049l...,"Recién acabada la Segunda Guerra Mundial, una ...",Historical|Historical Fiction|Romance|Fantasy|...,Diana Gabaldon|Carmen Bordeu,Kindle Edition,768 pages,40197,668892,4.22
1,1,Writing about Magic,https://images.gr-assets.com/books/1445900480l...,Do you write fantasy fiction? This book is a r...,Language|Writing|Nonfiction,Rayne Hall,Paperback,180 pages,27,126,3.95
2,2,The Stress of Her Regard,https://images.gr-assets.com/books/1503059955l...,When Michael Crawford discovers his bride brut...,Fantasy|Horror|Paranormal|Vampires|Historical|...,Tim Powers,Paperback,470 pages,331,3626,3.79
3,3,The Horrors and Absurdities of Religion,https://images.gr-assets.com/books/1409779869l...,"A fascinating examination of ethics, religion ...",Philosophy|Religion|Nonfiction|Classics,Arthur Schopenhauer,Paperback,106 pages,28,371,3.85
4,4,Three Tales,https://images.gr-assets.com/books/1311645483l...,"First published in 1877, these three stories a...",Fiction|Short Stories|Classics|Cultural|France...,Gustave Flaubert|Roger Whitehouse|Geoffrey Wall,Paperback,110 pages,250,4331,3.72


In [None]:
train_data.info()

## book_desc

In [None]:
# Rating distribution of the books that have no cover image URL.
no_image_mask = train_data['book_image_url'].isna()
train_data.loc[no_image_mask, 'book_rating'].value_counts()

In [None]:
train_data.loc[train_data['book_rating'] < 2,'book_image_url']

In [None]:
def get_pixel(url):
    """Return the most frequent raw value found in the image at ``url``.

    The image is downloaded with ``wget``, read with matplotlib, and all of
    its values (every channel of every pixel) are pooled together before the
    most common one is picked. On ties the smallest value wins.

    Parameters
    ----------
    url : str
        Address of the image. Missing values (NaN/None/empty string) are
        accepted because some rows have no image URL.

    Returns
    -------
    numpy scalar or float
        The modal value, or ``np.nan`` when ``url`` is missing.
    """
    import os

    if not isinstance(url, str) or not url:
        # No image to download for this row.
        return np.nan

    path = wget.download(url)
    try:
        img = plt.imread(path)
    finally:
        # wget writes into the working directory; do not leave one file per row behind.
        os.remove(path)

    # np.unique returns sorted values, so argmax picks the smallest value among ties.
    # This avoids indexing the result of scipy.stats.mode, whose shape for 1-D input
    # differs between SciPy versions.
    values, counts = np.unique(img, return_counts=True)
    return values[np.argmax(counts)]

In [None]:
train_data['book_pixel'] = train_data['book_image_url'].map(get_pixel)

In [78]:
train_data['book_genre'].mode()

0    Fiction
dtype: object

In [79]:
# Books without a genre get "Fiction", the most frequent value shown by mode() above.
train_data = train_data.fillna({'book_genre': "Fiction"})

In [80]:
def get_dict_authors(authors=None):
    """Count, for every author, the number of books they are credited on.

    Each entry is a '|'-separated string of author names. An author repeated
    inside one entry is counted only once for that book.

    Parameters
    ----------
    authors : iterable of str, optional
        Author strings to scan. Defaults to ``train_data['book_authors']``.
        Non-string entries (e.g. NaN for a missing value) are skipped.

    Returns
    -------
    dict
        Maps author name (plain ``str``) to the number of entries naming them.
    """
    from collections import Counter

    if authors is None:
        authors = train_data['book_authors'].values

    counts = Counter()
    for line in authors:
        if not isinstance(line, str):
            # Missing value: there is nothing to split or count.
            continue
        # sorted(set(...)) de-duplicates in the same order as np.unique but keeps
        # the keys as built-in str, so the pickled dict does not embed numpy types.
        counts.update(sorted(set(line.split('|'))))

    return dict(counts)

dict_authors = get_dict_authors()

In [81]:
def get_dict_genres(genres=None):
    """Count, for every genre tag, the number of books carrying it.

    Each entry is a '|'-separated string of genre tags. A tag repeated inside
    one entry is counted only once for that book.

    Parameters
    ----------
    genres : iterable of str, optional
        Genre strings to scan. Defaults to ``train_data['book_genre']``.
        Non-string entries (e.g. NaN for a missing value) are skipped.

    Returns
    -------
    dict
        Maps genre tag (plain ``str``) to the number of entries containing it.
    """
    from collections import Counter

    if genres is None:
        genres = train_data['book_genre'].values

    counts = Counter()
    for line in genres:
        if not isinstance(line, str):
            # Missing value: there is nothing to split or count.
            continue
        # sorted(set(...)) de-duplicates in the same order as np.unique but keeps
        # the keys as built-in str, so the pickled dict does not embed numpy types.
        counts.update(sorted(set(line.split('|'))))

    return dict(counts)

dict_genres = get_dict_genres()

In [91]:
# Persist both lookup tables under ../app/models. The context managers make sure
# each file is flushed and closed as soon as it has been written.
with open('../app/models/author_dict.pickle', 'wb') as author_file:
    pickle.dump(dict_authors, author_file)
with open('../app/models/genre_dict.pickle', 'wb') as genre_file:
    pickle.dump(dict_genres, genre_file)



In [82]:
def get_book_count(s, counts=None):
    """Return the largest book count among the authors listed in ``s``.

    Parameters
    ----------
    s : str
        '|'-separated author names of one book.
    counts : mapping, optional
        Author name -> book count. Defaults to the notebook-level
        ``dict_authors``. Authors absent from the mapping count as 0.

    Returns
    -------
    int
        Highest count found; 0 when ``s`` is not a string (missing value).
    """
    if not isinstance(s, str):
        # Missing author information: treat as "no known books".
        return 0
    if counts is None:
        counts = dict_authors

    # str.split always yields at least one element, so max() never sees an empty sequence.
    return max(counts.get(author, 0) for author in set(s.split('|')))
        
train_data['book_count'] = train_data['book_authors'].map(get_book_count)

In [83]:
def get_genre_count(s, counts=None):
    """Return the largest book count among the genre tags listed in ``s``.

    Parameters
    ----------
    s : str
        '|'-separated genre tags of one book.
    counts : mapping, optional
        Genre tag -> book count. Defaults to the notebook-level
        ``dict_genres``. Tags absent from the mapping count as 0.

    Returns
    -------
    int
        Highest count found; 0 when ``s`` is not a string (missing value).
    """
    if not isinstance(s, str):
        # Missing genre information: treat as "no known books".
        return 0
    if counts is None:
        counts = dict_genres

    # str.split always yields at least one element, so max() never sees an empty sequence.
    return max(counts.get(genre, 0) for genre in set(s.split('|')))
        
train_data['genre_count'] = train_data['book_genre'].map(get_genre_count)

In [None]:
# Alternative author feature: how many books share exactly the same set of authors.
# It is written to its own columns so that running the notebook top to bottom does
# not overwrite `book_authors` or the `book_count` feature derived from `dict_authors`,
# which is the lookup table that gets pickled for later use.
train_data['book_authors_normalized'] = train_data['book_authors'].map(lambda s: '|'.join(np.unique(s.split('|'))))
train_data['book_count_by_author_set'] = train_data.groupby(by='book_authors_normalized')['book_title'].transform('count')
# train_data['book_count_by_author_set'] = train_data['book_count_by_author_set'].map(lambda s: 3 if s > 3 else s)

In [84]:
# Pull the first run of digits out of strings such as "768 pages", convert it to a
# number, and use 320 wherever no page count could be parsed.
page_digits = train_data['book_pages'].str.extract(r'(\d+)', expand=False)
train_data['book_pages'] = pd.to_numeric(page_digits, errors='coerce').fillna(320)

In [None]:
train_data['book_pages'].value_counts()

In [85]:
# Target is the rating; every other column stays available as a potential feature.
y = train_data['book_rating']
X = train_data.drop(columns='book_rating')

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [86]:
# Unigram TF-IDF over the descriptions; min_df=0.1 keeps only terms that occur in
# at least 10% of the training documents.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=0.1,
)

# Fit on the training split only, then reuse the fitted vocabulary for validation.
train_descriptions = X_train['book_desc']
valid_descriptions = X_valid['book_desc']
X_train_vectorized = vectorizer.fit_transform(train_descriptions)
X_valid_vectorized = vectorizer.transform(valid_descriptions)

In [53]:
#pickle.dump(vectorizer, open('../app/models/tf_idf.pickle', 'wb'))
#vectorizer = pickle.load(open('../app/models/tf_idf.pickle', 'rb'))
#vectorizer.transform(X_valid['book_desc']).toarray()

array([[0.        , 0.08420589, 0.12655557, ..., 0.        , 0.08420589,
        0.        ],
       [0.        , 0.        , 0.08904286, ..., 0.        , 0.        ,
        0.        ],
       [0.07548533, 0.        , 0.        , ..., 0.        , 0.08555758,
        0.        ],
       ...,
       [0.        , 0.        , 0.17988032, ..., 0.        , 0.        ,
        0.        ],
       [0.18186292, 0.10306474, 0.07744955, ..., 0.        , 0.        ,
        0.        ],
       [0.11770541, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [87]:
# Final design matrices: dense TF-IDF columns followed by the numeric features.
extra_columns = ['book_pages', 'book_rating_count', 'book_count', 'genre_count']

train = np.concatenate(
    [X_train_vectorized.toarray(), X_train[extra_columns].to_numpy()],
    axis=1,
)
valid = np.concatenate(
    [X_valid_vectorized.toarray(), X_valid[extra_columns].to_numpy()],
    axis=1,
)

## model

In [None]:
# preprocessor = ColumnTransformer(transformers=[('tf-idf', TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=0.4), ['book_desc']),
#                                                ('ohe', OneHotEncoder(), ['book_count'])])

# clf_xgb = Pipeline(steps=[('preprocessor', preprocessor),
#                           ('classifier', XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, n_jobs=-1))])

In [88]:
# clf_xgb = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, n_jobs=-1)
# clf_xgb.fit(X_train_vectorized , y_train)
# y_pred = clf_xgb.predict(X_valid_vectorized )

# Gradient-boosted regressor trained on the combined TF-IDF + numeric features.
xgb_params = dict(max_depth=3, learning_rate=0.1, n_estimators=100, n_jobs=-1)
clf_xgb = XGBRegressor(**xgb_params)
clf_xgb.fit(train, y_train)

# Predictions on the held-out split, used for the error metric below.
y_pred = clf_xgb.predict(valid)

In [92]:
# Persist the fitted model; the context manager flushes and closes the file.
with open('../app/models/model.pickle', 'wb') as model_file:
    pickle.dump(clf_xgb, model_file)

In [89]:
# Validation RMSE. The square root is taken explicitly because the `squared`
# keyword of mean_squared_error is deprecated and absent from newer scikit-learn.
np.sqrt(mean_squared_error(y_valid, y_pred))

0.3017615849903846

In [90]:
# Validation RMSE. The square root is taken explicitly because the `squared`
# keyword of mean_squared_error is deprecated and absent from newer scikit-learn.
np.sqrt(mean_squared_error(y_valid, y_pred))

0.3017615849903846

In [None]:
np.sqrt(mean_squared_error(y_valid, y_pred))