In [1]:
import pandas as pd
import numpy as np
import sweetviz as sv

In [2]:
# Every CSV is read with the same option; low_memory=False parses each file in
# one pass so column dtypes are inferred from the whole column.
_READ_OPTS = {'low_memory': False}

train = pd.read_csv('../data/train_ratings.csv', **_READ_OPTS)
test = pd.read_csv('../data/test_ratings.csv', **_READ_OPTS)
users = pd.read_csv('../data/users.csv', **_READ_OPTS)
books = pd.read_csv('../data/books.csv', **_READ_OPTS)
sample_submission = pd.read_csv('../data/sample_submission.csv', **_READ_OPTS)

In [3]:
def preprocess_book(df, swapped_markers=('DK Publishing Inc', 'Gallimard')):
    """Repair rows with swapped ``year``/``author`` values and clean ``year``.

    Rows whose ``year`` holds one of ``swapped_markers`` have their ``year``
    and ``author`` values exchanged. ``year`` is then cast to float and a
    year of 0 is replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``year`` and ``author`` columns. It is modified in place.
    swapped_markers : iterable of str, optional
        Values of ``year`` that identify a row needing the swap.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.

    Raises
    ------
    ValueError
        If a ``year`` value cannot be converted to float after the swap.
    """
    # Compute the mask ONCE: after `year` is overwritten the markers are gone,
    # so recomputing it would no longer select the rows that need fixing.
    swapped = df['year'].isin(list(swapped_markers))

    # Capture both sides before assigning so neither write clobbers the other.
    year_values = df.loc[swapped, 'author'].values
    author_values = df.loc[swapped, 'year'].values
    df.loc[swapped, 'year'] = year_values
    df.loc[swapped, 'author'] = author_values

    df['year'] = df['year'].astype(float)
    # A year of 0 is treated as missing.
    df.loc[df['year'] == 0, 'year'] = np.nan
    return df

# Clean the book metadata; preprocess_book returns the frame it was given.
books = preprocess_book(books)

In [4]:
# Left-join user attributes and then book attributes onto each ratings frame.
# No `on=` is given, so pandas joins on the columns the two frames share.
train, test = (
    ratings.merge(users, how='left').merge(books, how='left')
    for ratings in (train, test)
)

In [5]:
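# Disabled experiment (kept for reference): for each listed column, drop the
# rows of a copy of `train` whose value never appears in `test`.
# train2 = train.copy()

# for col in ['city', 'province', 'country', 'title', 'author', 'publisher']:
#     idx = np.array(train2[col].isin(test[col].unique()).loc[lambda x: ~x].index)
#     train2.drop(index=idx, inplace=True)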

In [6]:
# Stack the rated train pairs on top of the test pairs; test rows get a
# placeholder rating of 0.
all_interactions = pd.concat(
    [
        train[['user_id', 'book_id', 'rating']],
        test[['user_id', 'book_id']].assign(rating=0),
    ]
)

# Keep a reproducible sample of 10000 distinct users.
sample_user_id = (
    pd.Series(all_interactions['user_id'].unique())
    .sample(n=10000, random_state=4)
    .values
)
all_interactions = all_interactions.loc[all_interactions['user_id'].isin(sample_user_id)]

# user x book matrix of ratings (pivot_table's default aggregation applies).
pivot_interactions = all_interactions.pivot_table(
    values='rating', index='user_id', columns='book_id'
)
pivot_interactions

book_id,0000694cce,0005eca48c,00092574c6,00096fc1b4,000c4b34c1,000d218a64,000fcede56,001425d200,00197a36cd,001a8c395c,...,ffea0e8767,ffea679bfc,ffec52a88e,ffeff5d6e9,fff2518ddf,fff7436599,fff910c0a6,fff96e5833,fffb9ddc28,fffcf3cfa2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00005d011d,,,,,,,,,,,...,,,,,,,,,,
0001796186,,,,,,,,,,,...,,,,,,,,,,
00038787e6,,,,,,,,,,,...,,,,,,,,,,
0005cd3c50,,,,,,,,,,,...,,,,,,,,,,
0007f1d62c,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffe39a737d,,,,,,,,,,,...,,,,,,,,,,
fff38493f5,,,,,,,,,,,...,,,,,,,,,,
fff8db9478,,,,,,,,,,,...,,,,,,,,,,
fffe1ddcc0,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Copy the interactions, sorted by user_id, to the system clipboard.
# NOTE(review): to_clipboard needs a clipboard backend, so this cell may fail
# in a headless run — confirm whether it should stay in the notebook.
all_interactions.sort_values('user_id').to_clipboard()

In [None]:
# Display the user x book rating pivot.
pivot_interactions

In [24]:
# Display the stacked train/test interaction table (test rows carry rating 0).
all_interactions

Unnamed: 0,user_id,book_id,rating
0,9db527ea34,69173ee3b6,5
1,3db2595a13,58d33fe06a,8
2,375781e597,512b5d69de,10
3,f00ee6360d,98c1419160,9
4,8b893fb104,99c56ce036,9
...,...,...,...
134302,598825e90a,29f4516f72,0
134303,0e1ff052ae,d27721acf2,0
134304,0baec5a22d,121a671e06,0
134305,967874531a,c414ce067b,0


In [7]:
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [None]:
# NOTE(review): `df_train` / `df_test` are not defined in the visible cells —
# confirm where they come from before running this cell.
features = ['age', 'city', 'province', 'country', 'title', 'author', 'year', 'publisher']
target = 'rating'


def _suffix_to_int(value):
    """Return the integer that follows the last underscore in ``value``."""
    return int(value.split('_')[-1])


# Replace every feature value with its integer suffix, in both frames.
for col in features:
    for frame in (df_train, df_test):
        frame[col] = frame[col].map(_suffix_to_int)

X, y = df_train[features], df_train[target]

# Hold out 33% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [8]:
# List the columns of the merged training frame.
train.columns

Index(['id', 'user_id', 'book_id', 'rating', 'age', 'city', 'province',
       'country', 'title', 'author', 'year', 'publisher'],
      dtype='object')

In [21]:
# Display the merged training frame.
# NOTE(review): the saved output has gaps in the index (1, 3, 4, 6, ...), so
# `train` was presumably filtered by a cell that is not present here — confirm.
train

Unnamed: 0,id,user_id,book_id,rating,age,city,province,country,title,author,year,publisher
1,9ecc1d4a2e9a7476,3db2595a13,58d33fe06a,8,41.0,romney,west virginia,usa,The No. 1 Ladies' Detective Agency (Today Show...,Alexander McCall Smith,2003.0,Anchor
3,617ef107e6ffed5a,f00ee6360d,98c1419160,9,,san diego,,usa,The Summons,John Grisham,2002.0,Dell Publishing Company
4,0e2be280941c4d1f,8b893fb104,99c56ce036,9,26.0,springfield,missouri,usa,"Starman (The Axis Trilogy, Bk 3)",Sara Douglass,2002.0,Tor Books
6,efbb30ccb29583c4,3ae810fa9d,e0aa3f0fc0,9,29.0,porto,porto,portugal,Celtic Fairy Tales,Joseph Jacobs,,Parragon
7,c95da7fa58219065,0405213304,9392a59e6a,4,,irvine,california,,The Reef,Nora Roberts,1999.0,Jove Books
...,...,...,...,...,...,...,...,...,...,...,...,...
249419,6dbfef61c2fc29fa,1a8a4f0b39,402a0ed40f,9,,providence,rhode island,usa,The Practical Encycopedia of Feng Shui,Gill Hale,2001.0,Hermes House
249420,0454a4c5989ef6c6,5238703487,c13d6aa85e,7,37.0,köln,nordrhein-westfalen,germany,"Fischer TaschenbÃ?Â¼cher, Bd.26, SchÃ?Â¶ne neu...",Aldous Huxley,2002.0,"Fischer (Tb.), Frankfurt"
249421,b9a8ff87e1cd3db7,239f28d3df,20a149064d,10,42.0,janesville,california,usa,Against the Odds,Elizabeth Moon,2001.0,Baen
249424,0204e100e468d273,605e3aa44a,3d7ad0d1d2,6,,cleveland,ohio,usa,The Last Family,John R. Miller,1997.0,Bantam Books


In [None]:
# Configure a Word2Vec model with fixed hyperparameters and seed.
# NOTE(review): `Word2Vec` is not imported in the visible cells — presumably
# gensim's; confirm and add the import to the top import cell.
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

# Build the vocabulary from `purchases_train`.
# NOTE(review): `purchases_train` is not defined in the visible cells — confirm its source.
model.build_vocab(purchases_train, progress_per=200)

# Train for 10 epochs, passing the model's own corpus_count as the example total.
model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)