In [1]:
import pandas as pd
import numpy as np
import sweetviz as sv

In [2]:
# Read all competition tables with the same parser settings; the paths are
# listed in the same order as the names they are unpacked onto.
train, test, users, books, sample_submission = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/train_ratings.csv',
        '../data/test_ratings.csv',
        '../data/users.csv',
        '../data/books.csv',
        '../data/sample_submission.csv',
    )
)

In [3]:
def preprocess_book(df, misplaced_year_values=('DK Publishing Inc', 'Gallimard')):
    """Repair rows whose ``year``/``author`` values are swapped and make ``year`` numeric.

    Rows whose ``year`` cell holds one of ``misplaced_year_values`` have their
    ``year`` and ``author`` values exchanged. ``year`` is then cast to float
    and a year of 0 is replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``year`` and ``author`` columns. It is modified
        in place.
    misplaced_year_values : iterable of str, optional
        Values that mark a swapped row when found in the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.

    Raises
    ------
    ValueError
        If, after the swap, ``year`` still holds a value that cannot be
        converted to float.
    """
    # Build the mask exactly once, before anything is overwritten. Recomputing
    # it after `year` has been replaced would no longer select the affected
    # rows, and `author` would be left unrepaired.
    swapped = df['year'].isin(list(misplaced_year_values))

    if swapped.any():
        # Exchange both columns in one assignment. `.to_numpy()` strips the
        # column labels so values are assigned by position, not re-aligned by name.
        df.loc[swapped, ['year', 'author']] = df.loc[swapped, ['author', 'year']].to_numpy()

    df['year'] = df['year'].astype(float)
    # A year of 0 is treated as missing.
    df.loc[df['year'] == 0, 'year'] = np.nan
    return df

# Run the books clean-up step and rebind `books` to its result.
books = books.pipe(preprocess_book)

In [7]:
# Left-join the user table and then the book table onto each ratings frame.
# No `on=` is given, so pandas joins on the columns the two frames share.
train = pd.merge(pd.merge(train, users, how='left'), books, how='left')
test = pd.merge(pd.merge(test, users, how='left'), books, how='left')

In [15]:
# Work on a copy so `train` itself is left untouched.
train2 = train.copy()

# For each categorical column in turn, keep only the training rows whose value
# also occurs in the test set, renumbering the index after every pass.
for col in ['city', 'province', 'country', 'title', 'author', 'publisher']:
    train2 = train2[train2[col].isin(test[col].unique())].reset_index(drop=True)

In [22]:
# Distinct-value counts per categorical column: filtered train first, then test.
for col in ['city', 'province', 'country', 'title', 'author', 'publisher']:
    print(col, *(frame[col].nunique() for frame in (train2, test)))

city 6300 8448
province 746 1064
country 164 232
title 31408 65622
author 13818 32648
publisher 2825 6905


In [23]:
# Distinct-value counts per categorical column: unfiltered train first, then test.
for col in ['city', 'province', 'country', 'title', 'author', 'publisher']:
    print(col, *(frame[col].nunique() for frame in (train, test)))

city 10658 8448
province 1401 1064
country 310 232
title 101851 65622
author 48334 32648
publisher 9421 6905


In [16]:
# Preview the filtered training frame (the notebook renders a truncated table).
train2

Unnamed: 0,id,user_id,book_id,rating,age,city,province,country,title,author,year,publisher
0,9ecc1d4a2e9a7476,3db2595a13,58d33fe06a,8,41.0,romney,west virginia,usa,The No. 1 Ladies' Detective Agency (Today Show...,Alexander McCall Smith,2003.0,Anchor
1,617ef107e6ffed5a,f00ee6360d,98c1419160,9,,san diego,,usa,The Summons,John Grisham,2002.0,Dell Publishing Company
2,0e2be280941c4d1f,8b893fb104,99c56ce036,9,26.0,springfield,missouri,usa,"Starman (The Axis Trilogy, Bk 3)",Sara Douglass,2002.0,Tor Books
3,efbb30ccb29583c4,3ae810fa9d,e0aa3f0fc0,9,29.0,porto,porto,portugal,Celtic Fairy Tales,Joseph Jacobs,,Parragon
4,c95da7fa58219065,0405213304,9392a59e6a,4,,irvine,california,,The Reef,Nora Roberts,1999.0,Jove Books
...,...,...,...,...,...,...,...,...,...,...,...,...
155815,6dbfef61c2fc29fa,1a8a4f0b39,402a0ed40f,9,,providence,rhode island,usa,The Practical Encycopedia of Feng Shui,Gill Hale,2001.0,Hermes House
155816,0454a4c5989ef6c6,5238703487,c13d6aa85e,7,37.0,köln,nordrhein-westfalen,germany,"Fischer TaschenbÃ?Â¼cher, Bd.26, SchÃ?Â¶ne neu...",Aldous Huxley,2002.0,"Fischer (Tb.), Frankfurt"
155817,b9a8ff87e1cd3db7,239f28d3df,20a149064d,10,42.0,janesville,california,usa,Against the Odds,Elizabeth Moon,2001.0,Baen
155818,0204e100e468d273,605e3aa44a,3d7ad0d1d2,6,,cleveland,ohio,usa,The Last Family,John R. Miller,1997.0,Bantam Books


In [14]:
# Preview the filtered training frame.
# NOTE(review): the saved output under this cell has a non-contiguous index
# (1, 3, 4, 6, ...), so it appears to come from a run without the
# reset_index step — likely stale; confirm or drop this duplicate cell.
train2

Unnamed: 0,id,user_id,book_id,rating,age,city,province,country,title,author,year,publisher
1,9ecc1d4a2e9a7476,3db2595a13,58d33fe06a,8,41.0,romney,west virginia,usa,The No. 1 Ladies' Detective Agency (Today Show...,Alexander McCall Smith,2003.0,Anchor
3,617ef107e6ffed5a,f00ee6360d,98c1419160,9,,san diego,,usa,The Summons,John Grisham,2002.0,Dell Publishing Company
4,0e2be280941c4d1f,8b893fb104,99c56ce036,9,26.0,springfield,missouri,usa,"Starman (The Axis Trilogy, Bk 3)",Sara Douglass,2002.0,Tor Books
6,efbb30ccb29583c4,3ae810fa9d,e0aa3f0fc0,9,29.0,porto,porto,portugal,Celtic Fairy Tales,Joseph Jacobs,,Parragon
7,c95da7fa58219065,0405213304,9392a59e6a,4,,irvine,california,,The Reef,Nora Roberts,1999.0,Jove Books
...,...,...,...,...,...,...,...,...,...,...,...,...
249419,6dbfef61c2fc29fa,1a8a4f0b39,402a0ed40f,9,,providence,rhode island,usa,The Practical Encycopedia of Feng Shui,Gill Hale,2001.0,Hermes House
249420,0454a4c5989ef6c6,5238703487,c13d6aa85e,7,37.0,köln,nordrhein-westfalen,germany,"Fischer TaschenbÃ?Â¼cher, Bd.26, SchÃ?Â¶ne neu...",Aldous Huxley,2002.0,"Fischer (Tb.), Frankfurt"
249421,b9a8ff87e1cd3db7,239f28d3df,20a149064d,10,42.0,janesville,california,usa,Against the Odds,Elizabeth Moon,2001.0,Baen
249424,0204e100e468d273,605e3aa44a,3d7ad0d1d2,6,,cleveland,ohio,usa,The Last Family,John R. Miller,1997.0,Bantam Books


In [8]:
# Ask sweetviz to handle `rating` as a numeric feature.
feat_cfg = sv.FeatureConfig(force_num=['rating'])
# Comparison report of train against test, with `rating` as the target feature.
sv_report = sv.compare(source=train, compare=test, target_feat='rating', feat_cfg=feat_cfg)
# Write the HTML report to disk without opening a browser.
sv_report.show_html(filepath='train_test.html', open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report train_test.html was generated.


In [13]:
# Ask sweetviz to handle `rating` as a numeric feature.
feat_cfg = sv.FeatureConfig(force_num=['rating'])
# Comparison report of the filtered train2 against test, with `rating` as the target feature.
sv_report = sv.compare(source=train2, compare=test, target_feat='rating', feat_cfg=feat_cfg)
# Write the HTML report to disk without opening a browser.
sv_report.show_html(filepath='train2_test.html', open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report train2_test.html was generated.


In [4]:
# Single-frame sweetviz report for the users table, saved as HTML (no browser).
sv_report = sv.analyze(source=users)
sv_report.show_html(filepath='users.html', open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report users.html was generated.


In [31]:
# Single-frame sweetviz report for the books table, saved as HTML (no browser).
sv_report = sv.analyze(source=books)
sv_report.show_html(filepath='books.html', open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report books.html was generated.


In [6]:
# Single-frame sweetviz report for the sample submission, saved as HTML (no browser).
sv_report = sv.analyze(source=sample_submission)
sv_report.show_html(filepath='sample_submission.html', open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report sample_submission.html was generated.


In [None]:
# NOTE(review): unfinished cell that was never executed (In [None]). 'bo' is
# not among the `train` columns displayed elsewhere in this notebook (id,
# user_id, book_id, rating, ...), so this presumably raises as written;
# 'book_id' may have been intended — TODO confirm the intent or delete the cell.
train.pivot('user_id', 'bo')

In [11]:
# Number of distinct user_id values that appear in both train and test.
len(set(train['user_id']) & set(test['user_id']))

21568

In [7]:
# Preview the training ratings frame.
train

Unnamed: 0,id,user_id,book_id,rating
0,5c52cc76216e68f8,9db527ea34,69173ee3b6,5
1,9ecc1d4a2e9a7476,3db2595a13,58d33fe06a,8
2,953317c1edbe1e56,375781e597,512b5d69de,10
3,617ef107e6ffed5a,f00ee6360d,98c1419160,9
4,0e2be280941c4d1f,8b893fb104,99c56ce036,9
...,...,...,...,...
249421,b9a8ff87e1cd3db7,239f28d3df,20a149064d,10
249422,255e5b6e4ca5eba4,0e49ebfe6b,ca99a3d7b9,9
249423,aba4d6f633b5dfba,0341610337,1325010f75,7
249424,0204e100e468d273,605e3aa44a,3d7ad0d1d2,6


In [8]:
# Preview the test ratings frame.
test

Unnamed: 0,id,user_id,book_id
0,f38f854b31509133,8073cd1ef5,5d228c95dc
1,aadb34054d6c8a12,41d6240ef5,6bbcd07e36
2,87b031bdd906e26d,fcb9105c5c,4a88d029fa
3,371562c1c36bb8d5,942e3a5ae8,4127c00651
4,3ebb7df64a2896be,da3c9dfa9c,34de200846
...,...,...,...
134302,5ec870a616cf1532,598825e90a,29f4516f72
134303,0d6904a47e41a6c9,0e1ff052ae,d27721acf2
134304,0b0613b30831be62,0baec5a22d,121a671e06
134305,be05f60e761615a3,967874531a,c414ce067b


In [9]:
# Preview the users table.
users

Unnamed: 0,user_id,age,city,province,country
0,c4ca4238a0,,nyc,new york,usa
1,c81e728d9d,18.0,stockton,california,usa
2,eccbc87e4b,,moscow,yukon territory,russia
3,a87ff679a2,17.0,porto,v.n.gaia,portugal
4,e4da3b7fbb,,farnborough,hants,united kingdom
...,...,...,...,...,...
278853,df10cd0607,,portland,oregon,usa
278854,df99f8c814,50.0,tacoma,washington,united kingdom
278855,269ca8efe0,,brampton,ontario,canada
278856,c7a289a7da,,knoxville,tennessee,usa


In [25]:
# Preview the books table.
books

Unnamed: 0,book_id,title,author,year,publisher
0,8a2f2c390c,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,dce235e3f8,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,624ad92bef,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,9418f71f8f,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,a9908c598a,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271230,3f5d81ca15,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271231,97e8a7828f,From One to One Hundred,Teri Sloat,1991,Dutton Books
271232,7c672a518f,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271233,be6a2dcf58,Republic (World's Classics),Plato,1996,Oxford University Press


In [26]:
def is_int(s):
    """Return True if ``int(s)`` succeeds, otherwise False.

    Parameters
    ----------
    s : object
        Value to probe, e.g. a string or a number.

    Returns
    -------
    bool
        True when ``int(s)`` does not raise; False when it raises
        ``ValueError`` (e.g. non-numeric text, NaN), ``TypeError`` (e.g. None)
        or ``OverflowError`` (e.g. infinity).
    """
    try:
        int(s)
    # Catch only the errors int() raises for unconvertible input, so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [27]:
# List the entries of books['year'] for which is_int returns False.
books['year'].map(is_int).pipe(lambda ok: ok[~ok])

Series([], Name: year, dtype: bool)

In [23]:
# Look up three book rows by index label, showing all columns.
books.loc[[209445, 220634, 221580], :]

Unnamed: 0,book_id,title,author,year,publisher
209445,381071aedf,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...
220634,3be151c669,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...
221580,824afa3c7a,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...


In [11]:
# Preview the sample submission frame.
sample_submission

Unnamed: 0,id,rating
0,f38f854b31509133,1
1,aadb34054d6c8a12,1
2,87b031bdd906e26d,1
3,371562c1c36bb8d5,1
4,3ebb7df64a2896be,1
...,...,...
134302,5ec870a616cf1532,1
134303,0d6904a47e41a6c9,1
134304,0b0613b30831be62,1
134305,be05f60e761615a3,1
