In [1]:
import os
import sys
import datetime as DT
import pandas as pd
import numpy as np
# Directory holding the raw CSV inputs; change it here to relocate all three.
DATA_DIR = '../books_recommender/data/'
# Event log read by the loading cell (has event_time / receipt_time columns).
bookclub_events = os.path.join(DATA_DIR, 'bookclub_events.csv')
# Learner demographics (has a learner_birthday column).
demographics = os.path.join(DATA_DIR, 'bc.demographics.csv')
# Book metadata, joined to the events on BOOK_CODE.
meta_data_file = os.path.join(DATA_DIR, 'BOOKINFORMATION_META.csv')

In [3]:
# Load the raw event log, then narrow it step by step to de-duplicated,
# fully identified "close" events ordered by event time. The row count is
# printed after every step so the effect of each filter is visible.
print("Loading : {} ...".format(bookclub_events))
dataframe = pd.read_csv(bookclub_events, parse_dates=['event_time', 'receipt_time'])

# Keep only the first row for each row_id.
print("{:50s}   {}".format("Total No of records : ", dataframe.shape[0]))
dataframe = dataframe.drop_duplicates(subset=['row_id'])
print("{:50s}   {}".format("Removed Duplicate Row_ID, No of records : ", dataframe.shape[0]))

# Discard rows that carry no book code.
dataframe = dataframe.dropna(subset=['book_code'])
print("{:50s}   {}".format("Removed books with null id, No of Records : ", dataframe.shape[0]))

# Discard rows that carry no learner id.
dataframe = dataframe.dropna(subset=['learner_id'])
print("{:50s}   {}".format("Removed learners with null id, No of Records : ", dataframe.shape[0]))

# Open and close events do not pair up one-to-one, so only the close
# events are kept.
events_filter = dataframe['event_name'].isin(['book_close', 'video_close', 'audio_close'])
dataframe = dataframe.loc[events_filter]
print("{:50s}   {}".format("After filtering events for close events, No of Records : ", dataframe.shape[0]))

# Order the remaining rows by close-event time.
print("Sorting records by event time...")
dataframe = dataframe.sort_values(by='event_time')

Loading : ../books_recommender/data/bookclub_events.csv ...


  interactivity=interactivity, compiler=compiler, result=result)


Total No of records :                                23644617
Removed Duplicate Row_ID, No of records :            23644461
Removed books with null id, No of Records :          13507961
Removed learners with null id, No of Records :       13507961
After filtering events for close events, No of Records :    5566611
Sorting records by event time...


In [4]:
# Distinct learners and books left after the close-event filtering.
print("{:10} : {:20} : {}".format("Data", "No of learners", dataframe['learner_id'].unique().size))
print("{:10} : {:20} : {}".format("Data", "No of books", dataframe['book_code'].unique().size))

Data       : No of learners       : 243344
Data       : No of books          : 11490


In [5]:
# Number of distinct items each learner closed, per media type (event_name).
# The built-in 'nunique' aggregation replaces a Python lambda that was
# called once per group. Rows with a null book_code were already removed
# when the events were loaded, so the counts are unchanged.
item_type_stats = (
    dataframe.groupby(['learner_id', 'event_name'])
             .agg({'book_code': 'nunique'})
             .rename(columns={'book_code': 'no_of_items'})
             .reset_index()
)

In [8]:
# Preview the per-learner, per-media-type item counts.
item_type_stats.head(n=5)

Unnamed: 0,learner_id,event_name,no_of_items
0,5.0,book_close,2
1,12.0,book_close,10
2,12.0,video_close,7
3,17.0,book_close,2
4,17.0,video_close,7


In [9]:
# Total number of items per learner, summed across media types.
total_item_type_stats = (
    item_type_stats.groupby('learner_id')['no_of_items']
                   .sum()
                   .rename('total_no_of_items')
                   .reset_index()
)
total_item_type_stats.head()

Unnamed: 0,learner_id,total_no_of_items
0,5.0,2
1,12.0,17
2,17.0,9
3,31.0,1
4,41.0,1


In [10]:
# Attach each learner's total to their per-media-type rows, then express
# every media type as a fraction of that total.
learner_item_type_stats = pd.merge(item_type_stats, total_item_type_stats,
                                   how='inner', on='learner_id')
learner_item_type_stats['percentage'] = (
    learner_item_type_stats['no_of_items']
    .div(learner_item_type_stats['total_no_of_items'])
)
learner_item_type_stats.head()

Unnamed: 0,learner_id,event_name,no_of_items,total_no_of_items,percentage
0,5.0,book_close,2,2,1.0
1,12.0,book_close,10,17,0.588235
2,12.0,video_close,7,17,0.411765
3,17.0,book_close,2,9,0.222222
4,17.0,video_close,7,9,0.777778


In [11]:
# Number of close events recorded for every (learner, book) pair.
learner_books_df = (
    dataframe.groupby(['learner_id', 'book_code'])
             .size()
             .reset_index(name='events_count')
)
learner_books_df.head()

Unnamed: 0,learner_id,book_code,events_count
0,5.0,BO-20140726013700796,2
1,5.0,BO-20140726014014719,1
2,12.0,BO-20140726013738070,1
3,12.0,BO-20140726014932178,1
4,12.0,BO-20140726015056775,1


In [12]:
# Earliest close-event time for every (learner, book) pair.
first_closure_event_df = (
    dataframe.groupby(['learner_id', 'book_code'])['event_time']
             .min()
             .rename('first_access_time')
             .reset_index()
)
first_closure_event_df.head()

Unnamed: 0,learner_id,book_code,first_access_time
0,5.0,BO-20140726013700796,2017-09-19 15:37:10
1,5.0,BO-20140726014014719,2017-09-16 08:37:19
2,12.0,BO-20140726013738070,2017-09-24 08:22:14
3,12.0,BO-20140726014932178,2017-09-24 08:25:50
4,12.0,BO-20140726015056775,2017-09-26 09:27:27


In [13]:
# Combine the event count and the first close time of each (learner, book)
# pair into a single frame.
learner_books_first_closure_df = pd.merge(learner_books_df,
                                          first_closure_event_df,
                                          how='inner',
                                          on=['learner_id', 'book_code'])
learner_books_first_closure_df.head()

Unnamed: 0,learner_id,book_code,events_count,first_access_time
0,5.0,BO-20140726013700796,2,2017-09-19 15:37:10
1,5.0,BO-20140726014014719,1,2017-09-16 08:37:19
2,12.0,BO-20140726013738070,1,2017-09-24 08:22:14
3,12.0,BO-20140726014932178,1,2017-09-24 08:25:50
4,12.0,BO-20140726015056775,1,2017-09-26 09:27:27


In [16]:
# Distinct learners and books in the merged (learner, book) frame.
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_books_first_closure_df['learner_id'].unique().size))
print("{:10} : {:20} : {}".format("Data", "No of books", learner_books_first_closure_df['book_code'].unique().size))

Data       : No of learners       : 243344
Data       : No of books          : 11490


In [14]:
# Load learner demographics, derive a date of birth and an age in whole
# years, and attach them to the per-media-type statistics.
print("Loading : {} ...".format(demographics))
demograph = pd.read_csv(demographics,
                        parse_dates=['learner_birthday'])

# NOTE(review): age is measured against the moment this cell runs, so it
# changes from run to run - confirm that this is intended.
now = pd.Timestamp(DT.datetime.now())

# NOTE(review): learner_birthday is already parsed by parse_dates above, so
# `format` is presumably ignored here - confirm against the raw file.
demograph['dob'] = pd.to_datetime(
    demograph['learner_birthday'], format='%m%d%y')

# A birth date that is not in the past is moved back by one century.
# DateOffset shifts by exactly 100 calendar years; the previous
# np.timedelta64(100, 'Y') relied on the ambiguous year unit.
demograph['dob'] = demograph['dob'].where(demograph['dob'] < now,
                                          demograph['dob'] - pd.DateOffset(years=100))

# Whole years elapsed, as a float column (NaN where dob is missing).
# 365.2425 days is the length NumPy assigns to the 'Y' timedelta unit, so
# this matches the former .astype('<m8[Y]') without needing that unit.
demograph['age'] = np.floor(
    (now - demograph['dob']) / pd.Timedelta(days=365.2425))

# Default (inner) merge: learners without a demographics row are dropped.
learner_age_item_type_stats = learner_item_type_stats.merge(demograph,
                                                            on='learner_id')
learner_age_item_type_stats.head()

Loading : ../books_recommender/data/bc.demographics.csv ...


Unnamed: 0,learner_id,event_name,no_of_items,total_no_of_items,percentage,learner_birthday,learner_gender,dob,age
0,5.0,book_close,2,2,1.0,2003-06-02 15:00:00,female,2003-06-02 15:00:00,14.0
1,31.0,book_close,1,1,1.0,2005-10-18 15:00:00,male,2005-10-18 15:00:00,12.0
2,41.0,video_close,1,1,1.0,2005-02-13 15:00:00,female,2005-02-13 15:00:00,12.0
3,61.0,book_close,22,24,0.916667,1999-10-04 15:00:00,female,1999-10-04 15:00:00,18.0
4,61.0,video_close,2,24,0.083333,1999-10-04 15:00:00,female,1999-10-04 15:00:00,18.0


In [18]:
# Distinct learners that remain after the demographics merge. Only learners
# are counted: this frame has one row per (learner, media type) and, per the
# saved head() output, no book_code column.
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_age_item_type_stats['learner_id'].unique().size))

Data       : No of learners       : 89519


In [20]:
# Reshape the per-(learner, media type) rows into one row per learner with
# a share for each media type plus the learner's age. A media type the
# learner has no row for keeps its 0.0 default.
all_learners = set(learner_age_item_type_stats['learner_id'].unique())
media_preference = {
    learner_id: {'book_close': 0.0,
                 'audio_close': 0.0,
                 'video_close': 0.0,
                 'age': 0}
    for learner_id in all_learners
}

# Walk the four needed columns in parallel instead of calling iterrows(),
# which builds a full Series object for every row.
for learner_id, event_name, preference, learner_age in zip(
        learner_age_item_type_stats['learner_id'],
        learner_age_item_type_stats['event_name'],
        learner_age_item_type_stats['percentage'],
        learner_age_item_type_stats['age']):
    media_preference[learner_id][event_name] = preference
    media_preference[learner_id]['age'] = learner_age

# One flat record per learner, in the same order as media_preference.
list_of_measures = [
    {'learner_id': learner_id,
     'age': measures['age'],
     'book_close': measures['book_close'],
     'audio_close': measures['audio_close'],
     'video_close': measures['video_close'],
     }
    for learner_id, measures in media_preference.items()
]
learner_measures_df = pd.DataFrame(list_of_measures).set_index('learner_id')
learner_measures_df.head()

Unnamed: 0_level_0,age,audio_close,book_close,video_close
learner_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5.0,14.0,0.0,1.0,0.0
524293.0,8.0,0.0,0.473684,0.526316
262153.0,10.0,0.0,1.0,0.0
524302.0,10.0,0.0,0.285714,0.714286
262164.0,10.0,0.0,1.0,0.0


In [22]:
# One row per learner, so the row count is the learner count.
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_measures_df.shape[0]))

Data       : No of learners       : 89519


In [23]:
# Attach demographics to every (learner, book) row. The inner join drops
# learners that have no demographics row.
learner_books_info_df = learner_books_first_closure_df.merge(demograph,
                                                             on='learner_id',
                                                             how='inner')
learner_books_info_df.head()

Unnamed: 0,learner_id,book_code,events_count,first_access_time,learner_birthday,learner_gender,dob,age
0,5.0,BO-20140726013700796,2,2017-09-19 15:37:10,2003-06-02 15:00:00,female,2003-06-02 15:00:00,14.0
1,5.0,BO-20140726014014719,1,2017-09-16 08:37:19,2003-06-02 15:00:00,female,2003-06-02 15:00:00,14.0
2,31.0,BO-20161128155737057,1,2017-09-15 10:11:00,2005-10-18 15:00:00,male,2005-10-18 15:00:00,12.0
3,41.0,BO-20170821132107285,1,2017-09-21 12:01:22,2005-02-13 15:00:00,female,2005-02-13 15:00:00,12.0
4,61.0,BO-20140726013720440,1,2017-09-21 22:36:48,1999-10-04 15:00:00,female,1999-10-04 15:00:00,18.0


In [24]:
# Distinct learners and books after the demographics join.
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_books_info_df['learner_id'].unique().size))
print("{:10} : {:20} : {}".format("Data", "No of books", learner_books_info_df['book_code'].unique().size))

Data       : No of learners       : 89519
Data       : No of books          : 10564


In [25]:
# Load the book metadata and attach it to every (learner, book) row.
print("Loading : {} ...".format(meta_data_file))
# NOTE(review): the saved output shows a warning while reading this file,
# presumably a DtypeWarning for mixed-type columns - consider passing an
# explicit dtype= once the schema is confirmed.
metadata = pd.read_csv(meta_data_file)
# Inner join: books without a metadata row are dropped.
# NOTE(review): assumes BOOK_CODE is unique in metadata; duplicate codes
# would multiply the (learner, book) rows - TODO confirm.
learner_books_info_meta_df = learner_books_info_df.merge(metadata,
                                                         left_on='book_code',
                                                         right_on='BOOK_CODE',
                                                         how='inner')
learner_books_info_meta_df.head()

Loading : ../books_recommender/data/BOOKINFORMATION_META.csv ...


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,learner_id,book_code,events_count,first_access_time,learner_birthday,learner_gender,dob,age,Unnamed: 0_BM,BOOK_META_CODE,...,VALID_TYPE,VALID_EDTIME,MYTODAY_EXBT_CODE,JSON_DATA,SUB_BOOK_CODE,CRE_DTIME_BI,UPD_DTIME_BI,CRE_ID_BI,UPD_ID_BI,PAY_GROUP_CODE
0,5.0,BO-20140726013700796,2,2017-09-19 15:37:10,2003-06-02 15:00:00,female,2003-06-02 15:00:00,14.0,775,BM-20140726021932244,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
1,17374.0,BO-20140726013700796,3,2017-09-20 06:31:12,2006-01-26 15:00:00,female,2006-01-26 15:00:00,11.0,775,BM-20140726021932244,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
2,20350.0,BO-20140726013700796,1,2017-09-22 08:29:48,2006-06-13 15:00:00,male,2006-06-13 15:00:00,11.0,775,BM-20140726021932244,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
3,20651.0,BO-20140726013700796,1,2017-09-26 11:49:25,2003-06-04 15:00:00,male,2003-06-04 15:00:00,14.0,775,BM-20140726021932244,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
4,21795.0,BO-20140726013700796,1,2017-09-26 11:43:09,2006-11-29 15:00:00,female,2006-11-29 15:00:00,11.0,775,BM-20140726021932244,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001


In [70]:
# Distinct learners and books after the metadata join.
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_books_info_meta_df['learner_id'].unique().size))
print("{:10} : {:20} : {}".format("Data", "No of books", learner_books_info_meta_df['book_code'].unique().size))

Data       : No of learners       : 85047
Data       : No of books          : 9093


In [32]:
# Cross-check: the long-format stats and the per-learner measures frame
# should report the same number of learners.
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_age_item_type_stats['learner_id'].unique().size))
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_measures_df.shape[0]))

Data       : No of learners       : 89519
Data       : No of learners       : 89519


In [35]:
# Summary statistics of learner age across all (learner, book) rows.
learner_books_info_meta_df.loc[:, 'age'].describe()

count    614897.000000
mean          8.343575
std           2.503355
min           0.000000
25%           7.000000
50%           8.000000
75%           9.000000
max         118.000000
Name: age, dtype: float64

In [71]:
# Learner-level train/test split over all (learner, book) rows, restricted
# to learners aged 5-20 who have at least `min_no_of_books` rows.
print("{:10} : {:20} : {}".format("Data", "No of learners", len(learner_books_info_meta_df['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books", len(learner_books_info_meta_df['book_code'].unique())))

# Keep ages in [5, 20], both ends inclusive; rows with a missing age drop out.
learner_books_info_meta_df1 = learner_books_info_meta_df[
    learner_books_info_meta_df['age'].between(5.0, 20.0)]
print("{:10} : {:20} : {}".format("Data", "No of learners", len(learner_books_info_meta_df1['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books", len(learner_books_info_meta_df1['book_code'].unique())))

# Number of rows (book_code entries) per learner.
user_items_df = learner_books_info_meta_df1.groupby(['learner_id'])\
                                 .agg({'book_code': 'count'})\
                                 .rename(columns={'book_code': 'no_of_books'})
print(user_items_df['no_of_books'].describe())

# Keep learners with enough rows. reset_index() returns a new frame
# instead of mutating a filtered slice in place.
min_no_of_books = 20
user_min_items_df = user_items_df[user_items_df['no_of_books'] >= min_no_of_books].reset_index()
print("{:10} : {:20} : {}".format("Data", "No of learners", len(user_min_items_df['learner_id'].unique())))

learners = user_min_items_df['learner_id'].unique()
data = learner_books_info_meta_df1[learner_books_info_meta_df1['learner_id'].isin(learners)]
print()
print("{:10} : {:20} : {}".format("Data", "No of records", len(data)))
print("{:10} : {:20} : {}".format("Data", "No of learners",
                                  len(data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books",
                                  len(data['book_code'].unique())))
no_of_learners = len(learners)
test_size = 0.2
no_of_test_learners = int(no_of_learners * test_size)

# Seeded generator: the same learners land in the test set on every run,
# which an unseeded np.random.choice does not guarantee.
split_seed = 42
split_rng = np.random.RandomState(split_seed)
learners_set = set(learners)
test_learners_set = set(split_rng.choice(learners, no_of_test_learners, replace=False))
train_learners_set = learners_set - test_learners_set
# Sanity check; empty by construction of the set difference above.
common_learners = train_learners_set & test_learners_set
print("No of learners : {}".format(len(learners_set)))
print("No of train learners : {}".format(len(train_learners_set)))
print("No of test learners : {}".format(len(test_learners_set)))
print("No of common learners : {}".format(len(common_learners)))

# `data` already holds exactly the eligible learners' rows.
test_data = data[data['learner_id'].isin(test_learners_set)]
train_data = data[data['learner_id'].isin(train_learners_set)]

common_learners = set(train_data['learner_id'].unique()) & set(test_data['learner_id'].unique())
common_books = set(train_data['book_code'].unique()) & set(test_data['book_code'].unique())

print()
print("{:10} : {:20} : {}".format("Train Data", "No of records", len(train_data)))
print("{:10} : {:20} : {}".format("Train Data", "No of learners",
                                  len(train_data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Train Data", "No of books",
                                  len(train_data['book_code'].unique())))
print()
print("{:10} : {:20} : {}".format("Test Data", "No of records", len(test_data)))
print("{:10} : {:20} : {}".format("Test Data", "No of learners",
                                  len(test_data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Test Data", "No of books",
                                  len(test_data['book_code'].unique())))
print()
print("{:10} : {:20} : {}".format("Common ", "No of learners", len(common_learners)))
print("{:10} : {:20} : {}".format("Common ", "No of books", len(common_books)))

# Distribution of rows per learner inside the training set.
see = train_data.groupby(['learner_id'])\
          .agg({'book_code': 'count'})\
          .rename(columns={'book_code' : 'no_of_books'})\
          .reset_index()
print(see['no_of_books'].describe())

Data       : No of learners       : 85047
Data       : No of books          : 9093
Data       : No of learners       : 82104
Data       : No of books          : 9061
count    82104.000000
mean         7.270645
std         12.519860
min          1.000000
25%          1.000000
50%          3.000000
75%          7.000000
max        322.000000
Name: no_of_books, dtype: float64
Data       : No of learners       : 6869

Data       : No of records        : 269970
Data       : No of learners       : 6869
Data       : No of books          : 8523
No of learners : 6869
No of train learners : 5496
No of test learners : 1373
No of common learners : 0

Train Data : No of records        : 216811
Train Data : No of learners       : 5496
Train Data : No of books          : 8279

Test Data  : No of records        : 53159
Test Data  : No of learners       : 1373
Test Data  : No of books          : 6105

Common     : No of learners       : 0
Common     : No of books          : 5861
count    5496.000000
me

In [74]:
# Summary statistics of close events per (learner, book) row.
learner_books_info_meta_df.loc[:, 'events_count'].describe()

count    614897.000000
mean          1.515309
std           1.621358
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         299.000000
Name: events_count, dtype: float64

In [79]:
# Learner-level train/test split over (learner, book) rows with at least
# 3 close events, restricted to learners aged 5-20 who have at least
# `min_no_of_books` such rows.
print("{:10} : {:20} : {}".format("Data", "No of learners", len(learner_books_info_meta_df['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books", len(learner_books_info_meta_df['book_code'].unique())))

# Keep only pairs the learner closed at least three times.
learner_books_info_min_3_events_df = learner_books_info_meta_df[
        learner_books_info_meta_df['events_count'] >= 3]
print("{:10} : {:20} : {}".format("Data", "No of learners", len(learner_books_info_min_3_events_df['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books", len(learner_books_info_min_3_events_df['book_code'].unique())))

# Keep ages in [5, 20], both ends inclusive; rows with a missing age drop out.
learner_books_info_meta_df1 = learner_books_info_min_3_events_df[
    learner_books_info_min_3_events_df['age'].between(5.0, 20.0)]
print("{:10} : {:20} : {}".format("Data", "No of learners", len(learner_books_info_meta_df1['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books", len(learner_books_info_meta_df1['book_code'].unique())))

# Number of rows (book_code entries) per learner.
user_items_df = learner_books_info_meta_df1.groupby(['learner_id'])\
                                 .agg({'book_code': 'count'})\
                                 .rename(columns={'book_code': 'no_of_books'})
print(user_items_df['no_of_books'].describe())

# Keep learners with enough rows. reset_index() returns a new frame
# instead of mutating a filtered slice in place.
min_no_of_books = 20
user_min_items_df = user_items_df[user_items_df['no_of_books'] >= min_no_of_books].reset_index()
print("{:10} : {:20} : {}".format("Data", "No of learners", len(user_min_items_df['learner_id'].unique())))

learners = user_min_items_df['learner_id'].unique()
data = learner_books_info_meta_df1[learner_books_info_meta_df1['learner_id'].isin(learners)]
print()
print("{:10} : {:20} : {}".format("Data", "No of records", len(data)))
print("{:10} : {:20} : {}".format("Data", "No of learners",
                                  len(data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books",
                                  len(data['book_code'].unique())))
no_of_learners = len(learners)
test_size = 0.3
no_of_test_learners = int(no_of_learners * test_size)

# Seeded generator: the same learners land in the test set on every run,
# which an unseeded np.random.choice does not guarantee.
split_seed = 42
split_rng = np.random.RandomState(split_seed)
learners_set = set(learners)
test_learners_set = set(split_rng.choice(learners, no_of_test_learners, replace=False))
train_learners_set = learners_set - test_learners_set
# Sanity check; empty by construction of the set difference above.
common_learners = train_learners_set & test_learners_set
print("No of learners : {}".format(len(learners_set)))
print("No of train learners : {}".format(len(train_learners_set)))
print("No of test learners : {}".format(len(test_learners_set)))
print("No of common learners : {}".format(len(common_learners)))

# `data` already holds exactly the eligible learners' rows.
test_data = data[data['learner_id'].isin(test_learners_set)]
train_data = data[data['learner_id'].isin(train_learners_set)]

common_learners = set(train_data['learner_id'].unique()) & set(test_data['learner_id'].unique())
common_books = set(train_data['book_code'].unique()) & set(test_data['book_code'].unique())

print()
print("{:10} : {:20} : {}".format("Train Data", "No of records", len(train_data)))
print("{:10} : {:20} : {}".format("Train Data", "No of learners",
                                  len(train_data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Train Data", "No of books",
                                  len(train_data['book_code'].unique())))
print()
print("{:10} : {:20} : {}".format("Test Data", "No of records", len(test_data)))
print("{:10} : {:20} : {}".format("Test Data", "No of learners",
                                  len(test_data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Test Data", "No of books",
                                  len(test_data['book_code'].unique())))
print()
print("{:10} : {:20} : {}".format("Common ", "No of learners", len(common_learners)))
print("{:10} : {:20} : {}".format("Common ", "No of books", len(common_books)))

# Distribution of rows per learner inside the training set.
see = train_data.groupby(['learner_id'])\
          .agg({'book_code': 'count'})\
          .rename(columns={'book_code' : 'no_of_books'})\
          .reset_index()
see['no_of_books'].describe()

Data       : No of learners       : 85047
Data       : No of books          : 9093
Data       : No of learners       : 22435
Data       : No of books          : 5203
Data       : No of learners       : 21802
Data       : No of books          : 5156
count    21802.000000
mean         2.825521
std          4.817662
min          1.000000
25%          1.000000
50%          1.000000
75%          3.000000
max        135.000000
Name: no_of_books, dtype: float64
Data       : No of learners       : 326

Data       : No of records        : 10529
Data       : No of learners       : 326
Data       : No of books          : 2426
No of learners : 326
No of train learners : 229
No of test learners : 97
No of common learners : 0

Train Data : No of records        : 7199
Train Data : No of learners       : 229
Train Data : No of books          : 2139

Test Data  : No of records        : 3330
Test Data  : No of learners       : 97
Test Data  : No of books          : 1292

Common     : No of learners     

count    229.000000
mean      31.436681
std       13.708483
min       20.000000
25%       22.000000
50%       26.000000
75%       36.000000
max      135.000000
Name: no_of_books, dtype: float64

In [77]:
# Write the preprocessed frames to CSV under preprocessed_data_dir.
preprocessed_data_dir = '../books_recommender/preprocessed_metadata'
# Create the output directory up front so to_csv() does not fail on a
# fresh checkout; exist_ok makes re-running this cell harmless.
os.makedirs(preprocessed_data_dir, exist_ok=True)

learner_age_item_type_file = os.path.join(preprocessed_data_dir,
                                          'learner_age_item_type_stats.csv')
learner_age_item_type_stats.to_csv(learner_age_item_type_file, index=False)

# learner_id is this frame's index, so the index is written as well.
learner_measures_file = os.path.join(preprocessed_data_dir,
                                     'learner_measures.csv')
learner_measures_df.to_csv(learner_measures_file)

learner_books_info_file = os.path.join(preprocessed_data_dir,
                                       'learner_books_info_close_events.csv')
learner_books_info_meta_df.to_csv(learner_books_info_file, index=False)

learner_books_info_min_3_file = os.path.join(preprocessed_data_dir,
                                             'learner_books_info_close_min_3_events.csv')
learner_books_info_min_3_events_df.to_csv(learner_books_info_min_3_file, index=False)

# Build the >= 10 events subset here, with the same filter the later split
# cell uses. Otherwise this cell depends on a frame that is only defined
# further down the notebook and fails when cells run top to bottom.
learner_books_info_min_10_events_df = learner_books_info_meta_df[
    learner_books_info_meta_df['events_count'] >= 10]
learner_books_info_min_10_file = os.path.join(preprocessed_data_dir,
                                              'learner_books_info_close_min_10_events.csv')
learner_books_info_min_10_events_df.to_csv(learner_books_info_min_10_file, index=False)

In [78]:
# Distinct learners and books among (learner, book) rows with >= 3 close events.
print("{:10} : {:20} : {}".format("Data", "No of learners", learner_books_info_min_3_events_df['learner_id'].unique().size))
print("{:10} : {:20} : {}".format("Data", "No of books", learner_books_info_min_3_events_df['book_code'].unique().size))

Data       : No of learners       : 22435
Data       : No of books          : 5203


In [69]:
# Learner-level train/test split over (learner, book) rows with at least
# 10 close events, restricted to learners aged 5-20 who have at least
# `min_no_of_books` such rows.
learner_books_info_min_10_events_df = learner_books_info_meta_df[
        learner_books_info_meta_df['events_count'] >= 10]
print("{:10} : {:20} : {}".format("Data", "No of learners", len(learner_books_info_min_10_events_df['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books", len(learner_books_info_min_10_events_df['book_code'].unique())))

# Keep ages in [5, 20], both ends inclusive; rows with a missing age drop out.
learner_books_info_meta_df1 = learner_books_info_min_10_events_df[
    learner_books_info_min_10_events_df['age'].between(5.0, 20.0)]
print("{:10} : {:20} : {}".format("Data", "No of learners", len(learner_books_info_meta_df1['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Data", "No of books", len(learner_books_info_meta_df1['book_code'].unique())))

# Number of rows (book_code entries) per learner.
user_items_df = learner_books_info_meta_df1.groupby(['learner_id'])\
                                 .agg({'book_code': 'count'})\
                                 .rename(columns={'book_code': 'no_of_books'})
print(user_items_df['no_of_books'].describe())

# Keep learners with enough rows. reset_index() returns a new frame
# instead of mutating a filtered slice in place.
min_no_of_books = 20
user_min_items_df = user_items_df[user_items_df['no_of_books'] >= min_no_of_books].reset_index()
print("{:10} : {:20} : {}".format("Data", "No of learners", len(user_min_items_df['learner_id'].unique())))

learners = user_min_items_df['learner_id'].unique()
no_of_learners = len(learners)
test_size = 0.2
no_of_test_learners = int(no_of_learners * test_size)

# Seeded generator: the same learners land in the test set on every run,
# which an unseeded np.random.choice does not guarantee.
split_seed = 42
split_rng = np.random.RandomState(split_seed)
learners_set = set(learners)
test_learners_set = set(split_rng.choice(learners, no_of_test_learners, replace=False))
train_learners_set = learners_set - test_learners_set
# Sanity check; empty by construction of the set difference above.
common_learners = train_learners_set & test_learners_set
print("No of learners : {}".format(len(learners_set)))
print("No of train learners : {}".format(len(train_learners_set)))
print("No of test learners : {}".format(len(test_learners_set)))
print("No of common learners : {}".format(len(common_learners)))

test_data = learner_books_info_meta_df1[learner_books_info_meta_df1['learner_id'].isin(test_learners_set)]
train_data = learner_books_info_meta_df1[learner_books_info_meta_df1['learner_id'].isin(train_learners_set)]

common_learners = set(train_data['learner_id'].unique()) & set(test_data['learner_id'].unique())
common_books = set(train_data['book_code'].unique()) & set(test_data['book_code'].unique())

print()
print("{:10} : {:20} : {}".format("Train Data", "No of records", len(train_data)))
print("{:10} : {:20} : {}".format("Train Data", "No of learners",
                                  len(train_data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Train Data", "No of books",
                                  len(train_data['book_code'].unique())))
print()
print("{:10} : {:20} : {}".format("Test Data", "No of records", len(test_data)))
print("{:10} : {:20} : {}".format("Test Data", "No of learners",
                                  len(test_data['learner_id'].unique())))
print("{:10} : {:20} : {}".format("Test Data", "No of books",
                                  len(test_data['book_code'].unique())))
print()
print("{:10} : {:20} : {}".format("Common ", "No of learners", len(common_learners)))
print("{:10} : {:20} : {}".format("Common ", "No of books", len(common_books)))

# Distribution of rows per learner inside the training set.
see = train_data.groupby(['learner_id'])\
          .agg({'book_code': 'count'})\
          .rename(columns={'book_code' : 'no_of_books'})\
          .reset_index()
see['no_of_books'].describe()

Data       : No of learners       : 1185
Data       : No of books          : 1056
Data       : No of learners       : 1150
Data       : No of books          : 1031
count    1150.000000
mean        2.130435
std         4.394240
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        97.000000
Name: no_of_books, dtype: float64
Data       : No of learners       : 11
No of learners : 11
No of train learners : 9
No of test learners : 2
No of common learners : 0

Train Data : No of records        : 334
Train Data : No of learners       : 9
Train Data : No of books          : 220

Test Data  : No of records        : 72
Test Data  : No of learners       : 2
Test Data  : No of books          : 56

Common     : No of learners       : 0
Common     : No of books          : 33


count     9.000000
mean     37.111111
std      23.687784
min      20.000000
25%      24.000000
50%      29.000000
75%      41.000000
max      97.000000
Name: no_of_books, dtype: float64