In [1]:
import os
import sys
import pandas as pd
import numpy as np
import datetime as DT

In [2]:
# Load the raw book-club event log; the two timestamp columns are parsed
# into datetimes while reading.
bookclub_events = os.path.join('data/', 'bookclub_events.csv')
# low_memory=False makes pandas infer each column's dtype once over the
# whole file instead of chunk by chunk. Chunked inference is what produces
# pandas' "mixed types" DtypeWarning (presumably the warning whose tail is
# captured under this cell - confirm) and can leave a column holding a mix
# of Python types. The trade-off is a higher peak memory use while parsing.
dataframe = pd.read_csv(bookclub_events,
                        parse_dates=['event_time', 'receipt_time'],
                        low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Report the starting size, then keep only the first row for every row_id.
print("No of records : ", len(dataframe))
dataframe = dataframe.drop_duplicates(subset=['row_id'])
print("Removed Duplicate Row_ID, No of records : ", len(dataframe))

# Discard events that carry no book identifier.
dataframe = dataframe.dropna(subset=['book_code'])
print("Removed books with null id, No of Records : ", len(dataframe))

# Discard events that carry no learner identifier.
dataframe = dataframe.dropna(subset=['learner_id'])
print("Removed learners with null id, No of Records : ", len(dataframe))

# The numbers of open and close events are unequal, so only the close
# events (book, video and audio) are kept.
events_filter = dataframe['event_name'].isin(
    ['book_close', 'video_close', 'audio_close'])
dataframe = dataframe.loc[events_filter]
print("After filtering events for close events, No of Records : ", len(dataframe))

No of records :  23644617
Removed Duplicate Row_ID, No of records :  23644461
Removed books with null id, No of Records :  13507961
Removed learners with null id, No of Records :  13507961
After filtering events for close events, No of Records :  5566611


In [5]:
# Count the remaining (close) events for every learner/book pair; the count
# is stored in the 'events_count' column.
learner_books_df = (
    dataframe
    .groupby(['learner_id', 'book_code'])
    .size()
    .reset_index(name='events_count')
)

In [11]:
# Load learner demographics, derive each learner's age in whole years, and
# attach the demographics to the per-learner/per-book event counts.
demographics = os.path.join('data/', 'bc.demographics.csv')
demograph = pd.read_csv(demographics,
                        parse_dates=['learner_birthday'])

# Ages are measured against the wall-clock time of this run, so the 'age'
# column depends on when the notebook is executed.
now = pd.Timestamp(DT.datetime.now())

# NOTE(review): read_csv above is already asked to parse this column; if it
# arrives here as datetime64 the `format` argument has no effect - confirm
# against the raw file before relying on '%m%d%y'.
demograph['dob'] = pd.to_datetime(demograph['learner_birthday'], format='%m%d%y')

# Birthdays that are not in the past are moved back by one century
# (presumably to undo a two-digit year expanded into the wrong century -
# confirm). pd.DateOffset shifts by calendar years; the previous
# np.timedelta64(100, 'Y') is a fixed-length duration that does not land on
# the same calendar day and is rejected by newer pandas releases.
demograph['dob'] = demograph['dob'].where(
    demograph['dob'] < now,
    demograph['dob'] - pd.DateOffset(years=100))

# Age = whole calendar years completed as of `now`. This replaces
# `.astype('<m8[Y]')`, which newer pandas releases no longer support.
# The result stays float so that a missing birthday yields NaN.
birth_dates = demograph['dob']
birthday_pending = (
    (birth_dates.dt.month > now.month)
    | ((birth_dates.dt.month == now.month) & (birth_dates.dt.day > now.day))
)
demograph['age'] = (
    now.year - birth_dates.dt.year - birthday_pending.astype(int)
).astype(float)

# Inner join: learners without a demographics row are dropped.
learner_books_info_df = pd.merge(learner_books_df, demograph, how='inner', on='learner_id')

In [12]:
# Preview the first five learner/book rows after joining demographics.
learner_books_info_df.head(n=5)

Unnamed: 0,learner_id,book_code,events_count,learner_birthday,learner_gender,dob,age
0,5.0,BO-20140726013700796,2,2003-06-02 15:00:00,female,2003-06-02 15:00:00,14.0
1,5.0,BO-20140726014014719,1,2003-06-02 15:00:00,female,2003-06-02 15:00:00,14.0
2,31.0,BO-20161128155737057,1,2005-10-18 15:00:00,male,2005-10-18 15:00:00,12.0
3,41.0,BO-20170821132107285,1,2005-02-13 15:00:00,female,2005-02-13 15:00:00,12.0
4,61.0,BO-20140726013720440,1,1999-10-04 15:00:00,female,1999-10-04 15:00:00,18.0


In [13]:
# Load the book metadata table and attach it to every learner/book row.
meta_data_file = 'BOOKINFORMATION_META.csv'
# low_memory=False makes pandas infer each column's dtype once over the
# whole file instead of chunk by chunk. Chunked inference is what produces
# pandas' "mixed types" DtypeWarning (presumably the warning whose tail is
# captured under this cell - confirm) and can leave a column holding a mix
# of Python types.
metadata = pd.read_csv(meta_data_file, low_memory=False)
# Inner join on the book identifier: rows whose book_code has no matching
# BOOK_CODE in the metadata are dropped. Both key columns are kept.
learner_books_info_meta_df = pd.merge(learner_books_info_df, metadata, how='inner',
                                      left_on='book_code', right_on='BOOK_CODE')

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Preview the first five rows of the learner/book table enriched with metadata.
learner_books_info_meta_df.head(n=5)

Unnamed: 0,learner_id,book_code,events_count,learner_birthday,learner_gender,dob,age,Unnamed: 0_BM,BOOK_META_CODE,BOOK_NAME_BM,...,VALID_TYPE,VALID_EDTIME,MYTODAY_EXBT_CODE,JSON_DATA,SUB_BOOK_CODE,CRE_DTIME_BI,UPD_DTIME_BI,CRE_ID_BI,UPD_ID_BI,PAY_GROUP_CODE
0,5.0,BO-20140726013700796,2,2003-06-02 15:00:00,female,2003-06-02 15:00:00,14.0,775,BM-20140726021932244,고려시대: 후삼국에서 고려로,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
1,17374.0,BO-20140726013700796,3,2006-01-26 15:00:00,female,2006-01-26 15:00:00,11.0,775,BM-20140726021932244,고려시대: 후삼국에서 고려로,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
2,20350.0,BO-20140726013700796,1,2006-06-13 15:00:00,male,2006-06-13 15:00:00,11.0,775,BM-20140726021932244,고려시대: 후삼국에서 고려로,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
3,20651.0,BO-20140726013700796,1,2003-06-04 15:00:00,male,2003-06-04 15:00:00,14.0,775,BM-20140726021932244,고려시대: 후삼국에서 고려로,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001
4,21795.0,BO-20140726013700796,1,2006-11-29 15:00:00,female,2006-11-29 15:00:00,10.0,775,BM-20140726021932244,고려시대: 후삼국에서 고려로,...,,,,"{""VIEWMODEINFO"":""VS"",""AUDIOBOOKTP"":""E"",""VIEWFU...",,2014-07-26 2:29:52,2017-04-05 14:38:34,BULK,ME-20160527174138665,PG001


In [15]:
# Sanity check on the merge: select any rows where the two join-key columns
# disagree.
learner_books_info_meta_df.loc[
    lambda merged: merged['book_code'] != merged['BOOK_CODE']
]

Unnamed: 0,learner_id,book_code,events_count,learner_birthday,learner_gender,dob,age,Unnamed: 0_BM,BOOK_META_CODE,BOOK_NAME_BM,...,VALID_TYPE,VALID_EDTIME,MYTODAY_EXBT_CODE,JSON_DATA,SUB_BOOK_CODE,CRE_DTIME_BI,UPD_DTIME_BI,CRE_ID_BI,UPD_ID_BI,PAY_GROUP_CODE


In [17]:
# Base directory under which the preprocessed output folder is created.
# The original value was a machine-specific absolute path; it is kept as the
# default so existing runs behave the same, but it can now be overridden by
# setting the BOOKS_RECOMMENDER_BASE_DIR environment variable.
current_dir = os.environ.get(
    'BOOKS_RECOMMENDER_BASE_DIR',
    os.path.dirname('/home/rraju/recommender_system/books_recommender'))

In [18]:
# Write the merged learner/book table, plus two subsets restricted to pairs
# with at least 3 and at least 10 events, as CSV files.

# `__file__` only exists when this code runs as a script. In a notebook
# kernel it is undefined and the unconditional lookup raised NameError, so
# there the `current_dir` assigned in the preceding cell is used unchanged.
if '__file__' in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
preprocessed_data_dir = os.path.join(current_dir, 'preprocessed_metadata')
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(preprocessed_data_dir, exist_ok=True)

# Full table.
learner_books_info_file = os.path.join(preprocessed_data_dir, 'learner_books_info_close_events.csv')
learner_books_info_meta_df.to_csv(learner_books_info_file, index=False)

# Learner/book pairs with at least 3 events.
learner_books_info_min_3_events_df = learner_books_info_meta_df[learner_books_info_meta_df['events_count'] >= 3]
learner_books_info_min_3_file = os.path.join(preprocessed_data_dir, 'learner_books_info_close_min_3_events.csv')
learner_books_info_min_3_events_df.to_csv(learner_books_info_min_3_file, index=False)

# Learner/book pairs with at least 10 events.
learner_books_info_min_10_events_df = learner_books_info_meta_df[learner_books_info_meta_df['events_count'] >= 10]
learner_books_info_min_10_file = os.path.join(preprocessed_data_dir, 'learner_books_info_close_min_10_events.csv')
learner_books_info_min_10_events_df.to_csv(learner_books_info_min_10_file, index=False)

print("Preprocessed data available in preprocessed_metadata/")

Preprocessed data available in preprocessed_metadata/


In [19]:
# Show the directory the preprocessed CSV files were written to.
print(preprocessed_data_dir)

/home/rraju/recommender_system/preprocessed_metadata
