**Modules**

In [1]:
import pandas as pd
import joblib

**Reading mapping file - dictionary**

In [2]:
# Load the csv_id -> book_id lookup object from its joblib file; the cells below
# index it with integer keys and call .get() on it.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so only load artifacts that come from a trusted source.
csv_book_mapping = joblib.load('../3_ProcessedData/csvid_to_bookid_mapper_RE_v1.joblib')

In [3]:
# Spot-check: value stored under integer key 0.
csv_book_mapping[0]

34684622

In [4]:
# Spot-check: value stored under integer key 4.
csv_book_mapping[4]

30422361

In [5]:
# Same key looked up through .get() instead of subscripting.
csv_book_mapping.get(4)

30422361

In [6]:
# Looking up the string "csv_id" yields None rather than raising; print() is
# used because a bare None would not be shown as cell output.
print(csv_book_mapping.get("csv_id"))

None


**Reading books data - Only the `book_id` column**

In [7]:
# columns=["book_id"] restricts the read to that single column of the books file.
req_book_ids = pd.read_parquet('../3_ProcessedData/books_SE_v3.parquet', columns=["book_id"])

In [8]:
# (rows, columns) of the loaded frame.
req_book_ids.shape

(2113033, 1)

In [9]:
# Confirm that only the requested column was loaded.
req_book_ids.columns

Index(['book_id'], dtype='object')

In [10]:
# Inspect the dtype of book_id as read from the parquet file.
req_book_ids.dtypes

book_id    object
dtype: object

**Updating the datatype of the `book_id` column**

In [11]:
# Convert book_id to pandas' nullable 64-bit integer dtype, then write the
# converted column back over the original one.
book_id_as_int = req_book_ids["book_id"].astype('Int64')
req_book_ids["book_id"] = book_id_as_int

In [12]:
# Verify that the cast above took effect.
req_book_ids.dtypes

book_id    Int64
dtype: object

**The number of unique values should equal the number of records**

In [13]:
# Distinct-value count per column; compare with the row count from .shape above.
req_book_ids.nunique()

book_id    2113033
dtype: int64

**Checking for non-positive values (`book_id` < 1)**

In [14]:
# Rows whose book_id is below 1 (zero or negative); an empty frame is displayed
# when there are none.
non_positive_mask = req_book_ids["book_id"] < 1
req_book_ids[non_positive_mask]

Unnamed: 0,book_id


**Checking min and max `book_id` values**

In [15]:
# Smallest and largest book_id, displayed as a (min, max) tuple.
book_id_column = req_book_ids["book_id"]
(book_id_column.min(), book_id_column.max())

(1, 36530431)

**Updating column name**

In [16]:
# Rename book_id -> mapped_book_id; the new name matches the mapped_book_id
# column added to the interactions frame later in the notebook.
# Reassigning the result (instead of inplace=True) avoids mutating a frame that
# earlier cells already displayed and keeps the step chainable.
req_book_ids = req_book_ids.rename(columns={"book_id": "mapped_book_id"})

In [17]:
# Preview the first rows under the new column name.
req_book_ids.head()

Unnamed: 0,mapped_book_id
0,5333265
1,1333909
2,7327624
3,6066819
4,287140


**Reading the `goodreads_interactions` file, one column at a time**

In [18]:
# Peek at the raw CSV without loading it: echo the first ten lines (header
# included) exactly as they appear in the file.
with open("../Initial/goodreads_interactions.csv", 'r') as interactions_file:
    print("".join(interactions_file.readline() for _ in range(10)), end="")

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


**Reading `book_id` column**

In [20]:
# usecols=['book_id'] parses only that column of the interactions CSV.
interaction_book_id = pd.read_csv("../Initial/goodreads_interactions.csv", usecols=['book_id'])

In [21]:
# (rows, columns) of the interactions frame.
interaction_book_id.shape

(228648342, 1)

In [22]:
# First rows; these should match the raw lines printed from the CSV above.
interaction_book_id.head()

Unnamed: 0,book_id
0,948
1,947
2,946
3,945
4,944


In [23]:
# Last rows of the frame.
interaction_book_id.tail()

Unnamed: 0,book_id
228648337,24772
228648338,23847
228648339,23950
228648340,374106
228648341,351607


In [24]:
# dtype pandas inferred for book_id when parsing the CSV.
interaction_book_id.dtypes

book_id    int64
dtype: object

**Mapping `csv_id` into `book_id` using the mapping file**

In [25]:
# Translate each CSV-level book_id into its mapped book_id via the lookup dict.
interaction_book_id["mapped_book_id"] = interaction_book_id["book_id"].map(csv_book_mapping)

# Series.map leaves NaN for ids that are missing from the dict (which also
# upcasts the column to float64), so fail loudly here instead of carrying
# unmapped rows into the export.
unmapped_rows = int(interaction_book_id["mapped_book_id"].isna().sum())
assert unmapped_rows == 0, f"{unmapped_rows} interaction rows have no mapped_book_id"

In [26]:
# First rows with the newly added mapped_book_id column.
interaction_book_id.head()

Unnamed: 0,book_id,mapped_book_id
0,948,12
1,947,21
2,946,30
3,945,45
4,944,1


In [27]:
# Last rows with the newly added mapped_book_id column.
interaction_book_id.tail()

Unnamed: 0,book_id,mapped_book_id
228648337,24772,17191046
228648338,23847,19042398
228648339,23950,20317106
228648340,374106,10024429
228648341,351607,24279189


**Crosschecking the mapped values**

In [28]:
# Look up a few of the csv ids shown in head()/tail() directly in the dict so
# the results can be compared with the mapped_book_id values displayed above.
tuple(csv_book_mapping.get(csv_id) for csv_id in (948, 946, 24772, 374106))

(12, 30, 17191046, 10024429)

**Preserving the index by creating a separate index column**

In [29]:
# Copy the row index into an explicit `index` column so the original row
# position travels with the data.
# The guard keeps this cell idempotent: running reset_index a second time would
# add an extra `level_0` column, and a third run would raise.
if "index" not in interaction_book_id.columns:
    interaction_book_id = interaction_book_id.reset_index(drop=False)

In [30]:
# First rows, now including the explicit index column.
interaction_book_id.head()

Unnamed: 0,index,book_id,mapped_book_id
0,0,948,12
1,1,947,21
2,2,946,30
3,3,945,45
4,4,944,1


In [31]:
# Last rows, now including the explicit index column.
interaction_book_id.tail()

Unnamed: 0,index,book_id,mapped_book_id
228648337,228648337,24772,17191046
228648338,228648338,23847,19042398
228648339,228648339,23950,20317106
228648340,228648340,374106,10024429
228648341,228648341,351607,24279189


**Exporting the interactions frame (`index`, `book_id`, `mapped_book_id`)**

In [32]:
# Write the frame (index, book_id, mapped_book_id columns) to a snappy-compressed
# parquet file.
# NOTE(review): index=True also stores the DataFrame's row index as values in the
# file; after the reset_index cell that appears to duplicate the explicit `index`
# column. Consider index=False if no downstream reader relies on it — TODO confirm.
interaction_book_id.to_parquet('../3_ProcessedData/interaction_RE_v1.parquet', index=True, compression="snappy")