In [1]:
import pandas as pd
import re

In [2]:
# Load the processed books table (v1) from the 3_ProcessedData directory.
items = pd.read_parquet('../3_ProcessedData/books_SE_v1.parquet')

### Exploratory Data Analysis – Search Engine

In [3]:
# (rows, columns) of the loaded table.
items.shape

(2360655, 18)

In [4]:
# Preview the first rows to see which columns are available.
items.head()

Unnamed: 0,isbn,country_code,language_code,average_rating,description,link,publisher,num_pages,publication_day,isbn13,publication_month,publication_year,url,image_url,book_id,ratings_count,title,title_without_series
0,312853122.0,US,,4.0,,https://www.goodreads.com/book/show/5333265-w-...,St. Martin's Press,256.0,1.0,9780312853129.0,9.0,1984.0,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,743509986.0,US,,3.23,"Anita Diamant's international bestseller ""The ...",https://www.goodreads.com/book/show/1333909.Go...,Simon & Schuster Audio,,1.0,9780743509985.0,10.0,2001.0,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10,Good Harbor,Good Harbor
2,,US,eng,4.03,Omnibus book club edition containing the Ladie...,https://www.goodreads.com/book/show/7327624-th...,"Nelson Doubleday, Inc.",600.0,,,,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,743294297.0,US,eng,3.49,Addie Downs and Valerie Adler were eight when ...,https://www.goodreads.com/book/show/6066819-be...,Atria Books,368.0,14.0,9780743294294.0,7.0,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,Best Friends Forever,Best Friends Forever
4,850308712.0,US,,3.4,,https://www.goodreads.com/book/show/287140.Run...,,,,9780850308716.0,,,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,287140,15,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...


In [8]:
# Shape of the distinct book_id values; it matches the row count only if every id is unique.
items["book_id"].unique().shape

(2360655,)

- Every book in the dataset has a unique `book_id` (the number of distinct ids equals the number of rows).

In [5]:
# Number of distinct country_code values (shape of the unique-values array).
items["country_code"].unique().shape

(2,)

In [6]:
# Row count per country_code value.
items["country_code"].value_counts()

country_code
US    2360165
          490
Name: count, dtype: int64

In [7]:
# Number of distinct language_code values (shape of the unique-values array).
items["language_code"].unique().shape

(227,)

In [8]:
# Row count per language_code value, most frequent first.
items["language_code"].value_counts()

language_code
         1060153
eng       708457
en-US      91452
en-GB      58358
spa        54524
          ...   
hat            1
ltz            1
btk            1
sla            1
lao            1
Name: count, Length: 227, dtype: int64

**Function to check if a string contains non-ASCII characters (used here as a proxy for non-English text)**

In [10]:
def has_non_english_chars(text) -> bool:
    """Return True if ``text`` contains at least one non-ASCII character.

    "Non-English" is approximated as "outside the 7-bit ASCII range"
    (code points U+0000 to U+007F). A title written in another language
    using only ASCII letters is therefore NOT flagged.

    Parameters
    ----------
    text : str or missing value
        The string to inspect. Non-string input (e.g. ``None`` or ``NaN``
        from a missing title) is accepted and treated as containing no
        non-ASCII characters.

    Returns
    -------
    bool
        True when any character lies outside the ASCII range, else False.
    """
    # A missing title reaches this function as None/NaN when it is applied
    # over a DataFrame column; the regex-based version raised TypeError there.
    if not isinstance(text, str):
        return False
    # str.isascii() is True for the empty string and for all-ASCII text.
    return not text.isascii()

In [12]:
# Flag every row whose series-less title contains a character outside ASCII.
title_series = items['title_without_series']
items['has_non_english_chars'] = title_series.map(has_non_english_chars)

**Only English vs Non-English/Mixed count**

In [22]:
# Total rows, then rows with / without a non-ASCII character in the title.
non_ascii_flag = items["has_non_english_chars"]
print(len(items))
print(int(non_ascii_flag.sum()))
print(int((~non_ascii_flag).sum()))

2360655
247600
2113055


**Create separate DF for only English**

In [25]:
# Keep only the rows flagged as ASCII-only, restricted to the columns used later.
ascii_only_rows = ~items["has_non_english_chars"]
kept_columns = ["book_id", "title_without_series", "title", "language_code"]
eng_items = items.loc[ascii_only_rows, kept_columns]

In [26]:
# Preview the ASCII-title subset.
eng_items.head()

Unnamed: 0,book_id,title_without_series,title,language_code
0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,
1,1333909,Good Harbor,Good Harbor,
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng
3,6066819,Best Friends Forever,Best Friends Forever,eng
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,


In [27]:
# Row count of the subset; should equal the ASCII-only count printed earlier.
len(eng_items)

2113055

**Exporting only English DF**

In [29]:
# Persist the ASCII-title subset (index included) as the v2 parquet file.
eng_items.to_parquet(
    '../3_ProcessedData/books_SE_v2.parquet',
    compression="snappy",
    index=True,
)

### EDA - Only English DF

In [2]:
# Reload the v2 parquet file (the ASCII-title subset) into eng_items.
eng_items = pd.read_parquet('../3_ProcessedData/books_SE_v2.parquet')

In [17]:
# Number of rows read back from the v2 file.
eng_items.shape[0]

2113055

In [3]:
# Preview the reloaded frame.
eng_items.head()

Unnamed: 0,book_id,title_without_series,title,language_code
0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,
1,1333909,Good Harbor,Good Harbor,
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng
3,6066819,Best Friends Forever,Best Friends Forever,eng
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,


In [4]:
# Row count per language_code among the ASCII-title rows.
eng_items["language_code"].value_counts()

language_code
         1026251
eng       702536
en-US      91021
en-GB      58120
ita        46313
          ...   
tup            1
vai            1
vec            1
yid            1
lao            1
Name: count, Length: 189, dtype: int64

#### Creating Modified Title to Reduce Search Space

**Removing all characters except A-Z, a-z, 0-9 and the space character**

In [9]:
# Build the normalised columns by deleting every character that is not an
# ASCII letter, a digit, or a space from the corresponding title column.
for source_col, target_col in (
    ("title", "mod_title"),
    ("title_without_series", "mod_title_without_series"),
):
    eng_items[target_col] = eng_items[source_col].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

**Make `mod_title` lower case**

In [10]:
# Lower-case both normalised title columns.
for mod_col in ("mod_title", "mod_title_without_series"):
    eng_items[mod_col] = eng_items[mod_col].str.lower()

**Replacing any extra spaces with a single space**

In [11]:
# Collapse every run of whitespace into a single space in both normalised
# title columns. The pattern is a raw string: "\s" inside a normal string
# literal is an invalid escape sequence and triggers a warning on import.
for mod_col in ("mod_title", "mod_title_without_series"):
    eng_items[mod_col] = eng_items[mod_col].str.replace(r"\s+", " ", regex=True)

In [12]:
# Compare the original titles with their normalised counterparts.
eng_items.head()

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,,wc fields a life on film,wc fields a life on film
1,1333909,Good Harbor,Good Harbor,,good harbor,good harbor
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng,the unschooled wizard sun wolf and starhawk 12,the unschooled wizard sun wolf and starhawk 12
3,6066819,Best Friends Forever,Best Friends Forever,eng,best friends forever,best friends forever
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,,runic astrology starcraft and timekeeping in t...,runic astrology starcraft and timekeeping in t...


**Looking for mod titles with length < 1 (empty strings)**

In [15]:
# Rows whose normalised title ended up with no characters at all.
is_empty_title = eng_items["mod_title"].str.len().lt(1)
eng_items.loc[is_empty_title]

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
347481,12093145,-,-,,,
381663,9151153,&,&,,,
447622,21896313,--------,--------,ara,,
498824,22744516,..,..,eng,,
768922,18071040,******,******,en-US,,
896489,2433394,,,,,
908029,35481140,?,?,,,
949431,18869234,;,;,eng,,
967041,6046733,,,,,
995664,20428529,;,;,,,


In [16]:
# book_id values of the rows whose normalised title is empty.
is_empty_title = eng_items["mod_title"].str.len().lt(1)
eng_items.loc[is_empty_title, "book_id"].values

array(['12093145', '9151153', '21896313', '22744516', '18071040',
       '2433394', '35481140', '18869234', '6046733', '20428529',
       '5756291', '29522179', '23280835', '25587914', '10192004',
       '7997274', '24936849', '67415', '25423066', '30337448', '15845606',
       '7807037'], dtype=object)

In [18]:
# How many rows have an empty normalised title.
int(eng_items["mod_title"].str.len().lt(1).sum())

22

- The books in the above list of `book_id` have no letters or digits in their `title` or `title_without_series`
- So their modified titles became empty strings (length 0)

**Dropping the books whose mod title has length less than 1**

In [19]:
# Keep only the rows whose normalised title has at least one character.
has_title_text = eng_items["mod_title"].str.len().gt(0)
eng_items = eng_items.loc[has_title_text]

In [20]:
# Row count after removing the empty normalised titles.
eng_items.shape[0]

2113033

**Looking for mod titles with length < 2**

In [48]:
# Spot-check ten rows whose normalised title is shorter than two characters.
# The seed is fixed so the same rows are shown on every run; an unseeded
# .sample() returns different rows each time the cell is executed.
eng_items.loc[eng_items["mod_title"].str.len() < 2].sample(10, random_state=42)

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1063899,6496901,U!,U!,ind,u,u
1588952,9711968,Q,Q,,q,q
367661,32790971,3,3,eng,3,3
1262961,23419145,X,X,eng,x,x
302417,884001,M,M,eng,m,m
1943578,26849896,J,J,eng,j,j
1495503,13489087,Y,Y,eng,y,y
555069,25752896,G.,G.,jpn,g,g
612026,8220178,K,K,,k,k
674089,20322529,3,3,,3,3


- looks good to me

**Looking for mod titles with length < 5**

In [71]:
# Spot-check ten rows whose normalised title is shorter than five characters.
# The seed is fixed so the same rows are shown on every run; an unseeded
# .sample() returns different rows each time the cell is executed.
eng_items.loc[eng_items["mod_title"].str.len() < 5].sample(10, random_state=42)

Unnamed: 0,book_id,title_without_series,title,language_code,mod_title,mod_title_without_series
1825885,552489,Oink?,Oink?,,oink,oink
2069287,30168348,HHhH,HHhH,rus,hhhh,hhhh
1872456,10427422,Naku,Naku,fin,naku,naku
549604,22011697,Rush,Rush,eng,rush,rush
403252,1330342,Aura,Aura,spa,aura,aura
2253429,22320776,Effe,Effe,fre,effe,effe
1113254,6707259,Kick,Kick,,kick,kick
944908,2073930,Dog,Dog,,dog,dog
1615338,1992331,Nino,Nino,,nino,nino
1792878,11747245,Wolf,Wolf,eng,wolf,wolf


- looks good to me

**Exporting only English modified DF**

In [72]:
# Persist the frame with the normalised title columns (index included) as the v3 parquet file.
eng_items.to_parquet(
    '../3_ProcessedData/books_SE_v3.parquet',
    compression="snappy",
    index=True,
)