In [15]:
import pandas as pd
import csv

# Display option: never truncate cell contents, so wide text columns render in full.
pd.options.display.max_colwidth = None

# Read the raw books CSV into a DataFrame and preview its first three rows.
df_from_csv = pd.read_csv(filepath_or_buffer='Dataset/books_0.2.csv')
df_from_csv.head(n=3)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,column1,column2,column3
0,1,Harry Potter and the Half-Blood Prince (Harry Potter #6),J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,,,
1,2,Harry Potter and the Order of the Phoenix (Harry Potter #5),J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,,,
2,4,Harry Potter and the Chamber of Secrets (Harry Potter #2),J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,,,


In [24]:
# Keep only rows whose three trailing helper columns are all empty (NaN).
# Rows with a value in any of them are excluded here
# (presumably rows whose fields were shifted during parsing - TODO confirm).
overflow_cols = ["column1", "column2", "column3"]
df_filtered_rows = df_from_csv.loc[df_from_csv[overflow_cols].isna().all(axis=1)]

# Drop the helper columns by name instead of by position: the result no longer
# depends on them being the last three columns, and `drop` returns a new frame
# rather than a positional slice of an already-sliced frame.
df = df_filtered_rows.drop(columns=overflow_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11072 entries, 0 to 11126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   bookID              11072 non-null  int64 
 1   title               11072 non-null  object
 2   authors             11072 non-null  object
 3   average_rating      11072 non-null  object
 4   isbn                11072 non-null  object
 5   isbn13              11072 non-null  object
 6   language_code       11072 non-null  object
 7     num_pages         11072 non-null  object
 8   ratings_count       11072 non-null  object
 9   text_reviews_count  11072 non-null  object
 10  publication_date    11072 non-null  object
 11  publisher           11072 non-null  object
dtypes: int64(1), object(11)
memory usage: 1.1+ MB


## Data analysis plan
* Data processing:
    * Parsing the .csv file
    * Move the misplaced strings back into the right columns for the rows selected by df.loc[(~df["column1"].isna())], then merge the corrected rows back into df
    * Correct the data types
* Cleaning the data with the help of data exploration

In [20]:
# The raw header for the page count carries leading whitespace ("  num_pages").
# Rebind `df` to the renamed frame instead of using `inplace=True`, so a frame
# obtained by slicing is not mutated in place; later cells still see `num_pages`.
df = df.rename(columns={"  num_pages": "num_pages"})

# Target dtype for each column that needs converting. Columns not listed here
# (`bookID`, `publication_date`) keep their current dtype.
new_types_dict = {
    "title": "string",
    "authors": "string",
    "language_code": "category",
    "publisher": "category",
    "isbn": "string",
    "isbn13": "string",
    "average_rating": "float64",
    "num_pages": "int64",
    "ratings_count": "int64",
    "text_reviews_count": "int64",
}

# `astype` returns a new frame; `df` itself keeps its original dtypes.
df_formatted = df.astype(new_types_dict)
df_formatted.dtypes

bookID                   int64
title                   string
authors                 string
average_rating         float64
isbn                    string
isbn13                  string
language_code         category
num_pages                int64
ratings_count            int64
text_reviews_count       int64
publication_date        object
publisher             category
dtype: object

In [22]:
# Summary statistics restricted to the int64/float64 columns.
(
    df_formatted
    .select_dtypes(include=["int64", "float64"])
    .describe()
)

Unnamed: 0,bookID,average_rating,num_pages,ratings_count,text_reviews_count
count,11072.0,11072.0,11072.0,11072.0,11072.0
mean,21295.841944,3.933345,336.215589,17837.55,539.799133
std,13098.978565,0.350768,241.154478,112653.1,2579.386078
min,1.0,0.0,0.0,0.0,0.0
25%,10262.5,3.77,192.0,103.0,9.0
50%,20238.5,3.96,299.0,737.0,46.0
75%,32082.25,4.13,416.0,4920.5,235.0
max,45641.0,5.0,6576.0,4597666.0,94265.0


In [23]:
# Count / unique / top / freq summary for the object, string and category columns.
(
    df_formatted
    .select_dtypes(include=["object", "string", "category"])
    .describe()
)

Unnamed: 0,title,authors,isbn,isbn13,language_code,publication_date,publisher
count,11072,11072,11072,11072,11072,11072,11072
unique,10301,6625,11072,11072,25,3671,2283
top,The Brothers Karamazov,P.G. Wodehouse,439785960,9780439785969,eng,10/1/2005,Vintage
freq,9,40,1,1,10266,56,318


In [8]:
# Preview the last five rows of `df` (the frame before dtype conversion,
# not `df_formatted`).
df.tail(n=5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael Hemmingson,4.06,1560254416,9781560254416,eng,512,156,20,12/21/2004,Da Capo Press
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,140110879,9780140110876,eng,635,783,56,12/1/1988,Penguin Books
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,140131965,9780140131963,eng,415,820,95,8/1/1993,Penguin Books
11125,45639,Poor People,William T. Vollmann,3.72,60878827,9780060878825,eng,434,769,139,2/27/2007,Ecco
11126,45641,Las aventuras de Tom Sawyer,Mark Twain,3.91,8497646983,9788497646987,spa,272,113,12,5/28/2006,Edimat Libros
