In [1]:
import pandas as pd
df = pd.read_csv("book.csv")
df

Unnamed: 0,isbn,title,price
0,1111,lord of the rings,-3500
1,2222,-,4200
2,3333,Ben Hur,x
3,4444,hunyadi,
4,2222,Harry Potter,4200
5,5555,inferno,3200


In [3]:
# Ordering by column
df.sort_values("price", ascending=False)

Unnamed: 0,isbn,title,price
1,2222,-,4200
4,2222,Harry Potter,4200
5,5555,inferno,3200
0,1111,lord of the rings,-3500
2,3333,Ben Hur,x
3,4444,hunyadi,


In [8]:
# Finding duplicates
print(df.duplicated)

<bound method DataFrame.duplicated of    isbn              title  price
0  1111  lord of the rings  -3500
1  2222                  -   4200
2  3333            Ben Hur      x
3  4444            hunyadi    NaN
4  2222       Harry Potter   4200
5  5555            inferno   3200>


In [9]:
# Duplicates on specific column
isbn_dup = df[df.duplicated(['isbn'])]
isbn_dup

Unnamed: 0,isbn,title,price
4,2222,Harry Potter,4200


In [13]:
# Dropping duplicates on "isbn"
df.drop_duplicates(subset =['isbn'], keep = "first", inplace = True)

In [15]:
# Saving result in a new file
df.to_csv("no_duplicates_book.csv")

In [16]:
# Treating missing values
# na_values create a string that considers pandas as NaN
missing_values = ["n/a", "na", "-", "--"]
df = pd.read_csv("no_duplicates_book.csv", na_values = missing_values)

In [17]:
# Total missing values per column
print(df.isnull().sum())

Unnamed: 0    0
isbn          0
title         1
price         1
dtype: int64


In [20]:
df.dtypes

Unnamed: 0     int64
isbn           int64
title         object
price         object
dtype: object

In [23]:
# Correcting prices
import numpy as np

counter = 0
for value in df['price']:
    try:
        int(value)
        if int(value) <= 0 :  # If price is negative
            df.loc[counter, 'price'] = np.nan
    except ValueError:
        df.loc[counter, 'price'] = np.nan
    counter += 1

In [24]:
# Fill missing price values using median 
median = df['price'].median()
df['price'].fillna(median, inplace=True)

In [25]:
df

Unnamed: 0.1,Unnamed: 0,isbn,title,price
0,0,1111,lord of the rings,3700.0
1,1,2222,,4200.0
2,2,3333,Ben Hur,3700.0
3,3,4444,hunyadi,3700.0
4,5,5555,inferno,3200.0


In [26]:
# Dropping NaN rows
df.dropna(inplace=True)
df

Unnamed: 0.1,Unnamed: 0,isbn,title,price
0,0,1111,lord of the rings,3700.0
2,2,3333,Ben Hur,3700.0
3,3,4444,hunyadi,3700.0
4,5,5555,inferno,3200.0


In [28]:
# Lexical conversion
df['title'] = df['title'].str.upper()
df

Unnamed: 0.1,Unnamed: 0,isbn,title,price
0,0,1111,LORD OF THE RINGS,3700.0
2,2,3333,BEN HUR,3700.0
3,3,4444,HUNYADI,3700.0
4,5,5555,INFERNO,3200.0
