In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw records from dirty_data.csv (a path relative to the
# notebook's working directory) into a DataFrame, then preview the
# first rows to get a feel for the columns.
data = pd.read_csv("dirty_data.csv")
data.head()

Unnamed: 0,id,name,address,height,birthdate,eyes
0,125500.0,,"94932 Amanda Way\nSouth Cassandraville, WV 15612",158.0,1997-6-21,brown
1,168840.0,Kenneth Waters,"510 Sean Mall Apt. 205\nFarmershire, UT 20150",81.0,1966-10-23,brown
2,224772.0,Nicole Brown,"56984 Edward Parkway\nSchmidtmouth, MA 41988",143.0,1962-9-17,amber
3,254365.0,,"1681 Nicole Ridge Apt. 523\nNew Casey, MH 82120",171.0,1958-12-2,green
4,749255.0,Debbie Murray,59986 Christopher Squares Suite 667\nChristoph...,88.0,1952-12-5,amber


In [3]:
# Summary info: per-column non-null counts and dtypes, used to spot
# columns with missing values or an unexpected storage type.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         481 non-null    float64
 1   name       487 non-null    object 
 2   address    491 non-null    object 
 3   height     486 non-null    float64
 4   birthdate  480 non-null    object 
 5   eyes       492 non-null    object 
dtypes: float64(2), object(4)
memory usage: 24.5+ KB


In [4]:
# Get NaN counts per column: isna() flags every missing cell and
# sum() totals those flags column by column.
data.isna().sum()

id           39
name         33
address      29
height       34
birthdate    40
eyes         28
dtype: int64

In [5]:
# A row is only usable when all of these columns are present; rows with a
# NaN in any of them are discarded. The row count is printed before and
# after so the number of rows lost is visible.
required_columns = ["id", "name", "address", "birthdate"]

print(len(data))
data = data.dropna(subset=required_columns)
print(len(data))

520
390


In [6]:
# Keep only the first row seen for each id and report how many rows remain
data = data.drop_duplicates(subset=["id"], keep="first")
print(data.shape[0])

375


In [7]:
# Preview the rows that remain after the NaN and duplicate-id filtering
data.head()

Unnamed: 0,id,name,address,height,birthdate,eyes
1,168840.0,Kenneth Waters,"510 Sean Mall Apt. 205\nFarmershire, UT 20150",81.0,1966-10-23,brown
2,224772.0,Nicole Brown,"56984 Edward Parkway\nSchmidtmouth, MA 41988",143.0,1962-9-17,amber
4,749255.0,Debbie Murray,59986 Christopher Squares Suite 667\nChristoph...,88.0,1952-12-5,amber
6,555484.0,Michael Rodriguez,"7156 Jackson Turnpike\nGrayburgh, CA 61906",227.0,1957-7-23,brown
7,250463.0,Michael Bishop,"585 Newman Ville Suite 183\nPort Jillton, NJ 2...",226.0,1972-11-22,blue


In [8]:
# Eye color value counts, most frequent first. value_counts() leaves out
# NaN by default, so missing entries are not listed in this tally.
data["eyes"].value_counts()

brown    93
amber    91
green    88
blue     81
Name: eyes, dtype: int64

In [9]:
# Eye color NaN count: the number of rows with no eye color recorded
data["eyes"].isna().sum()

22

In [10]:
# Impute the remaining gaps: missing heights get the column mean, missing
# eye colors get the most frequent color (first entry of the mode).
mean_height = data["height"].mean()
most_common_eyes = data["eyes"].mode().iloc[0]

data = data.fillna(value={"height": mean_height, "eyes": most_common_eyes})

In [11]:
# Preview the frame after the height / eye color imputation
data.head()

Unnamed: 0,id,name,address,height,birthdate,eyes
1,168840.0,Kenneth Waters,"510 Sean Mall Apt. 205\nFarmershire, UT 20150",81.0,1966-10-23,brown
2,224772.0,Nicole Brown,"56984 Edward Parkway\nSchmidtmouth, MA 41988",143.0,1962-9-17,amber
4,749255.0,Debbie Murray,59986 Christopher Squares Suite 667\nChristoph...,88.0,1952-12-5,amber
6,555484.0,Michael Rodriguez,"7156 Jackson Turnpike\nGrayburgh, CA 61906",227.0,1957-7-23,brown
7,250463.0,Michael Bishop,"585 Newman Ville Suite 183\nPort Jillton, NJ 2...",226.0,1972-11-22,blue


In [12]:
# Function to convert cm to ft
def cm_to_feet(cm, feet_per_cm=0.033):
    """Convert a length in centimetres to feet.

    Parameters
    ----------
    cm : number or array-like
        Length(s) in centimetres. Anything that supports multiplication by
        a float works, so a whole pandas Series can be passed at once.
    feet_per_cm : float, optional
        Conversion factor. The default, 0.033, is a rounded approximation
        kept so existing results do not change; pass ``1 / 30.48``
        (about 0.0328084) for the exact conversion.

    Returns
    -------
    Same kind as ``cm``
        The length(s) expressed in feet.
    """
    return cm * feet_per_cm

# Convert height from cm to ft for the whole column in one call.
# NOTE: this overwrites "height", so running the cell a second time
# would apply the conversion twice.
data["height"] = cm_to_feet(data["height"])
data.head()

Unnamed: 0,id,name,address,height,birthdate,eyes
1,168840.0,Kenneth Waters,"510 Sean Mall Apt. 205\nFarmershire, UT 20150",2.673,1966-10-23,brown
2,224772.0,Nicole Brown,"56984 Edward Parkway\nSchmidtmouth, MA 41988",4.719,1962-9-17,amber
4,749255.0,Debbie Murray,59986 Christopher Squares Suite 667\nChristoph...,2.904,1952-12-5,amber
6,555484.0,Michael Rodriguez,"7156 Jackson Turnpike\nGrayburgh, CA 61906",7.491,1957-7-23,brown
7,250463.0,Michael Bishop,"585 Newman Ville Suite 183\nPort Jillton, NJ 2...",7.458,1972-11-22,blue


In [13]:
# Replace newline tokens in address with comma and space.
# The .str accessor keeps the result aligned with the frame's index and
# passes missing values through instead of raising on them; regex=False
# makes the pattern a literal newline rather than a regular expression.
data["address"] = data["address"].str.replace("\n", ", ", regex=False)
data.head()

Unnamed: 0,id,name,address,height,birthdate,eyes
1,168840.0,Kenneth Waters,"510 Sean Mall Apt. 205, Farmershire, UT 20150",2.673,1966-10-23,brown
2,224772.0,Nicole Brown,"56984 Edward Parkway, Schmidtmouth, MA 41988",4.719,1962-9-17,amber
4,749255.0,Debbie Murray,"59986 Christopher Squares Suite 667, Christoph...",2.904,1952-12-5,amber
6,555484.0,Michael Rodriguez,"7156 Jackson Turnpike, Grayburgh, CA 61906",7.491,1957-7-23,brown
7,250463.0,Michael Bishop,"585 Newman Ville Suite 183, Port Jillton, NJ 2...",7.458,1972-11-22,blue


In [14]:
# Final type clean-up, done in a single assign():
#   id        -> cast from float to int
#   birthdate -> parsed to datetime, then rendered as a DD/MM/YYYY string
# NOTE: birthdate ends up as text again, so re-running this cell would
# re-parse the already reformatted strings.
data = data.assign(
    id=data["id"].astype(int),
    birthdate=pd.to_datetime(data["birthdate"]).dt.strftime('%d/%m/%Y'),
)

data.head()

Unnamed: 0,id,name,address,height,birthdate,eyes
1,168840,Kenneth Waters,"510 Sean Mall Apt. 205, Farmershire, UT 20150",2.673,23/10/1966,brown
2,224772,Nicole Brown,"56984 Edward Parkway, Schmidtmouth, MA 41988",4.719,17/09/1962,amber
4,749255,Debbie Murray,"59986 Christopher Squares Suite 667, Christoph...",2.904,05/12/1952,amber
6,555484,Michael Rodriguez,"7156 Jackson Turnpike, Grayburgh, CA 61906",7.491,23/07/1957,brown
7,250463,Michael Bishop,"585 Newman Ville Suite 183, Port Jillton, NJ 2...",7.458,22/11/1972,blue


In [15]:
# Now it is clean. Check that claim instead of relying on the comment:
# no column may still contain missing values, and every id must be unique.
assert data.notna().all().all(), "missing values remain after cleaning"
assert data["id"].is_unique, "duplicate ids remain after cleaning"

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375 entries, 1 to 498
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         375 non-null    int64  
 1   name       375 non-null    object 
 2   address    375 non-null    object 
 3   height     375 non-null    float64
 4   birthdate  375 non-null    object 
 5   eyes       375 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 20.5+ KB
