# Cleaning Data in Pandas DataFrames

## 1. Import libraries and dependencies

In [1]:
# Import the pandas and pathlib libraries
import pandas as pd
from pathlib import Path

## 2. Create a Path to the File Using Pathlib

In [5]:
# Build the location of the CSV with pathlib so the separator is handled per platform
csv_path = Path("Resources") / "data.csv"

## 3. Read the CSV into a Pandas DataFrame

In [6]:
# Load the CSV found at `csv_path` into a DataFrame, then preview the first five rows
nsw_prop_df = pd.read_csv(filepath_or_buffer=csv_path)
nsw_prop_df.head(n=5)

Unnamed: 0,year,month,suburb,medianSoldPrice,numberSold,highestSoldPrice,lowestSoldPrice,medianSaleListingPrice,numberSaleListing,highestSaleListingPrice,lowestSaleListingPrice,auctionNumberAuctioned,auctionNumberSold,medianRentListingPrice
0,2011.0,8.0,Randwick,1422000.0,40.0,5450000.0,690000.0,1578000.0,81.0,8000000.0,775000.0,32.0,20.0,950.0
1,2011.0,11.0,Randwick,1417000.0,49.0,3660000.0,880000.0,1600000.0,97.0,6000000.0,700000.0,47.0,23.0,880.0
2,2012.0,2.0,Randwick,1520000.0,24.0,6200000.0,890000.0,1600000.0,62.0,5850000.0,570000.0,10.0,5.0,975.0
3,2012.0,5.0,Randwick,1555000.0,44.0,3325000.0,565000.0,1550000.0,77.0,3590000.0,570000.0,30.0,17.0,850.0
4,2012.0,8.0,Randwick,1385000.0,27.0,3050000.0,703000.0,1500000.0,57.0,4250000.0,650000.0,19.0,11.0,825.0


## 4. Rename Columns

In [8]:
# List the current column labels so they can be copied into the rename step
nsw_prop_df.columns

Index(['year', 'month', 'suburb', 'medianSoldPrice', 'numberSold',
       'highestSoldPrice', 'lowestSoldPrice', 'medianSaleListingPrice',
       'numberSaleListing', 'highestSaleListingPrice',
       'lowestSaleListingPrice', 'auctionNumberAuctioned', 'auctionNumberSold',
       'medianRentListingPrice'],
      dtype='object')

In [9]:
# Map every original camelCase label to its Title_Snake_Case replacement.
# Renaming by label (instead of assigning a positional list to `.columns`)
# cannot attach a name to the wrong data if the CSV's column order changes,
# and running the cell a second time is a harmless no-op.
# NOTE(review): 'medianSoldPrice' becomes 'Median_Sale_Price' while its siblings
# use 'Sold'; the name is kept because the re-order cell selects it.
column_renames = {
    'year': 'Year',
    'month': 'Month',
    'suburb': 'Suburb',
    'medianSoldPrice': 'Median_Sale_Price',
    'numberSold': 'Number_Sold',
    'highestSoldPrice': 'Highest_Sold_Price',
    'lowestSoldPrice': 'Lowest_Sold_Price',
    'medianSaleListingPrice': 'Median_Sale_Listing_Price',
    'numberSaleListing': 'Number_Sale_Listing',
    'highestSaleListingPrice': 'Highest_Sale_Listing_Price',
    'lowestSaleListingPrice': 'Lowest_Sale_Listing_Price',
    'auctionNumberAuctioned': 'Auction_Number',
    'auctionNumberSold': 'Auction_Sold',
    'medianRentListingPrice': 'Median_Rent_Listing_Price',
}
nsw_prop_df = nsw_prop_df.rename(columns=column_renames)

nsw_prop_df.head()

Unnamed: 0,Year,Month,Suburb,Median_Sale_Price,Number_Sold,Highest_Sold_Price,Lowest_Sold_Price,Median_Sale_Listing_Price,Number_Sale_Listing,Highest_Sale_Listing_Price,Lowest_Sale_Listing_Price,Auction_Number,Auction_Sold,Median_Rent_Listing_Price
0,2011.0,8.0,Randwick,1422000.0,40.0,5450000.0,690000.0,1578000.0,81.0,8000000.0,775000.0,32.0,20.0,950.0
1,2011.0,11.0,Randwick,1417000.0,49.0,3660000.0,880000.0,1600000.0,97.0,6000000.0,700000.0,47.0,23.0,880.0
2,2012.0,2.0,Randwick,1520000.0,24.0,6200000.0,890000.0,1600000.0,62.0,5850000.0,570000.0,10.0,5.0,975.0
3,2012.0,5.0,Randwick,1555000.0,44.0,3325000.0,565000.0,1550000.0,77.0,3590000.0,570000.0,30.0,17.0,850.0
4,2012.0,8.0,Randwick,1385000.0,27.0,3050000.0,703000.0,1500000.0,57.0,4250000.0,650000.0,19.0,11.0,825.0


## 5. Re-order Columns

In [10]:
# List the column labels again to copy them into the re-ordering step
nsw_prop_df.columns

Index(['Year', 'Month', 'Suburb', 'Median_Sale_Price', 'Number_Sold',
       'Highest_Sold_Price', 'Lowest_Sold_Price', 'Median_Sale_Listing_Price',
       'Number_Sale_Listing', 'Highest_Sale_Listing_Price',
       'Lowest_Sale_Listing_Price', 'Auction_Number', 'Auction_Sold',
       'Median_Rent_Listing_Price'],
      dtype='object')

In [11]:
# Desired layout: 'Suburb' first, followed by the remaining columns in their
# existing relative order
column_order = [
    'Suburb',
    'Year',
    'Month',
    'Median_Sale_Price',
    'Number_Sold',
    'Highest_Sold_Price',
    'Lowest_Sold_Price',
    'Median_Sale_Listing_Price',
    'Number_Sale_Listing',
    'Highest_Sale_Listing_Price',
    'Lowest_Sale_Listing_Price',
    'Auction_Number',
    'Auction_Sold',
    'Median_Rent_Listing_Price',
]
nsw_prop_df = nsw_prop_df[column_order]

nsw_prop_df.head()

Unnamed: 0,Suburb,Year,Month,Median_Sale_Price,Number_Sold,Highest_Sold_Price,Lowest_Sold_Price,Median_Sale_Listing_Price,Number_Sale_Listing,Highest_Sale_Listing_Price,Lowest_Sale_Listing_Price,Auction_Number,Auction_Sold,Median_Rent_Listing_Price
0,Randwick,2011.0,8.0,1422000.0,40.0,5450000.0,690000.0,1578000.0,81.0,8000000.0,775000.0,32.0,20.0,950.0
1,Randwick,2011.0,11.0,1417000.0,49.0,3660000.0,880000.0,1600000.0,97.0,6000000.0,700000.0,47.0,23.0,880.0
2,Randwick,2012.0,2.0,1520000.0,24.0,6200000.0,890000.0,1600000.0,62.0,5850000.0,570000.0,10.0,5.0,975.0
3,Randwick,2012.0,5.0,1555000.0,44.0,3325000.0,565000.0,1550000.0,77.0,3590000.0,570000.0,30.0,17.0,850.0
4,Randwick,2012.0,8.0,1385000.0,27.0,3050000.0,703000.0,1500000.0,57.0,4250000.0,650000.0,19.0,11.0,825.0


## 6. View Column Data Types

In [7]:
# Use the `dtypes` attribute to list the data type of each column.
# The result is a Series indexed by the current column labels.
nsw_prop_df.dtypes

year                       float64
month                      float64
suburb                      object
medianSoldPrice            float64
numberSold                 float64
highestSoldPrice           float64
lowestSoldPrice            float64
medianSaleListingPrice     float64
numberSaleListing          float64
highestSaleListingPrice    float64
lowestSaleListingPrice     float64
auctionNumberAuctioned     float64
auctionNumberSold          float64
medianRentListingPrice     float64
dtype: object

## 5. Drop Extraneous Columns

In [None]:
# Use the `drop` function to remove a stray 'Unnamed: 0' column if one is present.
# `errors='ignore'` makes this a no-op when the column does not exist, so the
# cell can be re-run safely; reassigning (rather than `inplace=True`) keeps the
# step explicit.
nsw_prop_df = nsw_prop_df.drop(columns=['Unnamed: 0'], errors='ignore')
nsw_prop_df.head()

---

## 6. Identify Data Quality Issues

### 1. Identify the Number of Rows

In [None]:
# Use the `count` function to view the number of non-null values in each column.
# Columns whose count is lower than the others contain missing values.
nsw_prop_df.count()

### 2. Identify Frequency Counts of a Specific Column

In [None]:
# Identify how many rows belong to each value of the `Suburb` column
nsw_prop_df['Suburb'].value_counts()

### 3. Identify Null Values

In [None]:
# Build a boolean mask with the same shape as the DataFrame:
# True where a value is null, False otherwise
nsw_prop_df.isnull()

In [None]:
# Booleans behave as integers in Python (False equals 0, True equals 1),
# which is why summing a boolean null mask yields the number of nulls.
False == 0

### 4. Determine the Number of Nulls

In [None]:
# Determine the number of nulls per column by summing the boolean null mask
nsw_prop_df.isnull().sum()

### 5. Determining the Percentage of Nulls for each Column

In [None]:
# Determine the percentage of nulls per column: the mean of a boolean mask is
# the fraction of True values, so multiplying by 100 gives a percentage
nsw_prop_df.isnull().mean() * 100

### 6. Check for Duplicate Rows

In [None]:
# Use the `duplicated` function to determine the existence of duplicate rows.
# Each row is marked True when it repeats an earlier row in full, else False.
nsw_prop_df.duplicated()

### 7. Check for Duplicate Rows Based on Selected Columns

In [None]:
# Use the `duplicated` function with the `subset` parameter to determine the
# existence of duplicate rows based only on the selected columns. Here a row
# is a duplicate when the same suburb appears twice for the same year and month.
nsw_prop_df.duplicated(subset=['Suburb', 'Year', 'Month'])

---

## 7. Resolve Data Quality Issues

### 1. Fill Null Values in Text Columns with Default Value "Unnamed"

In [None]:
# Cleanse nulls in the text column by filling them with a default value.
# Passing a dict to `fillna` limits the fill to the named column and returns a
# new DataFrame, which avoids assigning into a frame produced by column selection.
nsw_prop_df = nsw_prop_df.fillna({'Suburb': 'Unnamed'})
nsw_prop_df.head()

### 2. Drop Remaining Records with Nulls from DataFrame

In [None]:
# Use the `dropna` function to drop whole records that have at least one null value.
# The result is reassigned instead of using `inplace=True`.
nsw_prop_df = nsw_prop_df.dropna()
nsw_prop_df.head()

### 3. Check Null Counts for Each Column (Again)

In [None]:
# Use the `isnull` function in conjunction with the `sum` function to count the
# number of null values for each column; every count should now be zero
nsw_prop_df.isnull().sum()

### 4. Cleanse data by Dropping Duplicates

In [None]:
# Use the `drop_duplicates` function with the `subset` parameter to drop
# duplicates based on a selection of columns (the first occurrence is kept).
# `drop_duplicates` returns a new DataFrame, so the result must be assigned
# back; otherwise the duplicates are still present when the data is saved.
nsw_prop_df = nsw_prop_df.drop_duplicates(subset=['Suburb', 'Year', 'Month'])
nsw_prop_df.head()

### 5. Convert Columns to Different DataTypes

In [None]:
# Use the `astype` function to convert `Year` and `Month` from `float` to `int`.
# Converting to a plain `int` raises if a column still contains nulls, so this
# step must run after the null-handling cells.
nsw_prop_df = nsw_prop_df.astype({'Year': 'int', 'Month': 'int'})
nsw_prop_df.head()

## 8. Save Cleansed Data to New CSV

In [None]:
# Save the modified DataFrame to the Resources folder, next to the input CSV.
# `with_name` swaps only the file name, so the output lands in the same
# directory the data was read from.
# Use the `index` parameter set to `False` to exclude saving the index.
cleansed_csv_path = csv_path.with_name("nsw_prop_cleansed.csv")
nsw_prop_df.to_csv(cleansed_csv_path, index=False)