# Homework 6: Data Preprocessing

In [1]:
import pandas as pd 
import numpy as np
import os
import sys
from pathlib import Path

# Data folders are resolved relative to the notebook's working directory:
# <cwd>/../data/processed and <cwd>/../data/raw.
processed_path = Path.cwd().parent / "data" / "processed"
raw_path = Path.cwd().parent / "data" / "raw"

# Small synthetic sample with deliberately missing values (np.nan) in the
# numeric columns so the cleaning steps further down have something to fix.
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder (an existing file is never overwritten)
csv_path = os.path.join(str(raw_path), 'sample_data.csv')
if not os.path.exists(csv_path):
    # to_csv does not create missing parent folders, so make sure data/raw
    # exists; otherwise this cell fails on a fresh checkout.
    raw_path.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')

File already exists at /Users/svolety/Desktop/bootcamp IV/bootcamp/homework/homework6/data/raw/sample_data.csv. Skipping CSV creation to avoid overwrite.


In [2]:
# Load the raw sample. The path is joined with pathlib instead of a hard-coded
# '/' separator, and zipcode is read as text: it is an identifier, and letting
# read_csv infer an integer dtype would silently drop any leading zeros.
df = pd.read_csv(raw_path / 'sample_data.csv', dtype={'zipcode': str})
df.head()

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,


In [3]:
# Show the complete frame rather than just head(); the recorded output has only 7 rows.
df

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,
5,,,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,


In [8]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in one notebook output.

    Every positional argument is converted with ``to_html()``; the opening
    ``<table`` tag of each result gets an inline style so the tables sit on a
    single row instead of being stacked vertically.

    Args:
        *args: Objects exposing ``to_html()`` (pandas DataFrames in this notebook).

    Returns:
        None. The combined markup is passed to ``display_html(..., raw=True)``.
    """
    inline_open_tag = '<table style="display:inline;margin-right:20px"'
    # Rewrite only the opening tag of each table. Replacing the bare word
    # 'table' would also turn '</table>' into a closing tag with attributes
    # (invalid HTML) and corrupt any cell text that contains 'table'.
    html_str = ''.join(
        frame.to_html().replace('<table', inline_open_tag, 1)
        for frame in args
    )
    display_html(html_str, raw=True)

In [9]:
# Make <cwd>/../src importable so the project-local `cleaning` module resolves.
src_path = Path.cwd().parent / "src"
if str(src_path) not in sys.path:
    # Guarded so re-running the cell does not keep appending duplicate entries.
    sys.path.append(str(src_path))
# Import the helpers by name instead of `import *`, so it is clear where each
# function used in this notebook comes from.
from cleaning import drop_missing, fill_missing_median, normalize_data

# With threshold 0.5 the recorded output shows `extra_data` (5 of 7 values
# missing) removed while every row is kept; the exact threshold semantics are
# defined in cleaning.drop_missing -- NOTE(review): confirm there.
df_missing_clean = drop_missing(df, 0.5)
display_side_by_side(df, df_missing_clean)

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,
5,,,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,

Unnamed: 0,age,income,score,zipcode,city
0,34.0,55000.0,0.82,90210,Beverly
1,45.0,,0.91,10001,New York
2,29.0,42000.0,,60614,Chicago
3,50.0,58000.0,0.76,94103,SF
4,38.0,,0.88,73301,Austin
5,,,0.65,12345,Unknown
6,41.0,49000.0,0.79,94105,San Francisco


In [11]:
# Fill the remaining gaps in the numeric columns (the recorded output matches
# each column's median) and show the frame before and after.
median_fill_columns = ['income', 'score', 'age']
df_fill_median = fill_missing_median(df_missing_clean, median_fill_columns)
display_side_by_side(df_missing_clean, df_fill_median)

Unnamed: 0,age,income,score,zipcode,city
0,34.0,55000.0,0.82,90210,Beverly
1,45.0,,0.91,10001,New York
2,29.0,42000.0,,60614,Chicago
3,50.0,58000.0,0.76,94103,SF
4,38.0,,0.88,73301,Austin
5,,,0.65,12345,Unknown
6,41.0,49000.0,0.79,94105,San Francisco

Unnamed: 0,age,income,score,zipcode,city
0,34.0,55000.0,0.82,90210,Beverly
1,45.0,52000.0,0.91,10001,New York
2,29.0,42000.0,0.805,60614,Chicago
3,50.0,58000.0,0.76,94103,SF
4,38.0,52000.0,0.88,73301,Austin
5,39.5,52000.0,0.65,12345,Unknown
6,41.0,49000.0,0.79,94105,San Francisco


In [12]:
# Rescale income and score (the recorded output shows both columns mapped onto
# the 0-1 range) and show the frame before and after.
columns_to_scale = ['income', 'score']
df_normalize = normalize_data(df_fill_median, columns_to_scale)
display_side_by_side(df_fill_median, df_normalize)

Unnamed: 0,age,income,score,zipcode,city
0,34.0,55000.0,0.82,90210,Beverly
1,45.0,52000.0,0.91,10001,New York
2,29.0,42000.0,0.805,60614,Chicago
3,50.0,58000.0,0.76,94103,SF
4,38.0,52000.0,0.88,73301,Austin
5,39.5,52000.0,0.65,12345,Unknown
6,41.0,49000.0,0.79,94105,San Francisco

Unnamed: 0,age,income,score,zipcode,city
0,34.0,0.8125,0.653846,90210,Beverly
1,45.0,0.625,1.0,10001,New York
2,29.0,0.0,0.596154,60614,Chicago
3,50.0,1.0,0.423077,94103,SF
4,38.0,0.625,0.884615,73301,Austin
5,39.5,0.625,0.0,12345,Unknown
6,41.0,0.4375,0.538462,94105,San Francisco


In [16]:
# Persist the cleaned and rescaled frame under data/processed.
csv_path_processed = os.path.join(str(processed_path), 'sample_processed_data.csv')

# An existing file is never overwritten, so delete it manually to regenerate
# the output after changing any of the cleaning steps above.
if not os.path.exists(csv_path_processed):
    # to_csv does not create missing parent folders, so make sure
    # data/processed exists before writing.
    processed_path.mkdir(parents=True, exist_ok=True)
    df_normalize.to_csv(csv_path_processed, index=False)
    print(f'Sample dataset created and saved to {csv_path_processed}')
else:
    print(f'File already exists at {csv_path_processed}. Skipping CSV creation to avoid overwrite.')

Sample dataset created and saved to /Users/svolety/Desktop/bootcamp IV/bootcamp/homework/homework6/data/processed/sample_processed_data.csv
