In [6]:
import os
import pandas as pd
import numpy as np

# Define folder paths relative to this notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Create folders if they don't exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Define the sample data
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')


File already exists at ../data/raw\sample_data.csv. Skipping CSV creation to avoid overwrite.


In [10]:
import pandas as pd
from src import cleaning as cl

In [11]:
df = pd.read_csv('../data/raw/sample_data.csv')
df.head()

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,


In [13]:
df1 = cl.fill_missing_median(df)
print("Filled Missing:\n", df1)


df2 = cl.drop_missing(df, thresh=0.3)
print("Dropped Missing:\n", df2)

df3 = cl.normalize_data(df1)
print("Normalized:\n", df3)

Filled Missing:
     age   income  score  zipcode           city  extra_data
0  34.0  55000.0  0.820    90210        Beverly        23.5
1  45.0  52000.0  0.910    10001       New York        42.0
2  29.0  42000.0  0.805    60614        Chicago        23.5
3  50.0  58000.0  0.760    94103             SF        23.5
4  38.0  52000.0  0.880    73301         Austin        23.5
5  39.5  52000.0  0.650    12345        Unknown         5.0
6  41.0  49000.0  0.790    94105  San Francisco        23.5
Dropped Missing:
     age  score  zipcode           city
0  34.0   0.82    90210        Beverly
1  45.0   0.91    10001       New York
3  50.0   0.76    94103             SF
4  38.0   0.88    73301         Austin
6  41.0   0.79    94105  San Francisco
Normalized:
         age  income     score   zipcode           city  extra_data
0  0.238095  0.8125  0.653846  0.953688        Beverly         0.5
1  0.761905  0.6250  1.000000  0.000000       New York         1.0
2  0.000000  0.0000  0.596154  0.6017

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

In [14]:
df.to_csv('../data/processed/sample_data_cleaned.csv', index=False)