In [1]:
# imports

import numpy as np
import pandas as pd

In [48]:
# Load the housing dataset from a CSV file located in the working directory.
df = pd.read_csv("housing.csv")

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(20640, 10)

In [4]:
# Preview the first rows to see which columns exist and what their values look like.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Inspect the dtype of one column; get_range and get_set below pick columns by dtype.
df['households'].dtype

dtype('float64')

In [13]:

"""
    This module helps the user clean data, e.g. by handling bad values and filling null values.
"""

def get_range(df):
    """
    Calculate the range (minimum and maximum) of every numerical column.

    Any numeric dtype is considered (float64, float32, integers, ...);
    boolean and non-numeric columns are skipped.

    Args:
        df: A data frame
    Returns:
        range_df: A dataframe indexed by column name with a 'min' and a 'max'
                  column. It is empty (but still has both columns) when df
                  has no numeric columns.
    """
    names = []
    bounds = []
    for col in df:
        dtype = df[col].dtype
        # pandas reports bool as numeric, but a min/max "range" of flags is not useful.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            names.append(col)
            bounds.append((df[col].min(), df[col].max()))
    # Passing the column labels explicitly keeps 'min'/'max' present even with no rows.
    return pd.DataFrame(bounds, index=names, columns=['min', 'max'])

In [14]:
# Build the per-column min/max table with get_range.
range_num = get_range(df)

In [15]:
# Display the min/max table.
range_num

Unnamed: 0,min,max
longitude,-124.35,-114.31
latitude,32.54,41.95
housing_median_age,1.0,52.0
total_rooms,2.0,39320.0
total_bedrooms,1.0,6445.0
population,3.0,35682.0
households,1.0,6082.0
median_income,0.4999,15.0001
median_house_value,14999.0,500001.0


In [16]:
# NOTE: get_range adds little here, because df.describe() already reports each column's min and max.

In [20]:
def get_set(df):
    """
    Return the unique values present in each categorical column.

    Columns with object, string or category dtype are treated as categorical.

    Args:
        df: A dataframe
    Returns:
        set_df: A dataframe with one column per categorical column of df,
                holding that column's unique values in order of appearance.
                Columns with fewer unique values than the longest one are
                padded with NaN at the bottom.
    """
    unique_values = {}
    for col in df:
        dtype = df[col].dtype
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_categorical:
            # Wrap the list in a Series: a dict of plain lists with different
            # lengths makes the DataFrame constructor raise ValueError, whereas
            # Series are aligned on their index and padded with NaN.
            unique_values[col] = pd.Series(df[col].unique().tolist(), dtype=object)
    return pd.DataFrame(unique_values)

In [21]:
# Collect the unique values of the categorical columns with get_set.
set_cat = get_set(df)

In [22]:
# Display the unique categorical values.
set_cat

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,<1H OCEAN
2,INLAND
3,NEAR OCEAN
4,ISLAND


In [23]:
# TODO: get_set should instead return a dict keyed by column holding the value counts of each category (groupby-style).

In [64]:
def range_constraints(df, range_dict, except_dict):
    """
        Replace invalid numerical values with nulls.

        Two kinds of bad values are handled: values that fall outside an allowed
        range, and specific values that must never appear in a column. The input
        dataframe is not modified; a cleaned deep copy is returned.

        args:
            df: DataFrame
            range_dict: range constraints for numerical values like => {col: (1, 100)}.
                        Values with val < min or val > max are set to null, so the
                        values that are kept satisfy 1 <= val <= 100 (bounds inclusive).
            except_dict: The values which shouldn't be present in the columns, like
                         => {col: 0} or {col: [0, -1]}. For example 0 is an unacceptable
                         value in an "area of the house" column. Those are set to null.
        returns:
            df: the modified copy of the dataframe. Integer columns in which a value
                was nulled come back as a dtype that can hold nulls.
        raises:
            KeyError: if a key of range_dict / except_dict is not a column of df.
            TypeError: if a range_dict entry is not a (min, max) pair.

        Non-numerical (and boolean) columns named in either dict are left untouched.
    """
    d_df = df.copy(deep=True)

    def _is_numeric(col):
        # bool is reported as numeric by pandas but is not a quantity to range-check.
        dtype = d_df[col].dtype
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    for col in range_dict:
        if not _is_numeric(col):
            continue
        bounds = range_dict[col]
        try:
            lower, upper = bounds[0], bounds[1]
        except (TypeError, IndexError) as exc:
            raise TypeError(
                f"range_dict[{col!r}] must be a (min, max) pair, got {bounds!r}"
            ) from exc
        out_of_range = (d_df[col] < lower) | (d_df[col] > upper)
        # mask() nulls the flagged rows and upcasts integer columns when needed.
        d_df[col] = d_df[col].mask(out_of_range)

    for col in except_dict:
        if not _is_numeric(col):
            continue
        forbidden = except_dict[col]
        if isinstance(forbidden, (list, tuple, set, frozenset)):
            is_forbidden = d_df[col].isin(list(forbidden))
        else:
            is_forbidden = d_df[col] == forbidden
        d_df[col] = d_df[col].mask(is_forbidden)

    return d_df

In [65]:
# (min, max) bounds per column, passed to range_constraints. Every entry must be a
# two-element pair: longitude spans -180..180 degrees and latitude -90..90 degrees.
range_data = {'longitude': (-180, 180), 'latitude': (-90, 90)}
# Values that are not acceptable in these columns and should be treated as missing.
except_data = {'housing_median_age':0, 'population':0, 'households':0, 'median_income':0, 'median_house_value':0}

In [66]:
# Apply the constraints; range_constraints works on a deep copy, so df itself is left unmodified.
d_df = range_constraints(df, range_data, except_data)

In [67]:
# Original frame, displayed for comparison with the cleaned copy d_df.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [68]:
# Cleaned copy returned by range_constraints.
d_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
