# Data Processing

Goal: Understand and clean our data so we can derive better insights

## 1. Import Libraries

In [20]:
import pandas as pd

## 2. Load the Dataset

In [21]:
# Read the listings CSV into a DataFrame; the path is relative, so it is
# resolved against the directory the notebook is run from.
df = pd.read_csv("data/NY-House-Dataset-Small.csv")

In [22]:
# Print the per-column non-null counts and dtypes
df.info()
# Last expression in the cell, so the column index is what gets displayed
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   BROKERTITLE                  1789 non-null   object 
 1   TYPE                         4815 non-null   object 
 2   PRICE                        4815 non-null   int64  
 3   BEDS                         4815 non-null   int64  
 4   BATH                         4815 non-null   float64
 5   PROPERTYSQFT                 4815 non-null   float64
 6   STATE                        4815 non-null   object 
 7   MAIN_ADDRESS                 4815 non-null   object 
 8   ADMINISTRATIVE_AREA_LEVEL_2  2135 non-null   object 
 9   LOCALITY                     4791 non-null   object 
 10  SUBLOCALITY                  4815 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 413.9+ KB


Index(['BROKERTITLE', 'TYPE', 'PRICE', 'BEDS', 'BATH', 'PROPERTYSQFT', 'STATE',
       'MAIN_ADDRESS', 'ADMINISTRATIVE_AREA_LEVEL_2', 'LOCALITY',
       'SUBLOCALITY'],
      dtype='object')

## 3. Handle Duplicates

In [23]:
# Count the rows that exactly repeat an earlier row
df.duplicated().sum()

# Count the rows whose MAIN_ADDRESS repeats an earlier row's address
df.duplicated(subset=["MAIN_ADDRESS"]).sum()

# Select only the repeated rows (the first occurrence of each is not flagged)
df[df.duplicated()]

# Display every member of each duplicate group, ordered by price
df[df.duplicated(keep=False)].sort_values(by="PRICE")

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY
3639,,Co-op for sale,119000,3,1.000000,2184.207862,"Jamaica, NY 11432","89-00 170 St Unit 11NJamaica, NY 11432",New York,Queens County,Queens
3629,,Co-op for sale,119000,3,1.000000,2184.207862,"Jamaica, NY 11432","89-00 170 St Unit 11NJamaica, NY 11432",New York,Queens County,Queens
1520,,Co-op for sale,174000,1,1.000000,800.000000,"Brooklyn, NY 11229","3105 Avenue V Apt 1HBrooklyn, NY 11229",,Kings County,Brooklyn
1522,Brokered by TRACEY REAL ESTATE,Co-op for sale,174000,1,1.000000,800.000000,"Brooklyn, NY 11229","3105 Avenue V Apt 1HBrooklyn, NY 11229",New York,Kings County,Brooklyn
2128,,Co-op for sale,174000,1,1.000000,800.000000,"Brooklyn, NY 11229","3105 Avenue V Apt 1HBrooklyn, NY 11229",,Kings County,Brooklyn
...,...,...,...,...,...,...,...,...,...,...,...
2678,Brokered by Keller Williams Realty NYC Grp,Multi-family home for sale,3200000,3,2.373861,3735.000000,"New York, NY 10035","2117 5th AveNew York, NY 10035",,New York,New York County
3469,,Condo for sale,7600000,4,4.000000,3216.000000,"New York, NY 10007","100 Barclay St Apt 20CNew York, NY 10007",,New York County,New York
3473,,Condo for sale,7600000,4,4.000000,3216.000000,"New York, NY 10007","100 Barclay St Apt 20CNew York, NY 10007",,New York County,New York
2355,,Multi-family home for sale,16995000,5,4.000000,4230.000000,"New York, NY 10014","31 Grove StNew York, NY 10014",,New York,New York County


In [24]:
# Remove exact duplicate rows, retaining the first occurrence of each
df = df.drop_duplicates(keep="first")

In [25]:
# (rows, columns) of df once the duplicate rows have been dropped
df.shape

(4761, 11)

## 4. Handle Missing Data

#### Generally Dropping Data

In [26]:
# Boolean mask: True wherever a cell is null
df.isna()

# Inverse mask: True wherever a cell holds a value
df.notna()

# Per column: does it contain at least one null?
df.isna().any()

# Per row: does it contain at least one null?
df.isna().any(axis="columns")

# Display the rows that contain at least one null
df[df.isna().any(axis="columns")]

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY
2,Brokered by Sowae Corp,House for sale,260000,4,2.000000,2015.000000,"Staten Island, NY 10312","620 Sinclair AveStaten Island, NY 10312",,New York,Richmond County
3,Brokered by COMPASS,Condo for sale,69000,3,1.000000,445.000000,"Manhattan, NY 10022","2 E 55th St Unit 908W33Manhattan, NY 10022",,New York,New York County
4,Brokered by Sotheby's International Realty - E...,Townhouse for sale,55000000,7,2.373861,14175.000000,"New York, NY 10065","5 E 64th StNew York, NY 10065",,New York,New York County
6,,Condo for sale,899500,2,2.000000,2184.207862,"New York, NY 10027","157 W 126th St Unit 1BNew York, NY 10027",,New York County,New York
7,Brokered by Connie Profaci Realty,House for sale,16800000,8,16.000000,33000.000000,"Staten Island, NY 10304","177 Benedict RdStaten Island, NY 10304",,New York,Richmond County
...,...,...,...,...,...,...,...,...,...,...,...
4806,,Multi-family home for sale,1700000,3,7.000000,7854.000000,"Brooklyn, NY 11232","448 40th StBrooklyn, NY 11232",,New York,Kings County
4807,Brokered by Engel & V�������,Co-op for sale,1950000,2,4.000000,2184.207862,"New York, NY 10021","700 Park Ave Unit 20ANew York, NY 10021",,New York County,New York
4811,,Co-op for sale,245000,1,1.000000,2184.207862,"Rego Park, NY 11374","97-40 62 Dr Unit LgRego Park, NY 11374",United States,New York,Queens County
4812,,Co-op for sale,1275000,1,1.000000,2184.207862,"New York, NY 10011","427 W 21st St Unit GardenNew York, NY 10011",United States,New York,New York County


In [27]:
# Preview the frame without any row that contains a null.
# The result is only displayed; df itself is not reassigned.
df.dropna(axis="index")

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY
0,Brokered by Douglas Elliman -111 Fifth Ave,Condo for sale,315000,2,2.000000,1400.000000,"New York, NY 10022","2 E 55th St Unit 803New York, NY 10022",New York County,New York,Manhattan
1,Brokered by Serhant,Condo for sale,195000000,7,10.000000,17545.000000,"New York, NY 10019",Central Park Tower Penthouse-217 W 57th New Yo...,United States,New York,New York County
5,Brokered by Sowae Corp,House for sale,690000,5,2.000000,4004.000000,"Brooklyn, NY 11238","584 Park PlBrooklyn, NY 11238",United States,New York,Kings County
24,Brokered by Elizabeth Marra,House for sale,750000,2,2.000000,1330.000000,"Staten Island, NY 10307","280 Loretto StStaten Island, NY 10307",United States,New York,Richmond County
28,Brokered by Du Chris Realty,Co-op for sale,230000,1,1.000000,2184.207862,"Jackson Heights, NY 11372","33-24 Junction Blvd Unit 6RJackson Heights, NY...",United States,New York,Queens County
...,...,...,...,...,...,...,...,...,...,...,...
4802,Brokered by J Shayovitz Real Estate Corp,Co-op for sale,4750000,3,2.373861,2969.000000,"New York, NY 10128","1175 Park Ave Unit 1CNew York, NY 10128",New York,New York County,New York
4808,Brokered by Brown Harris Stevens,Condo for sale,945000,2,2.000000,903.000000,"Manhattan, NY 10030",2351 Adam Clayton Powell Jr Blvd Apt 614Manhat...,New York,New York County,New York
4809,Brokered by RE MAX Edge,Multi-family home for sale,2999999,15,9.000000,3600.000000,"Brooklyn, NY 11224","2825-2827 W 15th StBrooklyn, NY 11224",United States,New York,Kings County
4810,Brokered by COMPASS,Co-op for sale,599000,1,1.000000,2184.207862,"Manhattan, NY 10075","222 E 80th St Apt 3AManhattan, NY 10075",New York,New York County,New York


In [28]:
# Preview the frame without any column that contains a null.
# The result is only displayed; df itself is not reassigned.
df.dropna(axis="columns")

Unnamed: 0,TYPE,PRICE,BEDS,BATH,PROPERTYSQFT,STATE,MAIN_ADDRESS,SUBLOCALITY
0,Condo for sale,315000,2,2.000000,1400.000000,"New York, NY 10022","2 E 55th St Unit 803New York, NY 10022",Manhattan
1,Condo for sale,195000000,7,10.000000,17545.000000,"New York, NY 10019",Central Park Tower Penthouse-217 W 57th New Yo...,New York County
2,House for sale,260000,4,2.000000,2015.000000,"Staten Island, NY 10312","620 Sinclair AveStaten Island, NY 10312",Richmond County
3,Condo for sale,69000,3,1.000000,445.000000,"Manhattan, NY 10022","2 E 55th St Unit 908W33Manhattan, NY 10022",New York County
4,Townhouse for sale,55000000,7,2.373861,14175.000000,"New York, NY 10065","5 E 64th StNew York, NY 10065",New York County
...,...,...,...,...,...,...,...,...
4810,Co-op for sale,599000,1,1.000000,2184.207862,"Manhattan, NY 10075","222 E 80th St Apt 3AManhattan, NY 10075",New York
4811,Co-op for sale,245000,1,1.000000,2184.207862,"Rego Park, NY 11374","97-40 62 Dr Unit LgRego Park, NY 11374",Queens County
4812,Co-op for sale,1275000,1,1.000000,2184.207862,"New York, NY 10011","427 W 21st St Unit GardenNew York, NY 10011",New York County
4813,Condo for sale,598125,2,1.000000,655.000000,"Elmhurst, NY 11373","91-23 Corona Ave Unit 4GElmhurst, NY 11373",Queens


## 5. Missing Data By Column
Steps:
1. Use Descriptive Statistics to examine data
2. Identify missing values
3. Understand why the data is missing
4. Decide to impute or drop values
5. Document your approach


### Broker Title

In [29]:
# Peek at the first 20 values of the column
df["BROKERTITLE"].head(n=20)

# Count the missing values
df["BROKERTITLE"].isna().sum()

# Share of rows without a broker title, expressed as a percentage
missing_broker_rows = df["BROKERTITLE"].isna().sum()
num_rows = len(df)
print((missing_broker_rows / num_rows) * 100)

62.71791640411678


#### Conclusion

We will drop the column BROKERTITLE as more than 50% of the data is missing and it's not required for the remainder of our analysis. 

In [30]:
# Remove the BROKERTITLE column from the working frame
df = df.drop(columns=["BROKERTITLE"])

# Homework

## Question 1

In [31]:
# Percentage of rows with no ADMINISTRATIVE_AREA_LEVEL_2 value.
# The row count is taken from df right here instead of from a variable set
# in an earlier cell, so the figure stays correct if rows are removed in
# between or the cells are executed in a different order.
missing_rows = df["ADMINISTRATIVE_AREA_LEVEL_2"].isna().sum()
print((missing_rows / len(df)) * 100)

55.66057550934678


### Conclusion 
I would drop the column because more than 50% of the data is missing and it is not a particularly necessary column either.

## Question 2

In [32]:
# Count the LOCALITY values that are null before any placeholder cleanup
df["LOCALITY"].isna().sum()

24

In [33]:
def locality_to_none(word):
    """Turn placeholder locality values into None.

    The values "Na", "-" and "United States" are treated as placeholders
    and become None; every other value is returned unchanged.
    """
    placeholders = ("Na", "-", "United States")
    return None if word in placeholders else word

# Run every LOCALITY value through locality_to_none and store the result back
df["LOCALITY"] = df["LOCALITY"].map(locality_to_none)
# Tally the remaining values (value_counts leaves nulls out by default)
df["LOCALITY"].value_counts()



LOCALITY
New York           2447
New York County     955
Queens County       544
Kings County        453
Bronx County        178
Richmond County      58
Brooklyn              6
Queens                6
The Bronx             3
Flatbush              1
Name: count, dtype: int64

In [34]:
# Count missing LOCALITY values, which now include the placeholders mapped to None
df["LOCALITY"].isna().sum()

110

I would most likely remove the rows with these missing values, as it would be hard to group the data without knowing the locality. If the address is known, you could possibly find the locality that way.

## Question 3

In [43]:
# Frequency of each listed price (not the last expression in the cell, so it is not displayed)
df["PRICE"].value_counts()

def price_to_none(num):
    """Turn a price of zero into None.

    Any value that compares equal to 0 becomes None; every other value is
    returned unchanged.
    """
    return None if num == 0 else num

# Treat a price of 0 as missing: run every PRICE through price_to_none and store the result back
df["PRICE"] = df["PRICE"].map(price_to_none)
df["PRICE"].value_counts()

# Number of listings whose price is now missing (the cell's displayed result)
df["PRICE"].isna().sum()

79

If we were unable to look up the prices of these properties, I would estimate a value to fill in based on the actual prices of surrounding properties with similar characteristics.