# Data Scrubbing

#### 1) Importing libraries and functions.

In [1]:
import pandas as pd
import numpy as np
import Functions
from Functions import remove_dupes, percent_null_df, percent_null_col, df_snapshot, find_nans, determine_dtype, make_ints

#### 2) Reading in the .csv with pandas. Creating a copy to avoid error messages.

In [2]:
# Remote location of the raw kc_house_data.csv file.
DATA_URL = 'https://raw.githubusercontent.com/snepaul179/Real_EstateModel-onl01-dtsc-pt-052620/master/kc_house_data.csv'

# `x` holds the frame exactly as loaded; all cleaning is done on an independent copy.
x = pd.read_csv(DATA_URL)
data = x.copy()

#### 3) Examining the first few rows.

In [3]:
# Preview the first five rows to see which columns are available.
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


#### 4) From the preview of the dataframe, we can see there is a column called 'id'.
We can assume that the id is the identification number. Using id, we can remove all duplicates from the dataframe, because we are presuming that id numbers are unique for each data point.

#### 5) Next, I will use a custom function that uses 'id' as the criterion to remove duplicates, keeping only the first instance. The function also prints how many duplicates were removed and returns the modified dataframe.

In [4]:
# Drop rows whose 'id' repeats, keeping the first occurrence. The helper prints the
# number of rows removed, and its return value is displayed as this cell's output.
# NOTE(review): the returned frame is not assigned back to `data`, so later cells
# presumably rely on remove_dupes modifying `data` in place — confirm in Functions.
remove_dupes(data,'id')

177 duplicates removed. 



Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
2495,1000102,4/22/2015,300000.0,6,3.00,2400,9373,2.0,0.0,0.0,...,7,2400,0.0,1991,0.0,98002,47.3262,-122.214,2060,7316
6729,1200019,5/8/2014,647500.0,4,1.75,2060,26036,1.0,,0.0,...,8,1160,900.0,1947,0.0,98166,47.4444,-122.351,2590,21891
8404,1200021,8/11/2014,400000.0,3,1.00,1460,43000,1.0,0.0,0.0,...,7,1460,0.0,1952,0.0,98166,47.4434,-122.347,2250,20023
8800,2800031,4/1/2015,235000.0,3,1.00,1430,7599,1.5,0.0,0.0,...,6,1010,420.0,1930,0.0,98168,47.4783,-122.265,1290,10320
3553,3600057,3/19/2015,402500.0,4,2.00,1650,3504,1.0,0.0,0.0,...,7,760,890.0,1951,2013.0,98144,47.5803,-122.294,1480,3504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16723,9842300095,7/25/2014,365000.0,5,2.00,1600,4168,1.5,0.0,0.0,...,7,1600,0.0,1927,0.0,98126,47.5297,-122.381,1190,4168
3257,9842300485,3/11/2015,380000.0,2,1.00,1040,7372,1.0,0.0,0.0,...,7,840,200.0,1939,0.0,98126,47.5285,-122.378,1930,5150
7614,9842300540,6/24/2014,339000.0,3,1.00,1100,4128,1.0,0.0,0.0,...,7,720,380.0,1942,,98126,47.5296,-122.379,1510,4538
20963,9895000040,7/3/2014,399900.0,2,1.75,1410,1005,1.5,0.0,0.0,...,9,900,510.0,2011,0.0,98027,47.5446,-122.018,1440,1188


#### 6) Next, I want to set the index to id.

In [5]:
# Use the 'id' column as the row index of the working frame.
data = data.set_index('id')

#### 7) Now, I will run the function find_nans, to identify the columns with nan values, and return the number of nans as well as a list of the unique values, and number of unique values.

In [6]:
# For every column that contains missing values, report the NaN count, the number
# of unique values, and the list of unique values.
find_nans(data)

waterfront has  2345 NaN values.
waterfront has  3 unique values.
[0.0, nan, 1.0] 

view has  63 NaN values.
view has  6 unique values.
[0.0, 1.0, 2.0, 3.0, 4.0, nan] 

yr_renovated has  3813 NaN values.
yr_renovated has  71 unique values.
[0.0, 1934.0, 1944.0, 1945.0, 1946.0, 1948.0, 1950.0, 1951.0, 1953.0, 1954.0, 1956.0, 1957.0, 1958.0, 1959.0, 1962.0, 1963.0, 1964.0, 1965.0, 1967.0, 1968.0, 1969.0, 1971.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1980.0, 1981.0, 1984.0, 1985.0, 1987.0, 1988.0, 1993.0, 1998.0, 2002.0, 2008.0, 2009.0, 2011.0, 2013.0, nan, 1940.0, 1955.0, 1960.0, 1970.0, 1972.0, 1973.0, 1979.0, 1982.0, 1983.0, 1986.0, 1989.0, 1990.0, 1991.0, 1992.0, 1994.0, 1995.0, 1996.0, 1997.0, 1999.0, 2000.0, 2001.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2010.0, 2012.0, 2014.0, 2015.0] 



#### 8) In the 'waterfront' column, 2000+ values are missing. 
- A) I will segment the dataframe to exclude 'waterfront' because the data is unavailable. 
- B) I will make another dataframe that includes only values for which waterfront data is included. 
- C) I suspect that if we were to inspect the data by zip code, it would be present only in certain coastal zipcodes and not in landlocked zip codes. I think there is potential to do a nice visualization with a choropleth map, as well as examining the features of each zipcode further.

In [7]:
# Rows where a waterfront value was actually recorded go into their own frame.
waterfront_known = data['waterfront'].notna()
wf_data = data.loc[waterfront_known]

# The main working frame carries on without the 'waterfront' column.
data = data.drop('waterfront', axis=1)

#### 9) 'view' only has 63 NaNs, so we can probably eliminate those rows from the data. 
- A) But just for fun, I'll also keep a dataframe with the 'view' column dropped (and all rows intact), in case we are working on an analysis in which view is unimportant.

In [8]:
# Variant of the frame without the 'view' column; every row is retained.
data_wo_view = data.drop(columns=['view'])

# In the main frame, keep only the rows that have a 'view' value.
data = data.loc[data['view'].notna()]

#### 10) Need to look further into 'yr_renovated.'
- A) 'yr_renovated' has a few values that are strange. There are some that have a 0, which I believe indicates they have never been renovated. I'm curious as to why some are NaNs and some are zeroes. I'm going to test and see if this has something to do with the age of the home.

In [9]:
# Earliest and latest construction year among homes whose renovation year is missing.
yr_reno_nans = data.loc[data['yr_renovated'].isna()]
nan_built_years = yr_reno_nans['yr_built']
print(nan_built_years.min(), nan_built_years.max(), sep='\n')

1900
2015


- B) There does not seem to be a relationship with the age, as all ages seem to be represented. Now, I'll take a look at the subset of the data where yr_renovated is 0 rather than NaN.

In [10]:
# Earliest and latest construction year among homes whose renovation year is exactly 0.
year_reno_zeroes = data.loc[data['yr_renovated'].eq(0.0)]
for year_bound in (year_reno_zeroes['yr_built'].min(), year_reno_zeroes['yr_built'].max()):
    print(year_bound)

1900
2015


- C) Perhaps there is a relationship with zipcode?

In [11]:
# Rebuild both subsets (missing renovation year vs. renovation year of 0) ...
yr_reno_nans = data.loc[data['yr_renovated'].isna()]
year_reno_zeroes = data.loc[data['yr_renovated'] == 0.0]

# ... and print how many distinct zip codes each one covers.
print(yr_reno_nans['zipcode'].nunique(dropna=False))
print(year_reno_zeroes['zipcode'].nunique(dropna=False))

70
70


- D) I don't see a relationship between them. So for now, I will fill the NaN values in 'yr_renovated' with 0, treating those homes the same as homes with no recorded renovation, so the column can still be used if we need to analyze a relationship between renovations and price. 

In [12]:
# Replace missing renovation years with 0.0, the same value already used for
# homes with no recorded renovation.
data = data.assign(yr_renovated=data['yr_renovated'].fillna(0.0))

#### 11) A quick check to make sure all the NaNs are gone.

In [13]:
# Re-run the NaN report on the cleaned frame; it is expected to list no columns now.
find_nans(data)

#### 12) Next, I'm going to use a function called determine_dtype to print an organized summary of what kinds of data types we are dealing with, so we can make decisions about data types that should be changed.

In [14]:
# Print the column names grouped by dtype (objects, integers, floats).
determine_dtype(data)

Objects: 
 ['date', 'sqft_basement'] 

Integers: 
 ['bedrooms', 'sqft_living', 'sqft_lot', 'condition', 'grade', 'sqft_above', 'yr_built', 'zipcode', 'sqft_living15', 'sqft_lot15'] 

Floats:
 ['price', 'bathrooms', 'floors', 'view', 'yr_renovated', 'lat', 'long'] 



- A) I only need to change data types if something is an object that shouldn't be an object, or a float when it doesn't need to be. I take a look at head one more time to get a sense of what the values look like.

In [15]:
# Look at the first five rows again to judge which columns need a dtype change.
data.head(n=5)

Unnamed: 0_level_0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1000102,4/22/2015,300000.0,6,3.0,2400,9373,2.0,0.0,3,7,2400,0.0,1991,0.0,98002,47.3262,-122.214,2060,7316
1200019,5/8/2014,647500.0,4,1.75,2060,26036,1.0,0.0,4,8,1160,900.0,1947,0.0,98166,47.4444,-122.351,2590,21891
1200021,8/11/2014,400000.0,3,1.0,1460,43000,1.0,0.0,3,7,1460,0.0,1952,0.0,98166,47.4434,-122.347,2250,20023
2800031,4/1/2015,235000.0,3,1.0,1430,7599,1.5,0.0,4,6,1010,420.0,1930,0.0,98168,47.4783,-122.265,1290,10320
3600057,3/19/2015,402500.0,4,2.0,1650,3504,1.0,0.0,3,7,760,890.0,1951,2013.0,98144,47.5803,-122.294,1480,3504


- B) It looks like sqft_basement shouldn't be an object, and it would probably be more readable if price, view, and yr_renovated were integers instead of floats. I can use one function, make_ints, to make both objects and floats integers, by feeding it the list of columns I want to change.

In [16]:
# Columns to convert to integers: 'sqft_basement' is currently an object column,
# the other three are floats.
cols = ['price', 'view', 'sqft_basement', 'yr_renovated']
# NOTE(review): the return value is only displayed, not assigned back to `data`, so
# later cells presumably depend on make_ints converting `data` in place — confirm
# in Functions.
make_ints(data,cols)

Unnamed: 0_level_0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1000102,4/22/2015,300000,6,3.00,2400,9373,2.0,0,3,7,2400,0,1991,0,98002,47.3262,-122.214,2060,7316
1200019,5/8/2014,647500,4,1.75,2060,26036,1.0,0,4,8,1160,900,1947,0,98166,47.4444,-122.351,2590,21891
1200021,8/11/2014,400000,3,1.00,1460,43000,1.0,0,3,7,1460,0,1952,0,98166,47.4434,-122.347,2250,20023
2800031,4/1/2015,235000,3,1.00,1430,7599,1.5,0,4,6,1010,420,1930,0,98168,47.4783,-122.265,1290,10320
3600057,3/19/2015,402500,4,2.00,1650,3504,1.0,0,3,7,760,890,1951,2013,98144,47.5803,-122.294,1480,3504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9842300095,7/25/2014,365000,5,2.00,1600,4168,1.5,0,3,7,1600,0,1927,0,98126,47.5297,-122.381,1190,4168
9842300485,3/11/2015,380000,2,1.00,1040,7372,1.0,0,5,7,840,200,1939,0,98126,47.5285,-122.378,1930,5150
9842300540,6/24/2014,339000,3,1.00,1100,4128,1.0,0,4,7,720,380,1942,0,98126,47.5296,-122.379,1510,4538
9895000040,7/3/2014,399900,2,1.75,1410,1005,1.5,0,3,9,900,510,2011,0,98027,47.5446,-122.018,1440,1188


#### 13) Now that the data is cleaned up a bit, I want to create a few more features. 
- A) I will use 'basement' to indicate 0 for no basement and 1 for basement, 
- B) 'reno' to indicate 1 for renovation and 0 for no renovation. 
- C) 'yrs_since_reno' to indicate how many years after construction the renovation took place. Houses without renovation data will read the year built.
- D) I will also include another column called 'yr_sold' to isolate the year sold as an integer so I can create another column called 'age_at_sale' that indicates the age of the home on the sale date.

In [17]:
# Binary flags: 1 when the home has basement square footage / a non-zero
# renovation year, otherwise 0.
data['basement'] = data['sqft_basement'].gt(0).astype('int64')
data['reno'] = data['yr_renovated'].gt(0).astype('int64')

# Absolute gap between the renovation year and the build year. For rows where
# yr_renovated is 0 this evaluates to yr_built itself.
data['yrs_since_reno'] = (data['yr_renovated'] - data['yr_built']).abs()

# Sale year is the text after the last '/' of the M/D/YYYY date string.
# Slicing at a fixed offset (date.str[4:]) is wrong for two-digit month plus
# two-digit day: '10/13/2014' became '3/2014' and then 32014. It also relied on
# str.replace treating the pattern as a regex, which is no longer the default.
data['yr_sold'] = data['date'].str.rsplit('/', n=1).str[-1].astype(int)

# Age of the home, in years, at the time of sale.
data['age_at_sale'] = data['yr_sold'] - data['yr_built']

#### 14) I will arrange the columns in a more readable way by putting like-items near each other.

In [18]:
# Desired column order, with related features grouped together.
# Note: this rebinds `x`, which previously held the raw frame returned by read_csv.
x = [
    'price', 'date', 'yr_sold', 'yr_built', 'age_at_sale',
    'reno', 'yr_renovated', 'yrs_since_reno',
    'condition', 'grade', 'view', 'floors', 'bedrooms', 'bathrooms', 'basement',
    'sqft_lot', 'sqft_living', 'sqft_basement', 'sqft_above',
    'sqft_living15', 'sqft_lot15',
    'lat', 'long', 'zipcode',
]

# reindex returns a new frame rather than reordering in place, so the result must
# be assigned back; otherwise `data` (and the CSV written from it) keeps the old order.
data = data.reindex(columns=x)

# Display the reordered frame as the cell output.
data

Unnamed: 0_level_0,price,date,yr_sold,yr_built,age_at_sale,reno,yr_renovated,yrs_since_reno,condition,grade,...,basement,sqft_lot,sqft_living,sqft_basement,sqft_above,sqft_living15,sqft_lot15,lat,long,zipcode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000102,300000,4/22/2015,2015,1991,24,0,0,1991,3,7,...,0,9373,2400,0,2400,2060,7316,47.3262,-122.214,98002
1200019,647500,5/8/2014,2014,1947,67,0,0,1947,4,8,...,1,26036,2060,900,1160,2590,21891,47.4444,-122.351,98166
1200021,400000,8/11/2014,2014,1952,62,0,0,1952,3,7,...,0,43000,1460,0,1460,2250,20023,47.4434,-122.347,98166
2800031,235000,4/1/2015,2015,1930,85,0,0,1930,4,6,...,1,7599,1430,420,1010,1290,10320,47.4783,-122.265,98168
3600057,402500,3/19/2015,2015,1951,64,1,2013,62,3,7,...,1,3504,1650,890,760,1480,3504,47.5803,-122.294,98144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9842300095,365000,7/25/2014,2014,1927,87,0,0,1927,3,7,...,0,4168,1600,0,1600,1190,4168,47.5297,-122.381,98126
9842300485,380000,3/11/2015,2015,1939,76,0,0,1939,5,7,...,1,7372,1040,200,840,1930,5150,47.5285,-122.378,98126
9842300540,339000,6/24/2014,2014,1942,72,0,0,1942,4,7,...,1,4128,1100,380,720,1510,4538,47.5296,-122.379,98126
9895000040,399900,7/3/2014,2014,2011,3,0,0,2011,3,9,...,1,1005,1410,510,900,1440,1188,47.5446,-122.018,98027


In [19]:
# Write the cleaned frame to disk; the 'id' index is saved as the first column.
data.to_csv(path_or_buf='kc_re_data.csv')