## Data Extract, Transform, Load

In [897]:
# If you ever want to reset all your variables, you can use %reset -f by uncommenting and running the next line
#%reset -f

In [898]:
# Import Dependencies
import pandas as pd
import numpy as np

### Clean Public Record Data

In [899]:
# Read in file 
Pub_Rec_df = pd.read_csv("Resources_raw/Pub_Rec.csv")
#Pub_Rec_df

In [900]:
# Drop NaN columns and rows
Pub_Rec_df = Pub_Rec_df.dropna(axis=1,how='all').dropna(axis=0,how='all')
#Pub_Rec_df

In [901]:
# Move foreign key (TaxID) to second column
second_column = Pub_Rec_df.pop("TaxID")
Pub_Rec_df.insert(1,"TaxID", second_column)
#Pub_Rec_df

In [902]:
# Check data types
Pub_Rec_df.dtypes

MLS Number                   object
TaxID                        object
PropertyAddressFormatted     object
PropertyCityState            object
Zipcode                       int64
Zip4                        float64
CarrierRoute                 object
PropDoNotMail                object
OwnerNames                   object
OwnerLastName                object
OwnerFirstName               object
Owner2LastName               object
Owner2FirstName              object
Owner3LastName               object
Owner3FirstName              object
OwnerAddress                 object
OwnerCityState               object
OwnerZipCode                float64
OwnerZip4                     int64
OwnerCarrierRoute            object
OwnerOccupied                  bool
Municipality                 object
SubdivisionNeighborhood      object
TaxIDAlt                     object
TaxMap                      float64
Block                        object
Lot                          object
SchoolDistrict              

In [903]:
# Convert data types, rename columns and reformat as needed, 
Pub_Rec_df.rename(columns = {'MLS Number':'MLSNumber'}, inplace=True)
Pub_Rec_df.rename(columns = {'PropertyAddressFormatted':'Address-truncated'}, inplace=True)
Pub_Rec_df.rename(columns = {'TaxID':'Tax_ID'}, inplace=True)
Pub_Rec_df['Zipcode'] = Pub_Rec_df['Zipcode'].astype(str)
Pub_Rec_df.rename(columns = {'Zipcode':'Zip_Code'}, inplace=True)
Pub_Rec_df["Zip4"] = Pub_Rec_df["Zip4"].map("{:.0f}".format)
Pub_Rec_df['Zip4'] = Pub_Rec_df['Zip4'].astype(str)
Pub_Rec_df['OwnerZipCode'] = Pub_Rec_df['OwnerZipCode'].astype(str)
Pub_Rec_df['OwnerZip4'] = Pub_Rec_df['OwnerZip4'].astype(str)
Pub_Rec_df['TaxMap'] = Pub_Rec_df['TaxMap'].astype(str)
Pub_Rec_df["DeedRecordDate"] =  pd.to_datetime(Pub_Rec_df["DeedRecordDate"], format="%m/%d/%y")
Pub_Rec_df["SettleDate"] =  pd.to_datetime(Pub_Rec_df["SettleDate"], format="%m/%d/%y")
Pub_Rec_df["YearBuilt"] = Pub_Rec_df["YearBuilt"].map("{:.0f}".format)
Pub_Rec_df["YearRemod"] = Pub_Rec_df["YearRemod"].map("{:.0f}".format)
Pub_Rec_df.dtypes

MLSNumber                          object
Tax_ID                             object
Address-truncated                  object
PropertyCityState                  object
Zip_Code                           object
Zip4                               object
CarrierRoute                       object
PropDoNotMail                      object
OwnerNames                         object
OwnerLastName                      object
OwnerFirstName                     object
Owner2LastName                     object
Owner2FirstName                    object
Owner3LastName                     object
Owner3FirstName                    object
OwnerAddress                       object
OwnerCityState                     object
OwnerZipCode                       object
OwnerZip4                          object
OwnerCarrierRoute                  object
OwnerOccupied                        bool
Municipality                       object
SubdivisionNeighborhood            object
TaxIDAlt                          

In [904]:
# Show clean dataframe
Pub_Rec_df

Unnamed: 0,MLSNumber,Tax_ID,Address-truncated,PropertyCityState,Zip_Code,Zip4,CarrierRoute,PropDoNotMail,OwnerNames,OwnerLastName,...,BldgSqFtTotal,Stories,Bedrooms,Exterior,BsmtDesc,FireplaceTotal,GrgType,HeatDelivery,YearBuilt,YearRemod
0,DENC518086,08-038.30-119,2615 Pecksniff,"Wilmington, DE",19808,3026,C010,N,James Robinson,Robinson,...,1875.0,1.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,Att/BuiltIn/Bsmt,Hot Water/Steam,1958,0
1,DENC518982,08-036.10-081,4938 S Tupelo,"Wilmington, DE",19808,1026,C009,N,Xiaopeng Deng,Deng,...,1575.0,2.0,3.0,"Brick, Aluminum, Vinyl",Finished,0.0,,Hot/Warm Air,1976,0
2,DENC512992,08-044.30-363,15 Kristina,"Wilmington, DE",19808,4063,C084,N,Robert F Walls,Walls,...,,2.0,2.0,"Aluminum, Vinyl",,0.0,,Heat Pump,1985,0
3,DENC512104,08-036.40-376,3251 Champions,"Wilmington, DE",19808,2601,C039,N,Michael J Downs,Downs,...,,2.0,2.0,Other,,1.0,Att/BuiltIn/Bsmt,Heat Pump,1985,0
4,DENC503480,07-041.10-071,3706 Lafayette,"Wilmington, DE",19808,6014,C001,N,Maria Corona,Corona,...,,1.0,3.0,Asbestos,,0.0,,Hot/Warm Air,1957,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,08-020.40-046,101 Meriden,"Hockessin, DE",19707,1704,R004,N,Michael J N Generations,Generations,...,2925.0,1.0,3.0,"Frame, Siding - Wood, Stone",,1.0,,Hot/Warm Air,1958,0
5572,DENC520392,07-010.00-054,417 Snuff Mill Hill,"Hockessin, DE",19707,9638,R002,N,Jacqueline G Truluck & Breck L Robinson,Truluck,...,3700.0,2.0,4.0,"Siding - Wood, Frame",Finished,2.0,Att/BuiltIn/Bsmt,Heat Pump,1973,0
5573,DENC2014038,08-012.10-027,400 Wilson,"Hockessin, DE",19707,9207,R001,N,Kevin K Thomas,Thomas,...,2850.0,2.0,4.0,"Brick, Aluminum, Vinyl",Finished,1.0,Att/BuiltIn/Bsmt,Hot/Warm Air,1989,0
5574,DENC2005484,08-004.30-016,918 Old Public,"Hockessin, DE",19707,9679,R011,N,Lawrence P Murphy,Murphy,...,,2.0,4.0,"Aluminum, Vinyl, Brick",,1.0,Att/BuiltIn/Bsmt,Hot/Warm Air,1992,0


### Clean Broker Data

In [905]:
# Read in file
broker_df = pd.read_csv("Resources_raw/Broker_Data.csv")
#broker_df.head()

In [906]:
# Drop unnecssary columns and NaN rows and columns: show clean dataframe
broker_df = broker_df.drop(columns=['ListOfficePhone','ListAgentPhone','ListAgentEmail','AgentRemarks','Showing Instructions'])\
    .dropna(axis=1,how='all').dropna(axis=0,how='all')
broker_df

Unnamed: 0,MLSNumber,Address,ListOfficeName,ListAgentName
0,DENC518086,2615 Pecksniff Rd,Patterson-Schwartz-Middletown,Dianne Platt
1,DENC518982,4938 S Tupelo Turn,"Long & Foster Real Estate, Inc.",Andrew White
2,DENC512992,15 Kristina Ct,"Long & Foster Real Estate, Inc.",Diane W Bacigalupi
3,DENC512104,3251 Champions Dr,"Long & Foster Real Estate, Inc.",Barbara Carpenter
4,DENC503480,3706 Lafayette St,Meyer & Meyer Realty,Peggy Cushing
...,...,...,...,...
5571,DENC520114,101 Meriden Dr,BHHS Fox & Roach-Kennett Sq,Joelle C Waterkotte
5572,DENC520392,417 Snuff Mill Hill Rd,BHHS Fox & Roach-Greenville,Sharon Satterfield
5573,DENC2014038,400 Wilson Ct,BHHS Fox & Roach - Hockessin,Jeff Bollinger
5574,DENC2005484,918 Old Public Rd,"EXP Realty, LLC",Andrew Joseph Szczerba


In [907]:
broker_df.dtypes

MLSNumber         object
Address           object
ListOfficeName    object
ListAgentName     object
dtype: object

### Clean ID_Table

In [908]:
# Read in file
ID_table_df = pd.read_csv("Resources_raw/ID_table.csv")
#ID_table_df

In [909]:
# Delete unnessary columns and drop NaN rows and columns, reformat as needed
ID_table_df = ID_table_df.drop(columns=['Township','OriginatingMLS ','CrossStreet', 'FloorNumber','Ownership','Senior Community YN','Condo/Coop Assoc YN','HOA YN','AssociationFee','AssociationFeeFrequency','Structure Type ','StreetNumber','StreetName','UnitNumber'])\
     .dropna(axis=1,how='all').dropna(axis=0,how='all')
ID_table_df["Zip Code"] = ID_table_df["Zip Code"].map("{:.0f}".format)
ID_table_df.rename(columns = {'Zip Code':'Zip_Code'}, inplace=True)
ID_table_df.rename(columns = {'School District':'School_District'}, inplace=True)
#ID_table_df

In [910]:
# Check data types
#ID_table_df.dtypes

In [911]:
# Convert data types as needed
ID_table_df['Zip_Code'] = ID_table_df['Zip_Code'].astype(str)
ID_table_df['MLSArea'] = ID_table_df['MLSArea'].astype(str)
ID_table_df.dtypes

MLSNumber             object
Address               object
Category              object
City                  object
State                 object
Zip_Code              object
County                object
MLSArea               object
Subdivision           object
School_District       object
Schools-Elementary    object
Schools-Middle        object
Schools-HighSchool    object
dtype: object

In [912]:
# Show clean datframe
ID_table_df

Unnamed: 0,MLSNumber,Address,Category,City,State,Zip_Code,County,MLSArea,Subdivision,School_District,Schools-Elementary,Schools-Middle,Schools-HighSchool
0,DENC518086,2615 Pecksniff Rd,RES,Wilmingon,DE,19808,NEWCASTLEDE,30903.0,SHERWOOD PARK II,Red Clay Consolidated,,,
1,DENC518982,4938 S Tupelo Turn,RES,Wilmington,DE,19808,NEWCASTLEDE,30903.0,PEPPER RIDGE,Red Clay Consolidated,,,
2,DENC512992,15 Kristina Ct,RES,Wilmington,DE,19808,NEWCASTLEDE,30903.0,WOODMILL,Red Clay Consolidated,,,
3,DENC512104,3251 Champions Dr,RES,Wilmington,DE,19808,NEWCASTLEDE,30903.0,FAIRWAY FALLS,Red Clay Consolidated,,,
4,DENC503480,3706 Lafayette St,RES,Wilmington,DE,19808,NEWCASTLEDE,30903.0,WASHINGTON HEIGHTS,Red Clay Consolidated,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,101 Meriden Dr,RES,Hockessin,DE,19707,NEWCASTLEDE,30902.0,CANTERBURY HILLS,Red Clay Consolidated,,,
5572,DENC520392,417 Snuff Mill Hill Rd,RES,Hockessin,DE,19707,NEWCASTLEDE,30902.0,SNUFF MILL,Red Clay Consolidated,Brandywine Springs,Henry B. Du Pont,Alexis I. Dupont
5573,DENC2014038,400 Wilson Ct,RES,Hockessin,DE,19707,NEWCASTLEDE,30902.0,STENNING WOODS,Red Clay Consolidated,North Star,Henry B. Du Pont,Alexis I. Dupont
5574,DENC2005484,918 Old Public Rd,RES,Hockessin,DE,19707,NEWCASTLEDE,30902.0,QUAKER LEA EAST,Red Clay Consolidated,,,


### Clean Mortgage_Data

In [913]:
Mortgage_Data_df = pd.read_csv("Resources_raw/Mortgage_Data.csv")
#Mortgage_Data_df

In [914]:
Mortgage_Data_df = Mortgage_Data_df.dropna(axis=1,how='all').dropna(axis=0,how='all')
#Mortgage_Data_df

In [915]:
Mortgage_Data_df.rename(columns = {'Tax ID':'Tax_ID'}, inplace=True)

In [916]:
# Convert data types, rename columns and reformat as needed, 
Mortgage_Data_df.rename(columns = {'Tax ID':'Tax_ID'}, inplace=True)
Mortgage_Data_df.rename(columns = {'Mort Amt':'Mort_Amt'}, inplace=True)
Mortgage_Data_df.rename(columns = {'Mortgage Type':'Mort_Type'}, inplace=True)
Mortgage_Data_df.rename(columns = {'Mort Int Rate':'Mort_Int_Rate'}, inplace=True)
Mortgage_Data_df.rename(columns = {'Mort Term':'Mort_Term'}, inplace=True)
Mortgage_Data_df.rename(columns = {'Mort Record Date':'Mort_Record_Date'}, inplace=True)
Mortgage_Data_df.rename(columns = {'Mort Due Date':'Mort_Due_Date'}, inplace=True)
Mortgage_Data_df.rename(columns = {'Mort Date':'Mort_Date'}, inplace=True)



In [917]:
# Check data types
Mortgage_Data_df.dtypes

Tax_ID               object
Address              object
Lender               object
Mort_Amt            float64
Mort_Type            object
Mort_Int_Rate       float64
Mort_Term            object
Mort_Record_Date     object
Mort_Due_Date        object
Mort_Date            object
dtype: object

In [918]:
# Show clean datframe
Mortgage_Data_df

Unnamed: 0,Tax_ID,Address,Lender,Mort_Amt,Mort_Type,Mort_Int_Rate,Mort_Term,Mort_Record_Date,Mort_Due_Date,Mort_Date
0,06-004.00-004.C.010,108 N Landis Way Unit #108,AMERICAN NBRHD MTG ACCPTNC AMERICAN NBRHD MTG ...,388000.0,"Conv,HomeEquityLoan",0.0,15,3/16/22,4/1/37,3/11/22
1,06-004.00-004.C.014,140 N Landis Way N Unit #140,WSFS MTG WSFS MTG,420000.0,"Conv,HomeEquityLoan",0.0,30,8/24/21,9/1/51,8/17/21
2,06-004.00-004.C.021,216 S Landis Way S Unit #216,WSFS MTG,220000.0,Conv,0.0,30,11/15/16,12/1/46,11/10/16
3,06-004.00-004.C.033,339 Cassell Ct Unit #339,RBS CITIZENS NA,165000.0,"Conv,HomeEquityLoan",0.0,25,10/17/14,10/10/39,10/6/14
4,06-004.00-004.C.034,346 Cassell Ct Unit #346,TRIDENT MTG CO LP TRIDENT MTG CO LP,297500.0,Conv,0.0,15,1/7/21,1/1/36,12/8/20
...,...,...,...,...,...,...,...,...,...,...
2112,26-050.10-068.C.200,105-UNIT Christina Landing Dr Unit,HUNTINGDON VLY BK HUNTINGDON VLY BK,244250.0,"Conv,Refinance",0.0,15,5/11/20,6/1/35,4/30/20
2113,26-050.10-068.C.240,105-UNIT Christina Landing Dr Unit,PROSPERITY HM MTG LLC PROSPERITY HM MTG LLC,310000.0,Conv,0.0,15,5/5/22,6/1/37,5/3/22
2114,26-050.10-068.C.250,105-UNIT Christina Landing Dr Unit,NAVY FCU NAVY FCU,473500.0,Conv,0.0,30,3/4/21,3/1/51,2/26/21
2115,26-051.30-160,1215 Apple St,BROADVIEW CAP LLC BROADVIEW CAP LLC,492000.0,"BlanketMortgage,ConstructionL",0.0,1,1/20/22,1/1/23,12/22/21


### Clean Sales_Data

In [919]:
# Read in file
Sales_Data_df = pd.read_csv("Resources_raw/Sales_Data.csv")
#Sales_Data_df

In [920]:
# Drop NaN columns and rows
Sales_Data_df = Sales_Data_df.dropna(axis=1,how='all').dropna(axis=0,how='all')
#Sales_Data_df

In [921]:
# Check data types
#Sales_Data_df.dtypes

In [922]:
# Convert data types, rename columns and reformat as needed, 
Sales_Data_df["ListDate"] =  pd.to_datetime(Sales_Data_df["ListDate"], format="%m/%d/%y")
Sales_Data_df["StatusDate"] =  pd.to_datetime(Sales_Data_df["StatusDate"], format="%m/%d/%y")
Sales_Data_df["Agreement of Sale/Signed Lease Date"] =  pd.to_datetime(Sales_Data_df["Agreement of Sale/Signed Lease Date"], format="%m/%d/%y")
Sales_Data_df["SettledDate"] =  pd.to_datetime(Sales_Data_df["SettledDate"], format="%m/%d/%y")
Sales_Data_df.rename(columns = {'Sold Price':'Sold_Price'}, inplace=True)
Sales_Data_df.rename(columns = {'Sold Price less Concession':'Sold_Price_less_Concession'}, inplace=True)
Sales_Data_df.rename(columns = {'Orig List Price':'Orig_List_Price'}, inplace=True)
Sales_Data_df.rename(columns = {'Current List Price':'Current_List_Price'}, inplace=True)
Sales_Data_df.rename(columns = {'Days on Market':'Days_on_Market'}, inplace=True)
Sales_Data_df.rename(columns = {'Previous Days on Market':'Previous_Days_on_Market'}, inplace=True)
Sales_Data_df.rename(columns = {'Agreement of Sale/Signed Lease Date':'Agreement_of_Sale_Date'}, inplace=True)
Sales_Data_df.rename(columns = {'Concessions YN':'Concessions_YN'}, inplace=True)
Sales_Data_df.rename(columns = {'Concessions Remarks':'Concessions_Remarks'}, inplace=True)
Sales_Data_df.dtypes

MLSNumber                             object
Address                               object
Status                                object
Sold_Price                           float64
Sold_Price_less_Concession           float64
Orig_List_Price                      float64
Current_List_Price                   float64
Days_on_Market                       float64
Previous_Days_on_Market              float64
ListDate                      datetime64[ns]
StatusDate                    datetime64[ns]
Agreement_of_Sale_Date        datetime64[ns]
SettledDate                   datetime64[ns]
Concessions_YN                        object
Concessions_Remarks                   object
SellerConcessionsAmount              float64
FinalFinancing                        object
dtype: object

In [923]:
# Show clean datframe
Sales_Data_df

Unnamed: 0,MLSNumber,Address,Status,Sold_Price,Sold_Price_less_Concession,Orig_List_Price,Current_List_Price,Days_on_Market,Previous_Days_on_Market,ListDate,StatusDate,Agreement_of_Sale_Date,SettledDate,Concessions_YN,Concessions_Remarks,SellerConcessionsAmount,FinalFinancing
0,DENC518086,2615 Pecksniff Rd,Closed,335000.0,335000.0,330000.0,330000.0,5.0,5.0,2020-12-11,2021-02-11,2020-12-16,2021-02-11,No,,0.0,FHA
1,DENC518982,4938 S Tupelo Turn,Closed,200000.0,200000.0,215500.0,210000.0,47.0,67.0,2021-01-07,2021-04-08,2021-02-21,2021-04-08,No,,0.0,Conventional
2,DENC512992,15 Kristina Ct,Closed,200000.0,200000.0,200000.0,200000.0,15.0,15.0,2020-12-28,2021-02-26,2021-01-09,2021-02-26,No,,0.0,Conventional
3,DENC512104,3251 Champions Dr,Closed,200000.0,200000.0,219900.0,214900.0,11.0,11.0,2020-10-27,2020-12-14,2020-11-05,2020-12-11,No,,0.0,Conventional
4,DENC503480,3706 Lafayette St,Closed,200000.0,200000.0,190000.0,190000.0,3.0,3.0,2020-06-20,2020-08-02,2020-06-21,2020-07-31,No,,0.0,Conventional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,101 Meriden Dr,Closed,595000.0,595000.0,595000.0,595000.0,5.0,5.0,2021-06-24,2021-07-28,2021-06-28,2021-07-28,No,,0.0,Cash
5572,DENC520392,417 Snuff Mill Hill Rd,Closed,595000.0,591262.0,639000.0,639000.0,25.0,25.0,2021-03-29,2021-06-10,2021-04-21,2021-06-10,Yes,,3738.0,Conventional
5573,DENC2014038,400 Wilson Ct,Closed,600000.0,600000.0,529900.0,529900.0,7.0,7.0,2022-01-14,2022-03-04,2022-01-19,2022-03-04,No,,0.0,Conventional
5574,DENC2005484,918 Old Public Rd,Closed,600000.0,598000.0,620000.0,620000.0,25.0,25.0,2021-08-26,2021-10-27,2021-09-18,2021-10-26,Yes,Inspections credit,2000.0,Conventional


### Clean Public Remarks

In [924]:
# Read in file
Pub_Remarks_df = pd.read_csv("Resources_raw/Pub_Remarks.csv")
Pub_Remarks_df

Unnamed: 0,MLSNumber,Address,PublicRemarks
0,DENC518086,2615 Pecksniff Rd,Visit this home virtually: http://www.vht.com/...
1,DENC518982,4938 S Tupelo Turn,"3 bedroom, 1.5 bath townhome located in the he..."
2,DENC512992,15 Kristina Ct,"Location, Location, Location! This Woodmill to..."
3,DENC512104,3251 Champions Dr,"Move right into this 2 bedroom, 2.1 bath townh..."
4,DENC503480,3706 Lafayette St,This nicely maintained home is being sold to s...
...,...,...,...
5571,DENC520114,101 Meriden Dr,"Welcome to 101 Meriden Drive! This sleek, upda..."
5572,DENC520392,417 Snuff Mill Hill Rd,"When asked what they loved about their home, t..."
5573,DENC2014038,400 Wilson Ct,Impeccably maintained cul-de-sac property in t...
5574,DENC2005484,918 Old Public Rd,"Impeccably maintained, custom built Colonial w..."


In [925]:
Pub_Remarks_df.dtypes

MLSNumber        object
Address          object
PublicRemarks    object
dtype: object

### Clean Property Characteristics

In [926]:
# Read in file
Prop_Charac_df = pd.read_csv("Resources_raw/Prop_Charac.csv")
#Prop_Charac_df

In [927]:
# Drop NaN columns and rows, rename as needed
Prop_Charac_df = Prop_Charac_df.dropna(axis=1,how='all').dropna(axis=0,how='all')
Prop_Charac_df.rename(columns = {'Senior Community YN':'Senior_Community_YN'}, inplace=True)
Prop_Charac_df.rename(columns = {'Condo/Coop Assoc YN':'Condo/Coop_Assoc_YN'}, inplace=True)
Prop_Charac_df.rename(columns = {'HOA YN':'HOA_YN'}, inplace=True)
Prop_Charac_df.rename(columns = {'Structure Type ':'Structure_Type'}, inplace=True)
Prop_Charac_df.rename(columns = {'Interior SqFt Source':'Interior_SqFt_Source'}, inplace=True)
Prop_Charac_df.rename(columns = {'Central Air YN':'Central_Air_YN'}, inplace=True)
Prop_Charac_df.rename(columns = {'Fireplace YN':'Fireplace_YN'}, inplace=True)
Prop_Charac_df.rename(columns = {'Basement YN':'Basement_YN'}, inplace=True)
Prop_Charac_df.rename(columns = {'Garage YN':'Garage_YN'}, inplace=True)
Prop_Charac_df.rename(columns = {'Main Roof':'Main_Roof'}, inplace=True)
#Prop_Charac_df

In [928]:
# Check data types
Prop_Charac_df.dtypes

MLSNumber                     object
Address                       object
BuildingName                  object
Ownership                     object
Senior_Community_YN           object
Condo/Coop_Assoc_YN           object
HOA_YN                        object
AssociationFee                object
AssociationFeeFrequency       object
Structure_Type                object
Acres                        float64
LotDimensions                 object
LotDescription                object
FeeIncludes                   object
Age                          float64
InteriorSqFt                 float64
Interior_SqFt_Source          object
AboveGradeSqFt               float64
BelowGradeSqFt               float64
PropertyCondition             object
Bedrooms                     float64
Baths                        float64
BathsFull                    float64
PartialBaths                 float64
Design                        object
Style                         object
NumberofStories               object
R

In [929]:
# Show clean datframe
Prop_Charac_df

Unnamed: 0,MLSNumber,Address,BuildingName,Ownership,Senior_Community_YN,Condo/Coop_Assoc_YN,HOA_YN,AssociationFee,AssociationFeeFrequency,Structure_Type,...,Garage_YN,GarageSpaces,GarageFeatures,Parking,ExteriorFeatures,ExteriorMaterial,Main_Roof,Foundation,PorchDeck,SwimmingPoolType
0,DENC518086,2615 Pecksniff Rd,NONE AVAILABLE,FeeSimple,No,No,Yes,$15,Annually,Detached,...,Yes,1.0,,,"ExtensiveHardscape,Sidewalks,StoneRetainingWal...","BrickFront,VinylSiding",ArchitecturalShingle,,"Patios,Porches",
1,DENC518982,4938 S Tupelo Turn,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,"AluminumSiding,Brick,VinylSiding",Asphalt,,,
2,DENC512992,15 Kristina Ct,,FeeSimple,No,No,No,,,Interior Row/Townhouse,...,No,,,,,VinylSiding,,,,
3,DENC512104,3251 Champions Dr,,FeeSimple,No,No,Yes,$50,Annually,Interior Row/Townhouse,...,Yes,1.0,,,,Other,Shingle,,Patios,
4,DENC503480,3706 Lafayette St,,FeeSimple,No,No,No,,,Detached,...,No,,,,,Asbestos,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,DENC520114,101 Meriden Dr,,FeeSimple,No,No,Yes,$500,Annually,Detached,...,Yes,2.0,AsphaltDriveway,AsphaltDriveway,"ExtensiveHardscape,StoneRetainingWalls","Stone,VinylSiding","Pitched,Shingle",,"Brick,Patios,Porches",
5572,DENC520392,417 Snuff Mill Hill Rd,,FeeSimple,No,No,Yes,$,Unknown,Detached,...,Yes,2.0,,,"ExtensiveHardscape,ExteriorLighting,FloodLight...","Cedar,Combination,Mixed,MixedPlumbing","Flat,Shake,Shingle",,"Decks,Porches,Screened,WrapAround",
5573,DENC2014038,400 Wilson Ct,,FeeSimple,No,No,No,,,Detached,...,Yes,2.0,,,,"AluminumSiding,Brick,VinylSiding",,ConcretePerimeter,,
5574,DENC2005484,918 Old Public Rd,,FeeSimple,No,No,No,,,Detached,...,Yes,2.0,,,"Awnings,ExteriorLighting,LawnSprinkler",Brick,,,Decks,


### Clean Lat_Lng Data

In [930]:
# Read in file
lat_lng_df = pd.read_csv("Resources_raw/lat_lng_full.csv")
lat_lng_df

Unnamed: 0,MLS Number,address_new,lat,lng
0,DENC518086,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
1,DENC518086,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
2,DENC2018974,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
3,DENC2018974,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
4,DENC518982,"4938 S Tupelo Wilmington, DE",39.744655,-75.548391
...,...,...,...,...
5745,DENC520114,"101 Meriden Hockessin, DE",39.763823,-75.652016
5746,DENC520392,"417 Snuff Mill Hill Hockessin, DE",39.809390,-75.667333
5747,DENC2014038,"400 Wilson Hockessin, DE",39.780478,-75.717182
5748,DENC2005484,"918 Old Public Hockessin, DE",39.795708,-75.688011


In [931]:
# Drop Duplicates
#lat_lng_df.sort_values("MLS Number", inplace=True)
lat_lng_df.drop_duplicates("MLS Number", keep="first", inplace=True)
lat_lng_df.rename(columns = {'MLS Number':'MLSNumber'}, inplace=True)
lat_lng_df

Unnamed: 0,MLSNumber,address_new,lat,lng
0,DENC518086,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
2,DENC2018974,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
4,DENC518982,"4938 S Tupelo Wilmington, DE",39.744655,-75.548391
5,DENC512992,"15 Kristina Wilmington, DE",39.744655,-75.548391
6,DENC512104,"3251 Champions Wilmington, DE",39.729664,-75.698656
...,...,...,...,...
5745,DENC520114,"101 Meriden Hockessin, DE",39.763823,-75.652016
5746,DENC520392,"417 Snuff Mill Hill Hockessin, DE",39.809390,-75.667333
5747,DENC2014038,"400 Wilson Hockessin, DE",39.780478,-75.717182
5748,DENC2005484,"918 Old Public Hockessin, DE",39.795708,-75.688011


In [932]:
# Check data types
lat_lng_df.dtypes

MLSNumber       object
address_new     object
lat            float64
lng            float64
dtype: object

In [933]:
# Show clean datframe
lat_lng_df

Unnamed: 0,MLSNumber,address_new,lat,lng
0,DENC518086,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
2,DENC2018974,"2615 Pecksniff Wilmington, DE",39.734608,-75.661402
4,DENC518982,"4938 S Tupelo Wilmington, DE",39.744655,-75.548391
5,DENC512992,"15 Kristina Wilmington, DE",39.744655,-75.548391
6,DENC512104,"3251 Champions Wilmington, DE",39.729664,-75.698656
...,...,...,...,...
5745,DENC520114,"101 Meriden Hockessin, DE",39.763823,-75.652016
5746,DENC520392,"417 Snuff Mill Hill Hockessin, DE",39.809390,-75.667333
5747,DENC2014038,"400 Wilson Hockessin, DE",39.780478,-75.717182
5748,DENC2005484,"918 Old Public Hockessin, DE",39.795708,-75.688011


### Output Clean Files

In [934]:
# Output Clean Files 
Pub_Rec_df.to_csv("Resources/pub_rec_clean.csv", index=False)
broker_df.to_csv("Resources/broker_data_clean.csv", index=False)
ID_table_df.to_csv("Resources/id_table_clean.csv", index=False)
Mortgage_Data_df.to_csv("Resources/mortgage_data_clean.csv", index=False)
Sales_Data_df.to_csv("Resources/sales_data_clean.csv", index=False)
# Exclude Pub_remarks.csv - data is unstructured
#Pub_Remarks_df.to_csv("Resources/pub_remarks_clean.csv", index=False)
Prop_Charac_df.to_csv("Resources/prop_charac_clean.csv", index=False)
lat_lng_df.to_csv("Resources/lat_lng_clean.csv", index=False)