In [1]:
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
## Importing ledger to inspect data in columns within each dataset.

In [3]:
key = pd.read_csv("../../data/raw/EXTR_LookUp.csv")

In [4]:
key[key.LUType==58].sort_values(by='LUItem')

Unnamed: 0,LUType,LUItem,LUDescription
997,58,1,FAIR ...
998,58,2,AVERAGE ...
999,58,3,GOOD ...
1000,58,4,EXCELLENT ...


In [5]:
## Importing datasets and keeping running record of columns to drop later.

In [6]:
parcel_recs = pd.read_csv("../../data/raw/EXTR_Parcel.csv", dtype=str)
drop_later1 = ['Unnamed: 0', 'PropName', 'PlatName', 'Range', 'Section','QuarterSection',
               'PropType', 'SubArea', 'SpecArea','SpecSubArea','DistrictName', 'LevyCode',
               'CurrentZoning','HBUAsIfVacant','HBUAsImproved', 'PresentUse', 'WaterSystem',
               'SewerSystem','Access', 'Topography', 'StreetSurface','RestrictiveSzShape',
               'InadequateParking','Unbuildable','WfntLocation', 'WfntBank','WfntPoorQuality',
               'WfntRestrictedAccess','WfntProximityInfluence','LotDepthFactor','PowerLines',
               'OtherNuisances','NbrBldgSites','Contamination','DNRLease','AdjacentGolfFairway',
               'HistoricSite','CurrentUseDesignation','NativeGrowthProtEsmt','Easements',
               'OtherDesignation','DeedRestrictions', 'DevelopmentRightsPurch','CoalMineHazard',
               'CriticalDrainage', 'ErosionHazard', 'LandfillBuffer','HundredYrFloodPlain',
               'SeismicHazard', 'LandslideHazard','SteepSlopeHazard', 'Stream', 'Wetland',
               'SpeciesOfConcern','SensitiveAreaTract', 'WaterProblems', 'TranspConcurrency',
               'OtherProblems', 'WfntAccessRights']

In [7]:
res_build_recs = pd.read_csv("../../data/raw/EXTR_ResBldg.csv", dtype=str)
drop_later2 = ['StreetType','Stories','BldgNbr','BuildingNumber',
               'Obsolescence','PcntNetCondition','NbrLivingUnits',
               'Address','BuildingNumber','Fraction','DirectionPrefix',
               'StreetName','StreetType','DirectionSuffix','ZipCode',
               'Stories','BldgGradeVar','Obsolescence', 
               'SqFt1stFloor','SqFtHalfFloor','SqFt2ndFloor',
               'SqFtUpperFloor']

In [8]:
rp_sales_recs = pd.read_csv("../../data/raw/EXTR_RPSale.csv", dtype=str)
drop_later3 = ['ExciseTaxNbr','DocumentDate','RecordingNbr','Volume', 
               'Page', 'PrincipalUse','AFNonProfitUse', 'PlatNbr',
               'PlatType','AFForestLand','AFHistoricProperty',
               'AFCurrentUseLand','SellerName','BuyerName',
               'SaleInstrument','PropertyType','PropertyClass',
               'SaleReason']

In [9]:
# Creating unique PIN number for each record to be used joining later.

In [10]:
source_pin = ['Major','Minor']

In [11]:
parcel_recs['PIN'] = parcel_recs[source_pin[0]
                                ]+parcel_recs[source_pin[1]]

In [12]:
res_build_recs['PIN'] = res_build_recs[source_pin[0]
                                      ]+res_build_recs[source_pin[1]]

In [13]:
rp_sales_recs['PIN'] = rp_sales_recs[source_pin[0]
                                    ]+rp_sales_recs[source_pin[1]]

In [14]:
# Checking for and converting to numerical values. 

In [15]:
parcel_recs.nunique()

Unnamed: 0                205199
Major                      12226
Minor                       6060
PropName                   10056
PlatName                   11028
PlatLot                     4931
PlatBlock                    726
Range                         13
Township                       9
Section                       37
QuarterSection                 5
PropType                       7
Area                          93
SubArea                       44
SpecArea                      17
SpecSubArea                  132
DistrictName                  40
LevyCode                     470
CurrentZoning                742
HBUAsIfVacant                 22
HBUAsImproved                  5
PresentUse                   116
SqFtLot                    35710
WaterSystem                    5
SewerSystem                    5
Access                         6
Topography                     2
StreetSurface                  5
RestrictiveSzShape             2
InadequateParking              3
PcntUnusab

In [16]:
res_build_recs.nunique()

Major                  11239
Minor                   5720
BldgNbr                   21
NbrLivingUnits             5
Address               178921
BuildingNumber         26978
Fraction                  43
DirectionPrefix            9
StreetName              2342
StreetType                24
DirectionSuffix           10
ZipCode                  157
Stories                    7
BldgGrade                 14
BldgGradeVar               4
SqFt1stFloor            2089
SqFtHalfFloor            396
SqFt2ndFloor            1638
SqFtUpperFloor           262
SqFtUnfinFull            128
SqFtUnfinHalf             94
SqFtTotLiving           3232
SqFtTotBasement         1053
SqFtFinBasement          909
FinBasementGrade          15
SqFtGarageBasement       425
SqFtGarageAttached       869
DaylightBasement           5
SqFtOpenPorch            673
SqFtEnclosedPorch        225
SqFtDeck                 799
HeatSystem                 9
HeatSource                 8
BrickStone                94
ViewUtilizatio

In [17]:
rp_sales_recs.nunique()

ExciseTaxNbr          331698
Major                  15346
Minor                   6076
DocumentDate            1981
SalePrice              29381
RecordingNbr          306847
Volume                     1
Page                       1
PlatNbr                    1
PlatType                   1
PlatLot                    1
PlatBlock                  1
SellerName            271671
BuyerName             284701
PropertyType              80
PrincipalUse              12
SaleInstrument            25
AFForestLand               3
AFCurrentUseLand           3
AFNonProfitUse             3
AFHistoricProperty         3
SaleReason                20
PropertyClass             10
PIN                   252962
dtype: int64

In [18]:
# Investigating specific columns to verify if they are categorical or not.

In [19]:
parcel_recs.Area.unique()

array(['35', '19', '100', '1', '37', '16', '44', '62', '53', '7', '15',
       '4', '66', '67', '60', '42', '3', '96', '14', '75', '24', '72',
       '64', '27', '52', '94', '95', '25', '74', '45', '28', '51', '49',
       '50', '80', '12', '17', '46', '58', '57', '21', '32', '43', '86',
       '70', '2', '56', '71', '13', '81', '93', '38', '40', '73', '61',
       '92', '8', '82', '23', '26', '79', '48', '0', '31', '47', '20',
       '77', '55', '39', '88', '29', '30', '11', '54', '87', '34', '36',
       '33', '65', '6', '59', '69', '18', '41', '91', '68', '85', '84',
       '22', '90', '10', '63', nan, '540'], dtype=object)

In [20]:
res_build_recs.PcntNetCondition.unique()

array(['0', '3', '25', '4', '10', '2', '1', '43', '5', '20', '37', '60',
       '50', '99', '85', '40', '15', '44', '6', '35', '14', '100', '30',
       '33'], dtype=object)

In [21]:
rp_sales_recs.PrincipalUse.unique()

array(['7', '6', '2', '11', '4', '10', '0', '9', '8', '3', '5', '1'],
      dtype=object)

In [22]:
# Converting numeric data from string.

In [23]:
numeric_par = ['SqFtLot','PcntUnusable','WfntFootage']
                          
parcel_recs[numeric_par] = parcel_recs[numeric_par
                                      ].apply(pd.to_numeric,
                                              errors='coerce')
# len(parcel_recs) #205199

In [24]:
numeric_res = ['Bedrooms','BathHalfCount','Bath3qtrCount','BathFullCount','SqFt1stFloor',
               'SqFtHalfFloor','SqFt2ndFloor','SqFtUpperFloor','SqFtUnfinFull',
               'SqFtUnfinHalf','SqFtTotLiving','SqFtTotBasement','SqFtFinBasement',
               'SqFtGarageBasement','SqFtGarageAttached','SqFtOpenPorch','SqFtEnclosedPorch',
               'SqFtDeck','YrBuilt','YrRenovated','AddnlCost','PcntNetCondition',
               'FpSingleStory', 'FpMultiStory','FpFreestanding','FpSingleStory',
               'FpMultiStory', 'FpFreestanding', 'FpAdditional', 'PcntComplete'] 
res_build_recs[numeric_res] = res_build_recs[numeric_res
                                            ].apply(pd.to_numeric,
                                                    errors='coerce')
# len(res_build_recs) #181510

In [25]:
numeric_rp = ['SalePrice']
rp_sales_recs[numeric_rp] = rp_sales_recs[numeric_rp
                                         ].apply(pd.to_numeric)
# len(rp_sales_recs) #351067

In [26]:
## Creating df by combining 3 data sets.

In [27]:
df_pre = parcel_recs.merge(res_build_recs, on='PIN', how='inner')

In [28]:
df = df_pre.merge(rp_sales_recs, on='PIN', how='inner')

In [29]:
# len(df) #251300

In [30]:
df.SaleWarning.nunique()

1236

In [31]:
## Removing non-residential properties.

In [32]:
df = df[(df.PrincipalUse == '6')] #len(df) #250540

In [33]:
## Filtering SalePrice

In [34]:
df = df[(df.SalePrice > 0)] 
len(df) #173128

173128

In [35]:
# Removing for: 
# - mobile homes
# - condos with storage/garage/moorage 
# - SalePrice <1000 & full SalePrice not reported

In [36]:
df = df[(df.SaleWarning != '5') &
        (df.SaleWarning != '32') &
        (df.SaleWarning != '48') &
        (df.SaleWarning != '49') &
        (df.SaleWarning != '50')] 

In [37]:
# Filtering for homes with garages and porches.

In [38]:
df = df[(df.SqFtGarageBasement > 0) | (df.SqFtGarageAttached > 0)] 
# len(df) #121099

In [39]:
df = df[(df.SqFtOpenPorch > 0) | (df.SqFtEnclosedPorch > 0)]
#len(df) #70169

In [40]:
# Dropping residences not currently used as homes.

In [41]:
index_names1 = df[(df.PresentUse != '6') &
                  (df.PresentUse != '5') &
                  (df.PresentUse != '4') &
                  (df.PresentUse != '3') & 
                  (df.PresentUse != '2') 
                 ].index
len(index_names1) #4596

4596

In [42]:
df.head()

Unnamed: 0.1,Unnamed: 0,Major_x,Minor_x,PropName,PlatName,PlatLot_x,PlatBlock_x,Range,Township,Section,QuarterSection,PropType,Area,SubArea,SpecArea,SpecSubArea,DistrictName,LevyCode,CurrentZoning,HBUAsIfVacant,HBUAsImproved,PresentUse,SqFtLot,WaterSystem,SewerSystem,Access,Topography,StreetSurface,RestrictiveSzShape,InadequateParking,PcntUnusable,Unbuildable,MtRainier,Olympics,Cascades,Territorial,SeattleSkyline,PugetSound,LakeWashington,LakeSammamish,SmallLakeRiverCreek,OtherView,WfntLocation,WfntFootage,WfntBank,WfntPoorQuality,WfntRestrictedAccess,WfntAccessRights,WfntProximityInfluence,TidelandShoreland,LotDepthFactor,TrafficNoise,AirportNoise,PowerLines,OtherNuisances,NbrBldgSites,Contamination,DNRLease,AdjacentGolfFairway,AdjacentGreenbelt,HistoricSite,CurrentUseDesignation,NativeGrowthProtEsmt,Easements,OtherDesignation,DeedRestrictions,DevelopmentRightsPurch,CoalMineHazard,CriticalDrainage,ErosionHazard,LandfillBuffer,HundredYrFloodPlain,SeismicHazard,LandslideHazard,SteepSlopeHazard,Stream,Wetland,SpeciesOfConcern,SensitiveAreaTract,WaterProblems,TranspConcurrency,OtherProblems,PIN,Major_y,Minor_y,BldgNbr,NbrLivingUnits,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,DirectionSuffix,ZipCode,Stories,BldgGrade,BldgGradeVar,SqFt1stFloor,SqFtHalfFloor,SqFt2ndFloor,SqFtUpperFloor,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,BrickStone,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot_y,PlatBlock_y,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning
3,6,22603,9181,,,,,3,26,2,NW,R,1,1,,,SHORELINE,2263,R6,1,1,2,10560,2,2,4,0,1,0,0,0,False,0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,N,N,0,0,0,0,N,N,0,0,N,N,N,0,0,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,226039181,22603,9181,1,1,20115 24TH AVE NW 98177,20115,,,24TH,AVE,NW,98177.0,1.0,9,0,2350,0,0,0,0,0,4200,2410,1850,8,560,0,Y,220,0,390,7,3,0,N,3,1,1,2,0,1,0,1,1968,0,0,0,0,4,0,2743363,22603,9181,07/13/2015,800000,20150715002679,,,,,,,SOLBERG ANDERS B+TORBJORG M -REV LVG TRUST ...,DOTSON JEFFRY S+BRENDA H ...,3,6,3,N,N,N,N,1,8,15 51
4,7,229670,160,,ELDORADO NORTH,16,,5,26,19,SW,R,37,2,,,KIRKLAND,1708,RSA 6,1,1,2,9853,2,2,4,0,1,0,2,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N,N,0,0,0,0,N,N,0,0,N,N,N,0,0,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,2296700160,229670,160,1,1,9032 NE 132ND PL 98034,9032,,NE,132ND,PL,,98034.0,1.0,7,0,1210,0,0,0,0,0,2410,1200,1200,6,0,500,Y,110,0,0,5,2,0,,5,0,0,3,0,1,0,1,1969,0,0,0,0,3,0,3002350,229670,160,07/21/2019,730000,20190731000710,,,,,,,YOUNG NATHAN P+ANNE E ...,ROE STEPHEN+MAVIS ...,3,6,3,N,N,N,N,1,8,
8,16,882490,210,,UNIVERSITY PLACE ADD,1-2,4.0,4,25,9,NW,R,44,1,,,SEATTLE,10,SF 5000,1,1,2,3600,2,2,4,0,1,0,2,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N,N,0,0,0,0,N,N,0,0,N,N,N,0,0,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,8824900210,882490,210,1,1,2007 NE 63RD ST 98115,2007,,NE,63RD,ST,,98115.0,1.5,8,0,1300,590,0,0,0,0,1890,750,0,0,220,0,,190,0,0,5,1,0,,4,0,0,1,0,1,0,0,1919,0,0,0,0,3,0,2958914,882490,210,10/17/2018,875000,20181025000379,,,,,,,WALSH ANNETTE ...,GERHARD LUTZ ...,11,6,3,N,N,N,N,1,8,
11,31,766370,784,,SEATTLE SUBURBAN HOME TRS,5,16.0,4,26,21,NE,R,7,6,,,SEATTLE,10,SF 7200,1,1,2,7750,2,2,4,0,1,0,2,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N,N,0,0,1,0,N,N,0,0,N,N,N,0,0,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,7663700784,766370,784,1,1,14007 30TH AVE NE,14007,,,30TH,AVE,NE,,1.0,8,0,2380,0,0,0,0,0,3340,2180,960,8,750,0,,250,0,0,5,2,0,,5,1,1,2,2,0,0,0,2019,0,58,0,0,3,5000,2833968,766370,784,11/09/2016,249950,20161115000386,,,,,,,HILL ROBERT ...,LT HOMES LLC ...,11,6,3,,,,,1,7,10
12,31,766370,784,,SEATTLE SUBURBAN HOME TRS,5,16.0,4,26,21,NE,R,7,6,,,SEATTLE,10,SF 7200,1,1,2,7750,2,2,4,0,1,0,2,0,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,N,N,0,0,1,0,N,N,0,0,N,N,N,0,0,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,7663700784,766370,784,1,1,14007 30TH AVE NE,14007,,,30TH,AVE,NE,,1.0,8,0,2380,0,0,0,0,0,3340,2180,960,8,750,0,,250,0,0,5,2,0,,5,1,1,2,2,0,0,0,2019,0,58,0,0,3,5000,2793085,766370,784,04/19/2016,205000,20160429000534,,,,,,,GIPAYA LILY TOLETE -TTEE ...,HILL ROBERT+DEBORAH K ...,3,6,3,N,N,N,N,1,8,15


In [43]:
# Dropping columns and rows not needed for this analysis

In [44]:
df.drop(index_names1, axis=0, inplace=True)

In [45]:
df.drop(drop_later1, axis=1, inplace=True)

In [46]:
df.drop(drop_later2, axis=1, inplace=True)

In [47]:
df.drop(drop_later3, axis=1, inplace=True)

In [48]:
drop_later4 = ['PIN', 'Major_x','Minor_x','Major_y','Minor_y','PlatLot_x','PlatBlock_x',
               'PlatLot_y', 'PlatBlock_y','Major','Minor','BrickStone','TidelandShoreland', 
               'TrafficNoise', 'AirportNoise', 'AdjacentGreenbelt','Area']

In [49]:
df.drop(drop_later4, axis=1, inplace=True)

In [50]:
# Cleaning col

In [51]:
df['ViewUtilization'] = df['ViewUtilization'].str.upper()

In [52]:
df['ViewUtilization'] = df['ViewUtilization'].str.strip()

In [53]:
# Reordering to have target variable first.

In [54]:
cols = list(df.columns)
cols = [cols[-2]] + cols[:-2] + cols[-1:]
df = df[cols]

In [55]:
df.head()

Unnamed: 0,SalePrice,Township,SqFtLot,PcntUnusable,MtRainier,Olympics,Cascades,Territorial,SeattleSkyline,PugetSound,LakeWashington,LakeSammamish,SmallLakeRiverCreek,OtherView,WfntFootage,BldgGrade,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,DaylightBasement,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,ViewUtilization,Bedrooms,BathHalfCount,Bath3qtrCount,BathFullCount,FpSingleStory,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Condition,AddnlCost,SaleWarning
3,800000,26,10560,0,0,2,0,2,0,2,0,0,0,0,0,9,0,0,4200,2410,1850,8,560,0,Y,220,0,390,7,3,N,3,1,1,2,0,1,0,1,1968,0,0,4,0,15 51
4,730000,26,9853,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,2410,1200,1200,6,0,500,Y,110,0,0,5,2,,5,0,0,3,0,1,0,1,1969,0,0,3,0,
8,875000,25,3600,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,1890,750,0,0,220,0,,190,0,0,5,1,,4,0,0,1,0,1,0,0,1919,0,0,3,0,
11,249950,26,7750,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,3340,2180,960,8,750,0,,250,0,0,5,2,,5,1,1,2,2,0,0,0,2019,0,58,3,5000,10
12,205000,26,7750,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,3340,2180,960,8,750,0,,250,0,0,5,2,,5,1,1,2,2,0,0,0,2019,0,58,3,5000,15


In [56]:
view_dummies = pd.get_dummies(df.ViewUtilization, drop_first=True, prefix='View')
df = pd.concat([df, view_dummies], axis=1)

In [57]:
# Combining bathroom and fireplace data.

In [58]:
df['BathHalfCount'] = df['BathHalfCount'].apply(lambda x: x*.5)
df['Bath3qtrCount'] = df['Bath3qtrCount'].apply(lambda x: x*.75)
df['TotBathrooms'] = df['BathHalfCount'] + df['Bath3qtrCount'] + df['BathFullCount']

In [59]:
bathrooms = ['BathHalfCount','Bath3qtrCount', 'BathFullCount']

df.drop(bathrooms, axis=1, inplace=True)

In [60]:
df = df[(df.TotBathrooms >=1)] 
len(df) #65482

65482

In [61]:
df = df[(df.Bedrooms >=1)] 
len(df) #65462

65462

In [62]:
df['TotFireplace'] = df.FpSingleStory + df.FpMultiStory + df.FpFreestanding + df.FpAdditional
fire_drop =['FpSingleStory', 'FpMultiStory', 'FpFreestanding', 'FpAdditional']
df.drop(fire_drop, axis=1, inplace=True)

In [63]:
# Converting categorical data

In [64]:
# if (df['SaleWarning']).bool == True:
#     df['SaleWarning'] = df['SaleWarning'].str.split(" ").tolist()
# df['NumSaleWarning'] = df['SaleWarning'].apply(lambda x: len(x)-1)

# df.drop('SaleWarning', axis=1, inplace=True )

In [65]:
# Dropping more unnecessary columns.

In [66]:
others = ['PcntUnusable','DaylightBasement']
df.drop(others, axis=1, inplace=True)

In [67]:
# Checking my work

In [68]:
df.head()

Unnamed: 0,SalePrice,Township,SqFtLot,MtRainier,Olympics,Cascades,Territorial,SeattleSkyline,PugetSound,LakeWashington,LakeSammamish,SmallLakeRiverCreek,OtherView,WfntFootage,BldgGrade,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,FinBasementGrade,SqFtGarageBasement,SqFtGarageAttached,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,HeatSystem,HeatSource,ViewUtilization,Bedrooms,YrBuilt,YrRenovated,PcntComplete,Condition,AddnlCost,SaleWarning,View_N,View_Y,TotBathrooms,TotFireplace
3,800000,26,10560,0,2,0,2,0,2,0,0,0,0,0,9,0,0,4200,2410,1850,8,560,0,220,0,390,7,3,N,3,1968,0,0,4,0,15 51,1,0,3.25,2
4,730000,26,9853,0,0,0,0,0,0,0,0,0,0,0,7,0,0,2410,1200,1200,6,0,500,110,0,0,5,2,,5,1969,0,0,3,0,,0,0,3.0,2
8,875000,25,3600,0,0,0,0,0,0,0,0,0,0,0,8,0,0,1890,750,0,0,220,0,190,0,0,5,1,,4,1919,0,0,3,0,,0,0,1.0,1
11,249950,26,7750,0,0,0,0,0,0,0,0,0,0,0,8,0,0,3340,2180,960,8,750,0,250,0,0,5,2,,5,2019,0,58,3,5000,10,0,0,3.25,2
12,205000,26,7750,0,0,0,0,0,0,0,0,0,0,0,8,0,0,3340,2180,960,8,750,0,250,0,0,5,2,,5,2019,0,58,3,5000,15,0,0,3.25,2


In [69]:
df.describe()

Unnamed: 0,SalePrice,SqFtLot,WfntFootage,SqFtUnfinFull,SqFtUnfinHalf,SqFtTotLiving,SqFtTotBasement,SqFtFinBasement,SqFtGarageBasement,SqFtGarageAttached,SqFtOpenPorch,SqFtEnclosedPorch,SqFtDeck,Bedrooms,YrBuilt,YrRenovated,PcntComplete,AddnlCost,View_N,View_Y,TotBathrooms,TotFireplace
count,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0,65462.0
mean,906194.7,18657.57,1.359567,0.683099,0.513443,2768.533088,383.44493,260.608139,81.102517,471.447084,156.923207,6.508417,122.271516,3.844383,1993.612248,64.045767,0.82402,1375.462711,0.490391,0.015658,2.685806,1.334515
std,1085100.0,58473.53,14.995312,19.787659,18.71066,1041.808793,626.118384,484.640243,202.102409,269.296432,153.991336,49.739348,589.449233,0.829997,24.474031,352.50489,7.72611,3827.571798,0.499911,0.124149,0.789783,0.721588
min,5.0,643.0,0.0,0.0,0.0,360.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1900.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,454950.0,5459.0,0.0,0.0,0.0,2060.0,0.0,0.0,0.0,390.0,60.0,0.0,0.0,3.0,1979.0,0.0,0.0,0.0,0.0,0.0,2.5,1.0
50%,680000.0,7760.0,0.0,0.0,0.0,2650.0,0.0,0.0,0.0,470.0,110.0,0.0,0.0,4.0,2002.0,0.0,0.0,0.0,0.0,0.0,2.5,1.0
75%,1003979.0,11500.0,0.0,0.0,0.0,3310.0,840.0,420.0,0.0,630.0,204.0,0.0,177.5,4.0,2015.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0
max,37500000.0,3152153.0,1300.0,1340.0,1320.0,14980.0,5610.0,5610.0,2610.0,4460.0,3000.0,2330.0,140000.0,13.0,2020.0,2020.0,100.0,100000.0,1.0,1.0,10.0,8.0


In [70]:
## Saving dataframe for use in analysis.

In [71]:
df.to_csv('merged_data.csv', index=False)