# Clean up the full train and test datasets

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint
from IPython.display import display, HTML
from sklearn.preprocessing import Imputer
from sklearn.base import TransformerMixin

## Load Data

In [3]:
# Paths are relative to the notebook's directory; the first CSV column becomes the index.
train_file, test_file = "../data/train.csv", "../data/test.csv"

train, test = (pd.read_csv(csv_path, index_col=0) for csv_path in (train_file, test_file))

## Inspect data
Let's check to see if the train and test columns are ordered the same way...

In [4]:
# Walk the test columns and compare each name with the train column at the same position.
names = []
different_names = []
for pos, test_name in enumerate(test.columns):
    train_name = train.columns[pos]
    entry = (pos, train_name, test_name)
    names.append(entry)
    if train_name != test_name:
        print("{}:{} name different!".format(pos, train_name))
        different_names.append(entry)

print(len(different_names))

0


And if read_csv has applied the same data type to both train and test data

In [5]:
# Compare the dtype pandas inferred for each test column with the dtype of the
# train column at the same position, collecting every mismatch.
dtypes = []
different_dtypes = []
for i in range(len(test.columns)):
    # .iloc makes the lookup explicitly positional; plain [] with an integer on this
    # string-labelled Series relies on a positional fallback that newer pandas deprecates.
    train_dtype = train.dtypes.iloc[i]
    test_dtype = test.dtypes.iloc[i]
    dtypes.append((i, train_dtype, test_dtype))
    if train_dtype != test_dtype:
        print("{}:{} dtypes different!".format(i, train.columns[i]))
        different_dtypes.append((i, train_dtype, test_dtype))
print(len(different_dtypes))

33:BsmtFinSF1 dtypes different!
35:BsmtFinSF2 dtypes different!
36:BsmtUnfSF dtypes different!
37:TotalBsmtSF dtypes different!
46:BsmtFullBath dtypes different!
47:BsmtHalfBath dtypes different!
60:GarageCars dtypes different!
61:GarageArea dtypes different!
8


In [6]:
# Show the collected (column position, train dtype, test dtype) mismatches.
different_dtypes

[(33, dtype('int64'), dtype('float64')),
 (35, dtype('int64'), dtype('float64')),
 (36, dtype('int64'), dtype('float64')),
 (37, dtype('int64'), dtype('float64')),
 (46, dtype('int64'), dtype('float64')),
 (47, dtype('int64'), dtype('float64')),
 (60, dtype('int64'), dtype('float64')),
 (61, dtype('int64'), dtype('float64'))]

Inspecting those differences, we see that they're all float vs int (no categorical object vs number).  Wonder why it interpreted some as int and some as float... Does the test data have fractions?

In [7]:
# Work on a copy so two control columns can be added: 'float' must report a fraction and
# 'int' must not, which confirms the remainder test itself behaves as intended.
mismatched_positions = [pos for pos, _, _ in different_dtypes]
find_remainders = test.iloc[:, mismatched_positions].copy()
find_remainders['float'] = 1.5
find_remainders['int'] = 1

print(find_remainders.dtypes)
has_fraction = (find_remainders % 1) > 0
print(has_fraction.any())

BsmtFinSF1      float64
BsmtFinSF2      float64
BsmtUnfSF       float64
TotalBsmtSF     float64
BsmtFullBath    float64
BsmtHalfBath    float64
GarageCars      float64
GarageArea      float64
float           float64
int               int64
dtype: object
BsmtFinSF1      False
BsmtFinSF2      False
BsmtUnfSF       False
TotalBsmtSF     False
BsmtFullBath    False
BsmtHalfBath    False
GarageCars      False
GarageArea      False
float            True
int             False
dtype: bool


No fractions in test data.  Are there nan?

In [8]:
# Per-column flag: does train contain any NaN in the columns whose dtype differs from test?
train.iloc[:, [pos for pos, _, _ in different_dtypes]].isnull().any()

BsmtFinSF1      False
BsmtFinSF2      False
BsmtUnfSF       False
TotalBsmtSF     False
BsmtFullBath    False
BsmtHalfBath    False
GarageCars      False
GarageArea      False
dtype: bool

In [9]:
# Per-column flag: does test contain any NaN in the columns whose dtype differs from train?
test.iloc[:, [pos for pos, _, _ in different_dtypes]].isnull().any()

BsmtFinSF1      True
BsmtFinSF2      True
BsmtUnfSF       True
TotalBsmtSF     True
BsmtFullBath    True
BsmtHalfBath    True
GarageCars      True
GarageArea      True
dtype: bool

Aha!  Seems like presence of NAN might have been what forced them to be float.  I think np.int64 cannot have NAN?  Yeah, see this: http://stackoverflow.com/questions/11548005/numpy-or-pandas-keeping-array-type-as-integer-while-having-a-nan-value

In [10]:
# Integer row positions (not Id labels) of the test rows that have a NaN in any of the
# dtype-mismatched columns.
# np.flatnonzero on the boolean values replaces Series.nonzero(), which newer pandas no
# longer provides; the axis is passed by keyword because positional use is deprecated.
nan_rows_test = np.flatnonzero(
    test.iloc[:, [x[0] for x in different_dtypes]].isnull().any(axis=1).values
)

In [11]:
# Show the test rows found above, restricted to the dtype-mismatched columns.
test.iloc[nan_rows_test, [x[0] for x in different_dtypes]]

Unnamed: 0_level_0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2121,,,,,,,1.0,280.0
2189,0.0,0.0,0.0,0.0,,,2.0,624.0
2577,548.0,0.0,311.0,859.0,0.0,0.0,,


What about our other floats...?  Are they also floats because of this? 

In [12]:
# Float columns as seen in train only: the extra float columns found earlier are float
# only in test, so they do not show up here.
float_frame = train.select_dtypes(include=[np.float64])
cols_float = list(float_frame.columns)

In [13]:
# List the train columns that pandas read as float64.
print("Columns cast as float: {}".format(cols_float))

Columns cast as float: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


Do any of these floats have non-integer numbers?

In [14]:
# A non-zero remainder modulo 1 marks a fractional value; report per column whether any exists.
train[cols_float].mod(1).any()

LotFrontage    False
MasVnrArea     False
GarageYrBlt    False
dtype: bool

Do they have NaNs?

In [15]:
# Per-column NaN flags, followed by the integer row positions holding at least one NaN.
float_is_null = train[cols_float].isnull()
print(float_is_null.any())
print("Rows with NaNs:")
# np.flatnonzero replaces the pandas .nonzero() accessor, which newer pandas no longer
# provides; the axis is passed by keyword because positional use is deprecated.
print(np.flatnonzero(float_is_null.any(axis=1).values))

LotFrontage    True
MasVnrArea     True
GarageYrBlt    True
dtype: bool
Rows with NaNs:
[   7   12   14   16   24   31   39   42   43   48   50   64   66   76   78
   84   88   89   95   99  100  104  108  111  113  116  120  125  126  127
  131  133  136  140  147  148  149  152  153  155  160  163  165  166  169
  170  177  180  186  191  198  203  207  208  210  214  218  221  234  237
  241  244  249  250  269  287  288  291  293  307  308  310  319  328  330
  335  342  346  347  351  356  360  361  364  366  369  370  375  384  386
  392  393  404  405  412  421  426  431  434  441  447  452  457  458  459
  464  465  470  484  490  495  496  516  518  520  528  529  533  535  537
  538  539  541  545  559  560  562  564  569  580  582  593  610  611  612
  613  614  616  620  623  626  635  636  638  641  645  649  650  660  666
  668  672  679  682  685  687  690  705  706  709  710  714  720  721  726
  734  738  745  746  750  751  757  770  783  784  785  789  791  794  811


In [16]:
# Boolean mask (indexed by Id) of train rows with a NaN in any float column.
# The axis is passed by keyword because positional use is deprecated in newer pandas.
nan_rows_train = train[cols_float].isnull().any(axis=1)
# A single .loc call selects rows and columns together instead of chaining two [] lookups.
train.loc[nan_rows_train, cols_float]

Unnamed: 0_level_0,LotFrontage,MasVnrArea,GarageYrBlt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,,240.0,1973.0
13,,0.0,1962.0
15,,212.0,1960.0
17,,180.0,1970.0
25,,0.0,1968.0
32,,0.0,1966.0
40,65.0,0.0,
43,,0.0,1983.0
44,,0.0,1977.0
49,33.0,0.0,


Definitely...  So really we have either ints (with or without NaN's), and other (currently objects - likely categorical?)

## Process the numeric data with NaN

**Should come back and clean this printing up.  For now, just cut-and-pasted the above visuals that we had used before to check**

Get the numeric columns

In [17]:
numeric_frame = train.select_dtypes(include=[np.number])
cols_numeric = list(numeric_frame.columns)

# SalePrice is deliberately kept out of the columns that get imputed.
cols_numeric.remove('SalePrice')
print(cols_numeric)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


Make an imputer for the numeric data

In [18]:
# Look down each column and replace missing values with the median of that column
# (since we're using integers, median seemed to make more sense).
# The medians are learned from the train columns only.
# NOTE(review): sklearn.preprocessing.Imputer is deprecated/removed in newer scikit-learn
# releases (SimpleImputer is the replacement) — confirm the installed version before re-running.
imp_numeric = Imputer(missing_values='NaN', strategy='median', axis=0)
imp_numeric.fit(train[cols_numeric])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

Use imputer to fill in the blanks!

In [21]:
# Overwrite the numeric columns of both frames in place with the output of the fitted imputer.
for frame in (train, test):
    frame[cols_numeric] = imp_numeric.transform(frame[cols_numeric])

And looking back at that data we were missing before...

In [22]:
# Same rows and columns as before imputation, now showing the filled-in values.
train.loc[nan_rows_train, cols_float]

Unnamed: 0_level_0,LotFrontage,MasVnrArea,GarageYrBlt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,69.0,240.0,1973.0
13,69.0,0.0,1962.0
15,69.0,212.0,1960.0
17,69.0,180.0,1970.0
25,69.0,0.0,1968.0
32,69.0,0.0,1966.0
40,65.0,0.0,1980.0
43,69.0,0.0,1983.0
44,69.0,0.0,1977.0
49,33.0,0.0,1980.0


In [24]:
# Re-display the test rows that had NaNs before imputation.
# The row positions live in nan_rows_test; the bare name nan_rows is never defined in
# this notebook, so the old cell failed with a NameError on a fresh kernel.
test.iloc[nan_rows_test, [x[0] for x in different_dtypes]]

Unnamed: 0_level_0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2121,383.5,0.0,477.5,991.5,0.0,0.0,1.0,280.0
2189,0.0,0.0,0.0,0.0,0.0,0.0,2.0,624.0
2577,548.0,0.0,311.0,859.0,0.0,0.0,2.0,480.0


# Handle the non-numeric columns

## sklearn has OneHotEncoder, but pandas now has a built in get_dummies()
see:
* http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
* http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

### Clean up any unintended NaN

Get the non-numeric columns


In [25]:
# Everything select_dtypes does not classify as a number.
non_numeric_frame = train.select_dtypes(exclude=[np.number])
cols_non_numeric = list(non_numeric_frame.columns)
print(cols_non_numeric)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


See what sort of data is in each column

In [26]:
# For each non-numeric column print its distinct values, then (on an indented line) the
# frequency of each value in descending order.
row_template = "{:13s}: {}"
for col_name in cols_non_numeric:
    column = train[col_name]
    print(row_template.format(col_name, column.unique()))
    print(row_template.format("", column.value_counts().tolist()))

MSZoning     : ['RL' 'RM' 'C (all)' 'FV' 'RH']
             : [1151, 218, 65, 16, 10]
Street       : ['Pave' 'Grvl']
             : [1454, 6]
Alley        : [nan 'Grvl' 'Pave']
             : [50, 41]
LotShape     : ['Reg' 'IR1' 'IR2' 'IR3']
             : [925, 484, 41, 10]
LandContour  : ['Lvl' 'Bnk' 'Low' 'HLS']
             : [1311, 63, 50, 36]
Utilities    : ['AllPub' 'NoSeWa']
             : [1459, 1]
LotConfig    : ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
             : [1052, 263, 94, 47, 4]
LandSlope    : ['Gtl' 'Mod' 'Sev']
             : [1382, 65, 13]
Neighborhood : ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
             : [225, 150, 113, 100, 86, 79, 77, 74, 73, 59, 58, 51, 49, 41, 38, 37, 28, 25, 25, 17, 17, 16, 11, 9, 2]
Condition1   : ['Norm' 'Feedr' 'PosN' 'Artery' 'RR

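Many columns have nan, but from the description several use nan as a value.  Also note that some have nan in their unique(), but not in their counts?  That looks odd at first, but value_counts() drops NaN by default (pass dropna=False to include it), so the counts only cover the non-missing values.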

In [27]:
# Columns where NaN is a legitimate value (typically it means "I don't have this feature").
# Any record with a NaN in one basement column should have a NaN in all basement columns;
# same with garage.  They are grouped so that can be checked.
# NOTE(review): BsmtFinType1/BsmtFinType2 are not listed in nan_bsmt, so they end up in
# nan_unintentional below — confirm against the data description whether NaN there also
# means "no basement".
nan_bsmt = ['BsmtQual', 'BsmtCond', 'BsmtExposure']
nan_garage = ['GarageType', 'GarageQual', 'GarageFinish', 'GarageCond']
nan_intentional = ['Alley',
                   'FireplaceQu',
                   'PoolQC', 'Fence', 'MiscFeature',] + nan_bsmt + nan_garage

# Every other non-numeric column that contains at least one NaN, in column order.
# Filtering the boolean Series by itself yields the flagged column names directly and
# avoids Series.nonzero(), which newer pandas no longer provides.
has_nan = train[cols_non_numeric].isnull().any()
nan_unintentional = [col for col in has_nan[has_nan].index if col not in nan_intentional]
print(nan_unintentional)

['MasVnrType', 'BsmtFinType1', 'BsmtFinType2', 'Electrical']


For unintentional nan values, replace those with their most common value.  Better ways to do this could include:
* Try to correlate these missing values to other features?

In [28]:
class CategoricalImputer(TransformerMixin):
    """Impute missing categorical values by replacing them with the most common value.

    Adapted from http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn

    After ``fit``, ``self.fill`` is a pd.Series indexed by column name that holds the fill
    value of each column: the most frequent value for object-dtype columns and np.nan for
    every other column (so ``transform`` leaves those untouched).
    """

    def __init__(self):
        """The imputer has no parameters; all state is created by ``fit``."""

    def fit(self, X, y=None):
        """Learn one fill value per column of the DataFrame ``X`` and return ``self``.

        ``y`` is accepted for API compatibility and ignored.
        """
        self.fill = pd.Series([self._most_common(X[c]) for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with its NaNs replaced by the fill values learned in ``fit``."""
        return X.fillna(self.fill)

    @staticmethod
    def _most_common(column):
        """Return the most frequent value of an object-dtype column, else np.nan."""
        if column.dtype != np.dtype('O'):
            return np.nan
        counts = column.value_counts()
        # An all-NaN (or empty) column has no counts; index[0] would raise IndexError,
        # and there is nothing sensible to fill with, so fall back to NaN.
        return counts.index[0] if len(counts) else np.nan
    
# Example:

# data = [
#     ['a', 1, 2],
#     ['b', 1, 1],
#     ['b', 2, 2],
#     [np.nan, np.nan, np.nan]
# ]

# X = pd.DataFrame(data)
# xt = CategoricalImputer().fit_transform(X)

# print('before...')
# print(X)
# print('after...')
# print(xt)

In [29]:
# Frequency of each Electrical category in train (NaN is not counted by default).
train['Electrical'].value_counts()

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64

In [34]:
# Reminder of which columns are about to be imputed.
nan_unintentional

['MasVnrType', 'BsmtFinType1', 'BsmtFinType2', 'Electrical']

In [37]:
# Integer row positions (0-based, not Id labels) of the train rows with a NaN in any of
# the "unintentional" columns.
# np.flatnonzero replaces Series.nonzero(), which newer pandas no longer provides.
nan_rows_train = np.flatnonzero(train[nan_unintentional].isnull().any(axis=1).values)
# Positions are looked up with .iloc.  The earlier .loc[positions + 1] lookup relied on
# every Id label being exactly its row position plus one.
train[nan_unintentional].iloc[nan_rows_train]

Unnamed: 0_level_0,MasVnrType,BsmtFinType1,BsmtFinType2,Electrical
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,,,,SBrkr
40,,,,FuseP
91,,,,FuseA
103,,,,SBrkr
157,,,,FuseF
183,BrkFace,,,SBrkr
235,,GLQ,Unf,SBrkr
260,,,,FuseA
333,BrkFace,GLQ,,SBrkr
343,BrkFace,,,FuseA


In [39]:
# Learn each column's most common category from train, then fill train's gaps in place.
unintentional_view = train[nan_unintentional]
imp_categorical = CategoricalImputer().fit(unintentional_view)
train[nan_unintentional] = imp_categorical.transform(unintentional_view)

In [41]:
# Re-display the rows that had NaNs.  nan_rows_train holds 0-based row positions, so they
# are looked up with .iloc instead of shifting them by one to imitate the Id labels.
train[nan_unintentional].iloc[nan_rows_train]

Unnamed: 0_level_0,MasVnrType,BsmtFinType1,BsmtFinType2,Electrical
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,,Unf,Unf,SBrkr
40,,Unf,Unf,FuseP
91,,Unf,Unf,FuseA
103,,Unf,Unf,SBrkr
157,,Unf,Unf,FuseF
183,BrkFace,Unf,Unf,SBrkr
235,,GLQ,Unf,SBrkr
260,,Unf,Unf,FuseA
333,BrkFace,GLQ,Unf,SBrkr
343,BrkFace,Unf,Unf,FuseA


In [None]:
# Summarise the imputed columns.  The imputation was written back into train itself;
# train_imputed_categorical is never created (its assignment is commented out), so
# referring to it raised a NameError.
display(train[nan_unintentional].describe())
display(train[nan_unintentional].count())

**NEED TO DO CATEGORICAL SUBBING FOR TEST TOO.  UNINTENDED NAN LIST MIGHT BE DIFFERENT!!!**

### Make features from the categoricals

In [59]:
# One-hot encode every non-numeric column.  dummy_na=True also adds a "<column>_nan"
# indicator per encoded column; for columns without NaN it stays all zero (see e.g.
# SaleType_nan in the summary below).
train_augmented = pd.get_dummies(train, columns=cols_non_numeric, dummy_na=True)
print(list(train_augmented.columns))
train_augmented.describe()

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'MSZoning_nan', 'Street_Grvl', 'Street_Pave', 'Street_nan', 'Alley_Grvl', 'Alley_Pave', 'Alley_nan', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LotShape_nan', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'LandContour_nan', 'Utilities_AllPub', 'Utilities_NoSeWa', 'Utilities_nan', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3'

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,69.863699,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,46.549315,...,0.002055,0.867808,0.0,0.069178,0.00274,0.008219,0.013699,0.820548,0.085616,0.0
std,42.300571,22.027677,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,161.319273,...,0.045299,0.338815,0.0,0.253844,0.052289,0.090317,0.116277,0.383862,0.279893,0.0
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [48]:
# Pairwise correlation of the numeric columns.  The numeric columns are selected
# explicitly because train still contains object columns, and newer pandas raises on
# them instead of silently dropping them.
train.select_dtypes(include=[np.number]).corr()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
MSSubClass,1.0,-0.356718,-0.139781,0.032628,-0.059316,0.02785,0.040581,0.023573,-0.069836,-0.065649,...,-0.012579,-0.0061,-0.012037,-0.043825,-0.02603,0.008283,-0.007683,-0.013585,-0.021407,-0.084284
LotFrontage,-0.356718,1.0,0.304522,0.234812,-0.053281,0.116685,0.083348,0.178469,0.214367,0.042463,...,0.075542,0.137014,0.010287,0.061945,0.037655,0.180819,-0.000255,0.010451,0.00638,0.334771
LotArea,-0.139781,0.304522,1.0,0.105806,-0.005636,0.014228,0.013788,0.103321,0.214103,0.11117,...,0.171698,0.084774,-0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,-0.014261,0.263843
OverallQual,0.032628,0.234812,0.105806,1.0,-0.091932,0.572323,0.550684,0.407252,0.239666,-0.059119,...,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,-0.027347,0.790982
OverallCond,-0.059316,-0.053281,-0.005636,-0.091932,1.0,-0.375983,0.073741,-0.125694,-0.046231,0.040229,...,-0.003334,-0.032589,0.070356,0.025504,0.054811,-0.001985,0.068777,-0.003511,0.04395,-0.077856
YearBuilt,0.02785,0.116685,0.014228,0.572323,-0.375983,1.0,0.592855,0.3116,0.249503,-0.049107,...,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,-0.013618,0.522897
YearRemodAdd,0.040581,0.083348,0.013788,0.550684,0.073741,0.592855,1.0,0.176529,0.128451,-0.067759,...,0.205726,0.226298,-0.193919,0.045286,-0.03874,0.005829,-0.010286,0.02149,0.035743,0.507101
MasVnrArea,0.023573,0.178469,0.103321,0.407252,-0.125694,0.3116,0.176529,1.0,0.261256,-0.07133,...,0.159991,0.122528,-0.109907,0.019144,0.062248,0.011928,-0.029512,-0.006723,-0.008317,0.472614
BsmtFinSF1,-0.069836,0.214367,0.214103,0.239666,-0.046231,0.249503,0.128451,0.261256,1.0,-0.050117,...,0.204306,0.111761,-0.102303,0.026451,0.062021,0.140491,0.003571,-0.015727,0.014359,0.38642
BsmtFinSF2,-0.065649,0.042463,0.11117,-0.059119,0.040229,-0.049107,-0.067759,-0.07133,-0.050117,1.0,...,0.067898,0.003093,0.036543,-0.029993,0.088871,0.041709,0.00494,-0.015211,0.031706,-0.011378


In [None]:
# DataFrame.empty is a boolean property, not a method; calling it raised
# "TypeError: 'bool' object is not callable".
train[nan_unintentional].empty

In [None]:
# Keep the describe() summary of the imputed columns for the lookup in the next cell.
x = train[nan_unintentional].describe()

In [None]:
# The 'top' row of the summary: the most frequent value of each column.
x.loc['top']

In [None]:
# Look down each column and replace missing values with the most frequent value of that
# column (strategy='most_frequent', not the median used for the numeric columns).
# NOTE(review): these columns hold strings; presumably Imputer only accepts numeric
# input, in which case this cell fails — verify, or use CategoricalImputer instead.
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(train[nan_unintentional])

In [None]:
# (number of non-numeric columns, number of them containing at least one NaN).
# Both values are shown as one tuple because only the last expression of a cell is
# displayed; .sum() on the boolean flags replaces Series.nonzero(), which newer pandas
# no longer provides.
len(cols_non_numeric), int(train[cols_non_numeric].isnull().any().sum())

Convert these to dummy variables

In [None]:
# The non-numeric column names.
cols_non_numeric

In [None]:
# How often each distinct SalePrice value occurs in train.
train['SalePrice'].value_counts()

In [None]:
# Number of non-NaN entries in each non-numeric column.
train[cols_non_numeric].count()

In [None]:
# Names of the non-numeric columns that contain at least one NaN.
# The old chain called .columns() on the result of .nonzero(), which is a tuple of
# arrays and has no such attribute; filtering the boolean Series by itself gives the names.
null_flags = train[cols_non_numeric].isnull().any()
null_flags[null_flags].index.tolist()

In [None]:
# Exploration aid: print the built-in help text for TransformerMixin.
help(TransformerMixin)

In [None]:
# Tiny scratch frame with one missing value per row, used to try out pandas calls below.
row_missing_first = [None, 2, 3]
row_missing_middle = [10, None, 30]
data = [row_missing_first, row_missing_middle]
df = pd.DataFrame(data)

In [None]:
# Display the scratch frame.
df

In [None]:
# Returns a new frame with every 10 replaced by 'abc'; the result is not assigned back to df.
df.replace(10, 'abc')

In [None]:
# Element-wise missing-value flags for the scratch frame.
pd.isnull(df)

In [None]:
# Number of non-missing entries per column.
df.count()

In [None]:
# Type of the single value stored at column label 2, row label 1.
type(df[2][1])
