In [177]:
import pandas as pd
import numpy as np

In [178]:
df = pd.read_csv('files/train.csv')
df = df.sample(frac = 1)

In [179]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


# Check for NaN columns..


In [180]:
for cols in df.columns:
    if df[cols].isnull().any():
        print(cols)

LotFrontage
Alley
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature


# Let's go through how I might want to treat each column, before I get into dealing with NaN's...

## MSSubClass

The values it takes are kind of weird considering its categorical data. If I decide to use a decision tree-based algorithm then it's fine. If not I'll need to at least normailize this one.

## MSZoning 
Should be one-hot encoded.

## LotFrontage, LotArea
Numerical data. We're dealing with a bunch of different units here, and if I wasn't working with a decision tree I'd really need to be careful to normalize and standardize the data.

## Street, Alley, LandContour, LotConfig, Neighborhood, Condition1, Condition2, BldgType, HouseStyle
Categorical. To be one-hot encoded.

## OverallCond and OverallQual
Numerical.

## YearBuilt, YearRemodAdd
Numerical data. Need to think about how I should treat this. Could maybe base them off of how many days ago they were built / remodeled.

## RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType
All categorical.

## MasVnrArea
Numerical.

## LotShape, Utilities, LandSlope, ExterQual, ExterCond
Categorical, but admits an ordered set, with Excellent > Good > Average/Typical etc.. so can convert to numerical to save on columns

## Foundation
Categorical.

## BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1
Categorical, but again admitting an ordered set, so should be converted into numerical data to save on columns.

## BsmtFinSF1
Numerical.

## BsmtFinType2
Categorical, but admitting an ordered set -- convert to numeric.

## BsmtFinSF2, BsmtUnfSF, TotalBsmtSF
Numerical.

## Heating
Categorical.

## HeatingQC
Categorical, but ordered set so convert to numeric.

## CentralAir, Electrical
Categorical.

## 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath,  BsmtHalfBath, FullBath, HalfBath, Bedroom, Kitchen
Numerical

## KitchenQual
Categoric, but admits ordered set, so convert to numerical.

## TotRmsAbvGrd
Numerical.

## Functional
Categorical, but admits ordered set. Convert to numerical.

## Fireplaces
Numerical.

## FireplaceQu
Categorical -- convertable to numerical.

## GarageType
Categorical.

## GarageFinish
Categorical -- convertable to numerical.

## GarageYrBlt
Possibly convert to 'how many days ago'.

## GarageCars, GarageArea
Numerical.

## GarageQual, GarageCond
Categorical -- convertable to numerical.

## PavedDrive
I think this admits an ordered set in terms of 'paved-ness'. So possibly convertable to numerical, although could be safe and just keep it categorical -- only like 3 unique values.

## WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea
Numerical.

## PoolQC, Fence
Categorical -- convertable to numerical.

## MiscFeature
Categorical.

## MiscVal
Numerical.

## MoSold , YrSold
MoSold might not be worth the effort to include. I won't want to one-hot encode 12 columns for each month, and I doubt the month something was sold can be worth the 12 columns I'd be costing the training data. YrSold I can express in terms of 'days since the present'. I could maybe express MoSold as numerical data of values between 1 and 12. But I don't think that's justifiable because it's weird to say that month 2 > month 1 or something.

## SaleType, SaleCondition
Categorical.

# Okay, with that, here are my next steps:

There isn't a whole lot of rows to this data, so if I add too many columns we start worrying about big-p, little-n issues. I need to try and impute my NaN's instead of deleting those rows so that the little-n doesn't get even smaller. I'll also really want to favor converting categorical data that admits an ordered set into numerical data so I can save up on columns.
1. Create a function to deal with missing NaN values sample-by-sample by selecting a subset of the data that shares as many characteristics as possible with a sample and picking the most likely value to impute given this. 
2. Create a function that converts categorical-but-numerical-convertable columns into numeric columns.
3. Deal with time-series columns.

It might be worth trying to find similar columns by one-hot encoding and normalizing data, and seeing which samples have the highest cosine similarity to the one in question.. I'd need to normalize numerical data, or else different magnitudes would have disproportionate influence on the the cosine similarity..

# One-hot encoding categorical data

Also going to drop the MoSold column.

In [181]:
df = df.drop(columns = ['MoSold','Id'])

In [182]:
to_one_hot = [
    'MiscFeature',
    'SaleType',
    'SaleCondition',
    'CentralAir', 
    'Electrical',
    'Heating',
    'Foundation',
    'RoofStyle', 
    'RoofMatl', 
    'Exterior1st', 
    'Exterior2nd', 
    'MasVnrType',
    'Street', 
    'Alley', 
    'LandContour', 
    'LotConfig', 
    'Neighborhood', 
    'Condition1', 
    'Condition2', 
    'BldgType', 
    'HouseStyle',
    'MSZoning',
    'MSSubClass',
    'GarageType',
    'PavedDrive'
    ]

In [183]:
time_series_columns = ['YrSold','YearBuilt','YearRemodAdd','GarageYrBlt']

And now for the data I want to make numeric.

In [184]:
numerical = [
    'MiscVal',
    'WoodDeckSF', 
    'OpenPorchSF', 
    'EnclosedPorch', 
    '3SsnPorch', 
    'ScreenPorch', 
    'PoolArea',
    'GarageCars', 
    'GarageArea',
    'Fireplaces',
    'TotRmsAbvGrd',
    '1stFlrSF', 
    '2ndFlrSF', 
    'LowQualFinSF', 
    'GrLivArea', 
    'BsmtFullBath',  
    'BsmtHalfBath', 
    'FullBath', 
    'HalfBath', 
    'BedroomAbvGr', 
    'KitchenAbvGr',
    'BsmtFinSF2', 
    'BsmtUnfSF', 
    'TotalBsmtSF',
    'BsmtFinSF1',
    'MasVnrArea',
    'LotFrontage', 
    'LotArea',
    'OverallCond',
    'OverallQual',
      
]

In [185]:
columns_so_far = to_one_hot + time_series_columns + numerical

In [186]:
full_columns = list(df.columns)

In [187]:
to_numerical = list(set(columns_so_far).symmetric_difference(full_columns))

In [188]:
to_numerical.remove('SalePrice')

In [197]:
to_numerical

['GarageFinish',
 'Functional',
 'GarageCond',
 'HeatingQC',
 'BsmtCond',
 'PoolQC',
 'KitchenQual',
 'Utilities',
 'ExterCond',
 'BsmtExposure',
 'ExterQual',
 'GarageQual',
 'Fence',
 'LotShape',
 'FireplaceQu',
 'LandSlope',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual']

## One-hot encode the categorical columns

In [190]:
df = pd.get_dummies(df,columns = to_one_hot)

## Convert the to_numerical data to numerical

In [191]:
def conv_to_numerical(df, col, ordered_categories):
    ordering = list(reversed(range(len(ordered_categories))))
    mappings = dict(list(zip(ordered_categories, ordering)))
    df[col] = [mappings[x] for x in df[col]]
    return df

In [192]:
list(df[to_numerical[0]].unique())

['Unf', 'RFn', 'Fin', nan]

# Normalize the numerical columns for the cosine similarity steps
Normalizing is not generally a good idea for a decision tree-based model, so I'll probably make it separate to df.

In [193]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

cos_sim_df = df.copy()
#cos_sim_df = cos_sim_df.drop(columns = numerical)
cos_sim_df[numerical] = min_max_scaler.fit_transform(df[numerical])

In [194]:
cos_sim_df

Unnamed: 0,LotFrontage,LotArea,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,MSSubClass_190,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,PavedDrive_N,PavedDrive_P,PavedDrive_Y
124,0.092466,0.073584,IR1,AllPub,Gtl,0.555556,0.500,1979,1998,0.000,...,0,0,1,0,0,0,0,0,0,1
159,0.386986,0.084498,IR1,AllPub,Gtl,0.666667,0.500,2005,2006,0.285,...,0,0,1,0,0,0,0,0,0,1
1074,0.181507,0.033915,Reg,AllPub,Gtl,0.666667,0.500,2006,2006,0.000,...,0,0,1,0,0,0,0,0,0,1
1263,0.133562,0.057094,Reg,AllPub,Gtl,0.555556,0.625,1919,1950,0.000,...,0,0,0,0,0,0,1,1,0,0
107,0.099315,0.021968,Reg,AllPub,Gtl,0.444444,0.500,1948,1950,0.000,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,0.253425,0.068078,IR1,AllPub,Mod,0.777778,0.625,1970,1970,0.000,...,0,0,1,0,0,0,0,0,0,1
1030,,0.027026,Reg,AllPub,Gtl,0.444444,0.875,1916,1995,0.000,...,1,0,0,0,0,0,0,1,0,0
391,0.171233,0.050990,IR1,AllPub,Gtl,0.555556,0.500,2001,2002,0.000,...,0,0,0,0,1,0,0,0,0,1
369,,0.039870,IR1,AllPub,Gtl,0.444444,0.750,1959,2006,0.000,...,0,0,0,0,0,0,1,0,0,1


In [195]:
def smart_fillna(column):
    

SyntaxError: unexpected EOF while parsing (800286481.py, line 2)