In [26]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List

test_fp = r'data/house-prices-advanced-regression-techniques/test.csv'

In [58]:
df = pd.read_csv(test_fp, index_col=0)

df_num = df.select_dtypes(exclude='object')
df_obj = df.select_dtypes(include='object')

print(df.dtypes.value_counts())

object     43
int64      25
float64    11
dtype: int64


In [59]:
df_num.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,439.203704,52.619342,...,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,42.74688,22.376841,4955.517327,1.436812,1.11374,30.390071,21.130467,177.6259,455.268042,176.753926,...,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,0.0,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,0.0,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,1526.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [60]:
df_obj.describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1455,1459,107,1459,1459,1457,1459,1459,1459,1459,...,1383,1381,1381,1381,1459,3,290,51,1458,1459
unique,5,2,2,4,4,1,5,3,25,9,...,6,3,4,5,3,2,4,3,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Ex,MnPrv,Shed,WD,Normal
freq,1114,1453,70,934,1311,1457,1081,1396,218,1251,...,853,625,1293,1328,1301,2,172,46,1258,1204


# String Encoding

We got 43 columns of text --> should be modified.
Moreover some of them are:
* 12: filled with score string ('Excellent', 'Good', etc...)
* 5: filled with similar to score string (Good, average, below,...)
* ?: with qualitative string:
    - [brick, stone, wood]
    - [finished,..]
    - [paved, dirt,..]
    - [privacy,...]

We got to modify this string into numerical value. To do so we will:
* find all columns with the string specified above
* convert those columns with numerical value (based on the strings)
* repeat for each column tipe

For the others strings columns we can compare their value to the house price and try to find a first level correlation (for example neighrborhood or garage,...)

In [61]:
def conv_col_str2num(df:pd.DataFrame, str_list:List[str], num_list:List[float]=None)->pd.DataFrame:
    """
    Find columns which contains all strings of the string list.
    Replace those columns strings with the numeric list provided.
    Return the new dataframe of those columns and the columns name

    Args:
        df (pd.DataFrame): this to convert
        str_list (List[str]): Categories to find and to be converted. Expected to be from best to worse
        num_list (List[int], optional): score to replace each of the categories. defaults to a an automatoc score from best to worse.
    
    Returns:
        pd.DataFrame: contains converted categorical values to numeric score
    """

    if num_list is None:
        M = len(str_list) - 1
        num_list = [x/M for x in range(M, -1, -1)]

    x_str = df.isin(str_list).all()
    col = df.columns[x_str]
    df_rep = df[col].replace(str_list, num_list)
    df_new = df.drop(col, axis=1).join(df_rep)

    return df_new

In [62]:
str_lists = [
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
    ['Ex', 'Gd', 'TA', 'Fa' 'NA'],
    ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA'],
    ['Y', 'P', 'N'],
    ['Fin', 'RFn', 'Unf', 'NA'],
    ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
    ['Y', 'N'],
    ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Ubf', 'NA'],
    ['Gd', 'Av', 'Mn', 'No', 'NA'],
    ['Gtl', 'Mod', 'Sev'],
    ['AllPub', 'NoSewr', 'NoSeWa','ELO'],
    ['Lvl', 'Bnk', 'HLS', 'Low'],
    ['Reg', 'IR1', 'IR2', 'IR3'],
]

for str_list in str_lists:
    df_obj = conv_col_str2num(df_obj, str_list)

print(df_obj.dtypes.value_counts())
df_obj

object     38
float64     5
dtype: int64


Unnamed: 0_level_0,MSZoning,Street,Alley,Utilities,LotConfig,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,...,SaleType,SaleCondition,ExterQual,ExterCond,HeatingQC,CentralAir,PavedDrive,LandSlope,LandContour,LotShape
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,RH,Pave,,AllPub,Inside,NAmes,Feedr,Norm,1Fam,1Story,...,WD,Normal,0.6,0.6,0.6,1.0,1.0,1.0,1.000000,1.000000
1462,RL,Pave,,AllPub,Corner,NAmes,Norm,Norm,1Fam,1Story,...,WD,Normal,0.6,0.6,0.6,1.0,1.0,1.0,1.000000,0.666667
1463,RL,Pave,,AllPub,Inside,Gilbert,Norm,Norm,1Fam,2Story,...,WD,Normal,0.6,0.6,0.8,1.0,1.0,1.0,1.000000,0.666667
1464,RL,Pave,,AllPub,Inside,Gilbert,Norm,Norm,1Fam,2Story,...,WD,Normal,0.6,0.6,1,1.0,1.0,1.0,1.000000,0.666667
1465,RL,Pave,,AllPub,Inside,StoneBr,Norm,Norm,TwnhsE,1Story,...,WD,Normal,0.8,0.6,1,1.0,1.0,1.0,0.333333,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,RM,Pave,,AllPub,Inside,MeadowV,Norm,Norm,Twnhs,2Story,...,WD,Normal,0.6,0.6,0.8,1.0,1.0,1.0,1.000000,1.000000
2916,RM,Pave,,AllPub,Inside,MeadowV,Norm,Norm,TwnhsE,2Story,...,WD,Abnorml,0.6,0.6,0.6,1.0,1.0,1.0,1.000000,1.000000
2917,RL,Pave,,AllPub,Inside,Mitchel,Norm,Norm,1Fam,1Story,...,WD,Abnorml,0.6,0.6,1,1.0,1.0,1.0,1.000000,1.000000
2918,RL,Pave,,AllPub,Inside,Mitchel,Norm,Norm,1Fam,SFoyer,...,WD,Normal,0.6,0.6,0.6,1.0,1.0,1.0,1.000000,1.000000
