In [288]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [4]:
df = pd.read_csv('./train.csv')
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# check for nulls
print(df.shape)
print()

   
null_counts = df.isnull().sum()

null_columns_with_counts = {}

for key,value in null_counts.items():
    if value > 0:
        null_columns_with_counts[key] = {"count": value, "type": str(df[key].dtype)}

# 'Lot Frontage' => Set to 0
# 'Alley' => Set to "None"
# 'Mas Vnr Type' => Set to "None"
# 'Mas Vnr Area' => Set to 0
# 'Bsmt Qual' => Set to "NA"
# 'Bsmt Cond' => Set to "NA"
# 'Bsmt Exposure' => Set to "NA"
# 'BsmtFin Type 1' => Set to "NA"
# 'BsmtFin SF 1' => Set to 0
# 'BsmtFin Type 2' => Set to "NA"
# 'BsmtFin SF 2' => Set to 0
# 'Bsmt Unf SF' => Set to 0
# 'Bsmt Unf SF' => Set to 0
# 'Total Bsmt SF' => Set to 0
# 'Bsmt Full Bath' => Set to 0
# 'Bsmt Half Bath' => Set to 0
# 'Fireplace Qu' => Set to "NA"
# 'Garage Type' => Set to "NA"
# 'Garage Yr Blt' => Map to string, Set to "NA"
# 'Garage Finish' => Set to "NA"
# 'Garage Cars' => Drop it
# 'Garage Area' => Drop it
# 'Garage Qual' => Set to "NA"
# 'Garage Cond' => Set to "NA"
# 'Pool QC' => Set to "NA"
# 'Fence' => Set to "NA"
# 'Misc Feature' => Set to "NA"
null_columns_with_counts

(2051, 81)



{'Lot Frontage': {'count': 330, 'type': 'float64'},
 'Alley': {'count': 1911, 'type': 'object'},
 'Mas Vnr Type': {'count': 22, 'type': 'object'},
 'Mas Vnr Area': {'count': 22, 'type': 'float64'},
 'Bsmt Qual': {'count': 55, 'type': 'object'},
 'Bsmt Cond': {'count': 55, 'type': 'object'},
 'Bsmt Exposure': {'count': 58, 'type': 'object'},
 'BsmtFin Type 1': {'count': 55, 'type': 'object'},
 'BsmtFin SF 1': {'count': 1, 'type': 'float64'},
 'BsmtFin Type 2': {'count': 56, 'type': 'object'},
 'BsmtFin SF 2': {'count': 1, 'type': 'float64'},
 'Bsmt Unf SF': {'count': 1, 'type': 'float64'},
 'Total Bsmt SF': {'count': 1, 'type': 'float64'},
 'Bsmt Full Bath': {'count': 2, 'type': 'float64'},
 'Bsmt Half Bath': {'count': 2, 'type': 'float64'},
 'Fireplace Qu': {'count': 1000, 'type': 'object'},
 'Garage Type': {'count': 113, 'type': 'object'},
 'Garage Yr Blt': {'count': 114, 'type': 'float64'},
 'Garage Finish': {'count': 114, 'type': 'object'},
 'Garage Cars': {'count': 1, 'type': 'floa

In [6]:
def clean_data(data_frame):
    # loop through columns with null data
    for column in ['Lot Frontage','Alley','Mas Vnr Type','Mas Vnr Area','Bsmt Qual','Bsmt Cond','Bsmt Exposure','BsmtFin Type 1','BsmtFin SF 1','BsmtFin Type 2','BsmtFin SF 2','Bsmt Unf SF','Bsmt Unf SF','Total Bsmt SF','Bsmt Full Bath','Bsmt Half Bath','Fireplace Qu','Garage Type','Garage Yr Blt','Garage Finish','Garage Cars','Garage Area','Garage Qual','Garage Cond','Fence','Misc Feature', 'Pool QC']:
        if column in ['Lot Frontage','Mas Vnr Area','BsmtFin SF 1','BsmtFin SF 2','Bsmt Unf SF','Bsmt Unf SF','Total Bsmt SF','Bsmt Full Bath','Bsmt Half Bath','Garage Cars', 'Garage Area']:
            data_frame[column].fillna(0, inplace=True)
        elif column in ['Alley','Mas Vnr Type','Bsmt Qual','Bsmt Cond','Bsmt Exposure','BsmtFin Type 1','BsmtFin Type 2','Fireplace Qu','Garage Type','Garage Finish','Garage Qual','Garage Cond','Fence','Misc Feature','Pool QC']:
            data_frame[column].fillna("NA", inplace=True)
        elif column == 'Garage Yr Blt':
            data_frame[column].fillna("NA", inplace=True)
            data_frame[column] = data_frame[column].map(lambda x: str(x))

clean_data(df)

In [7]:
df_with_dummies = pd.get_dummies(df)

In [8]:
sp_wd_corr = df_with_dummies.corr()['SalePrice']
df_with_dummies[sp_wd_corr[sp_wd_corr > .5].keys()].head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Garage Cars,Garage Area,SalePrice,Foundation_PConc,Bsmt Qual_Ex,Kitchen Qual_Ex
0,6,1976,2005,289.0,725.0,725,1479,2,6,2.0,475.0,130500,0,0,0
1,7,1996,1997,132.0,913.0,913,2122,2,8,2.0,559.0,220000,1,0,0
2,5,1953,2007,0.0,1057.0,1057,1057,1,5,1.0,246.0,109000,0,0,0
3,5,2006,2007,0.0,384.0,744,1444,2,7,2.0,400.0,174000,1,0,0
4,6,1900,1993,0.0,676.0,831,1445,2,6,2.0,484.0,138500,1,0,0


In [11]:
sp_corr = df.corr()['SalePrice']
df[sp_corr[sp_corr > .5].keys()].head()

Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Garage Cars,Garage Area,SalePrice
0,6,1976,2005,289.0,725.0,725,1479,2,6,2.0,475.0,130500
1,7,1996,1997,132.0,913.0,913,2122,2,8,2.0,559.0,220000
2,5,1953,2007,0.0,1057.0,1057,1057,1,5,1.0,246.0,109000
3,5,2006,2007,0.0,384.0,744,1444,2,7,2.0,400.0,174000
4,6,1900,1993,0.0,676.0,831,1445,2,6,2.0,484.0,138500


In [390]:
from sklearn import linear_model

def get_r_scores(data_frame, column_name, target='price'):
    lm = linear_model.LinearRegression()
    local_fit = lm.fit(data_frame[column_name], data_frame[[target]])
    score = local_fit.score(data_frame[column_name], data_frame[[target]])
    print(f"{', '.join(column_name)} r2_score: {score}")
    return score

scores = {}

for column in sp_corr[sp_corr > .5].keys():
    score = get_r_scores(df_with_dummies, [column], target='SalePrice')
    if  score < 1:
        scores[column] = score

# https://stackoverflow.com/a/20948781/2548452
s = [(k, scores[k]) for k in sorted(scores, key=scores.get, reverse=True)]
s

Overall Qual r2_score: 0.6403310352003229
Year Built r2_score: 0.3270111597575761
Year Remod/Add r2_score: 0.30290675034176306
Mas Vnr Area r2_score: 0.25359139141207065
Total Bsmt SF r2_score: 0.39602238690537106
1st Flr SF r2_score: 0.38252510667511996
Gr Liv Area r2_score: 0.48586259284005934
Full Bath r2_score: 0.2894107548273267
TotRms AbvGrd r2_score: 0.254030432823128
Garage Cars r2_score: 0.4196206721721161
Garage Area r2_score: 0.42236591520046085
SalePrice r2_score: 1.0


[('Overall Qual', 0.6403310352003229),
 ('Gr Liv Area', 0.48586259284005934),
 ('Garage Area', 0.42236591520046085),
 ('Garage Cars', 0.4196206721721161),
 ('Total Bsmt SF', 0.39602238690537106),
 ('1st Flr SF', 0.38252510667511996),
 ('Year Built', 0.3270111597575761),
 ('Year Remod/Add', 0.30290675034176306),
 ('Full Bath', 0.2894107548273267),
 ('TotRms AbvGrd', 0.254030432823128),
 ('Mas Vnr Area', 0.25359139141207065)]

In [391]:
for column in sp_wd_corr[sp_wd_corr > .5].keys():
    score = get_r_scores(df_with_dummies, [column], target='SalePrice')
    if  score < 1:
        scores[column] = score

# https://stackoverflow.com/a/20948781/2548452
s = [(k, scores[k]) for k in sorted(scores, key=scores.get, reverse=True)]
s

Overall Qual r2_score: 0.6403310352003229
Year Built r2_score: 0.3270111597575761
Year Remod/Add r2_score: 0.30290675034176306
Mas Vnr Area r2_score: 0.25359139141207065
Total Bsmt SF r2_score: 0.39602238690537106
1st Flr SF r2_score: 0.38252510667511996
Gr Liv Area r2_score: 0.48586259284005934
Full Bath r2_score: 0.2894107548273267
TotRms AbvGrd r2_score: 0.254030432823128
Garage Cars r2_score: 0.4196206721721161
Garage Area r2_score: 0.42236591520046085
SalePrice r2_score: 1.0
Foundation_PConc r2_score: 0.27989057265271455
Bsmt Qual_Ex r2_score: 0.34397900016466043
Kitchen Qual_Ex r2_score: 0.30391459442452795


[('Overall Qual', 0.6403310352003229),
 ('Gr Liv Area', 0.48586259284005934),
 ('Garage Area', 0.42236591520046085),
 ('Garage Cars', 0.4196206721721161),
 ('Total Bsmt SF', 0.39602238690537106),
 ('1st Flr SF', 0.38252510667511996),
 ('Bsmt Qual_Ex', 0.34397900016466043),
 ('Year Built', 0.3270111597575761),
 ('Kitchen Qual_Ex', 0.30391459442452795),
 ('Year Remod/Add', 0.30290675034176306),
 ('Full Bath', 0.2894107548273267),
 ('Foundation_PConc', 0.27989057265271455),
 ('TotRms AbvGrd', 0.254030432823128),
 ('Mas Vnr Area', 0.25359139141207065)]

In [125]:
get_r_scores(df_with_dummies, ['Overall Qual','Year Built','Year Remod/Add','Mas Vnr Area','Total Bsmt SF','1st Flr SF','Gr Liv Area'], target="SalePrice")

Overall Qual, Year Built, Year Remod/Add, Mas Vnr Area, Total Bsmt SF, 1st Flr SF, Gr Liv Area r2_score: 0.7871655967651169


0.7871655967651169

In [523]:
lr = LinearRegression()

In [563]:
removed = ['MS Zoning_A (agr)', 'Utilities_NoSeWa', 'Neighborhood_GrnHill','Neighborhood_Landmrk', 'Condition 2_Artery', 'Condition 2_RRAe','Condition 2_RRAn', 'Condition 2_RRNn', 'Roof Matl_ClyTile','Roof Matl_Membran', 'Exterior 1st_CBlock', 'Exterior 1st_ImStucc','Exterior 1st_Stone', 'Exterior 2nd_Stone', 'Bsmt Cond_Ex', 'Bsmt Cond_Po','Heating_OthW', 'Heating_Wall', 'Heating QC_Po', 'Electrical_Mix','Functional_Sal', 'Functional_Sev', 'Garage Yr Blt_1895.0','Garage Yr Blt_1896.0', 'Garage Yr Blt_1914.0', 'Garage Yr Blt_1919.0','Garage Yr Blt_1929.0', 'Garage Yr Blt_1933.0', 'Garage Yr Blt_1936.0','Garage Yr Blt_1942.0', 'Garage Yr Blt_1945.0', 'Garage Yr Blt_2207.0','Garage Qual_Ex', 'Pool QC_Fa', 'Pool QC_Gd', 'Misc Feature_Elev','Misc Feature_TenC','Misc Feature_Elev', 'Misc Feature_Gar2', 'Misc Feature_NA', 'Misc Feature_Othr', 'Misc Feature_Shed', 'Misc Feature_TenC','MS SubClass', 'Fence_GdPrv', 'Fence_GdWo', 'Fence_MnPrv', 'Fence_MnWw', 'Fence_NA','Pool QC_Ex', 'Pool QC_Fa', 'Pool QC_Gd', 'Pool QC_NA', 'Pool QC_TA','Garage Yr Blt_1895.0', 'Garage Yr Blt_1896.0', 'Garage Yr Blt_1900.0', 'Garage Yr Blt_1910.0', 'Garage Yr Blt_1914.0', 'Garage Yr Blt_1915.0', 'Garage Yr Blt_1916.0', 'Garage Yr Blt_1917.0', 'Garage Yr Blt_1918.0', 'Garage Yr Blt_1919.0', 'Garage Yr Blt_1920.0', 'Garage Yr Blt_1921.0', 'Garage Yr Blt_1922.0', 'Garage Yr Blt_1923.0', 'Garage Yr Blt_1924.0', 'Garage Yr Blt_1925.0', 'Garage Yr Blt_1926.0', 'Garage Yr Blt_1927.0', 'Garage Yr Blt_1928.0', 'Garage Yr Blt_1929.0', 'Garage Yr Blt_1930.0', 'Garage Yr Blt_1931.0', 'Garage Yr Blt_1932.0', 'Garage Yr Blt_1933.0', 'Garage Yr Blt_1934.0', 'Garage Yr Blt_1935.0', 'Garage Yr Blt_1936.0', 'Garage Yr Blt_1937.0', 'Garage Yr Blt_1938.0', 'Garage Yr Blt_1939.0', 'Garage Yr Blt_1940.0', 'Garage Yr Blt_1941.0', 'Garage Yr Blt_1942.0', 'Garage Yr Blt_1945.0', 'Garage Yr Blt_1946.0', 'Garage Yr Blt_1947.0', 'Garage Yr Blt_1948.0', 'Garage Yr Blt_1949.0', 'Garage Yr Blt_1950.0', 'Garage Yr Blt_1951.0', 'Garage Yr Blt_1952.0', 'Garage Yr Blt_1953.0', 'Garage Yr Blt_1954.0', 'Garage Yr Blt_1955.0', 'Garage Yr Blt_1956.0', 'Garage Yr Blt_1957.0', 'Garage Yr Blt_1958.0', 'Garage Yr Blt_1959.0', 'Garage Yr Blt_1960.0', 'Garage Yr Blt_1961.0', 'Garage Yr Blt_1962.0', 'Garage Yr Blt_1963.0', 'Garage Yr Blt_1964.0', 'Garage Yr Blt_1965.0', 'Garage Yr Blt_1966.0', 'Garage Yr Blt_1967.0', 'Garage Yr Blt_1968.0', 'Garage Yr Blt_1969.0', 'Garage Yr Blt_1970.0', 'Garage Yr Blt_1971.0', 'Garage Yr Blt_1972.0', 'Garage Yr Blt_1973.0', 'Garage Yr Blt_1974.0', 'Garage Yr Blt_1975.0', 'Garage Yr Blt_1976.0', 'Garage Yr Blt_1977.0', 'Garage Yr Blt_1978.0', 'Garage Yr Blt_1979.0', 'Garage Yr Blt_1980.0', 'Garage Yr Blt_1981.0', 'Garage Yr Blt_1982.0', 'Garage Yr Blt_1983.0', 'Garage Yr Blt_1984.0', 'Garage Yr Blt_1985.0', 'Garage Yr Blt_1986.0', 'Garage Yr Blt_1987.0', 'Garage Yr Blt_1988.0', 'Garage Yr Blt_1989.0', 'Garage Yr Blt_1990.0', 'Garage Yr Blt_1991.0', 'Garage Yr Blt_1992.0', 'Garage Yr Blt_1993.0', 'Garage Yr Blt_1994.0', 'Garage Yr Blt_1995.0', 'Garage Yr Blt_1996.0', 'Garage Yr Blt_1997.0', 'Garage Yr Blt_1998.0', 'Garage Yr Blt_1999.0', 'Garage Yr Blt_2000.0', 'Garage Yr Blt_2001.0', 'Garage Yr Blt_2002.0', 'Garage Yr Blt_2003.0', 'Garage Yr Blt_2004.0', 'Garage Yr Blt_2005.0', 'Garage Yr Blt_2006.0', 'Garage Yr Blt_2007.0', 'Garage Yr Blt_2008.0', 'Garage Yr Blt_2009.0', 'Garage Yr Blt_2010.0', 'Garage Yr Blt_2207.0','BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF','Bsmt Full Bath', 'Bsmt Half Bath', ] 
all_features = ['Id', 'PID','Pool Area','Year Built', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Remod/Add', 'Mas Vnr Area',  'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch',  'Misc Val', 'Mo Sold', 'Yr Sold','Sale Type_WD ','Sale Type_COD','Sale Type_CWD', 'Sale Type_Con', 'Sale Type_ConLD', 'Sale Type_ConLI', 'Sale Type_ConLw', 'Sale Type_New', 'Sale Type_Oth','MS Zoning_A (agr)', 'MS Zoning_C (all)', 'MS Zoning_FV', 'MS Zoning_I (all)', 'MS Zoning_RH', 'MS Zoning_RL', 'MS Zoning_RM','Street_Grvl', 'Street_Pave', 'Alley_Grvl', 'Alley_NA', 'Alley_Pave', 'Lot Shape_IR1', 'Lot Shape_IR2', 'Lot Shape_IR3', 'Lot Shape_Reg', 'Land Contour_Bnk', 'Land Contour_HLS', 'Land Contour_Low', 'Land Contour_Lvl', 'Utilities_AllPub', 'Utilities_NoSeWa', 'Utilities_NoSewr', 'Lot Config_Corner', 'Lot Config_CulDSac', 'Lot Config_FR2', 'Lot Config_FR3', 'Lot Config_Inside', 'Land Slope_Gtl', 'Land Slope_Mod', 'Land Slope_Sev', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_Greens', 'Neighborhood_GrnHill', 'Neighborhood_IDOTRR', 'Neighborhood_Landmrk', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'Condition 1_Artery', 'Condition 1_Feedr', 'Condition 1_Norm', 'Condition 1_PosA', 'Condition 1_PosN', 'Condition 1_RRAe', 'Condition 1_RRAn', 'Condition 1_RRNe', 'Condition 1_RRNn', 'Condition 2_Artery', 'Condition 2_Feedr', 'Condition 2_Norm', 'Condition 2_PosA', 'Condition 2_PosN', 'Condition 2_RRAe', 'Condition 2_RRAn', 'Condition 2_RRNn', 'Bldg Type_1Fam', 'Bldg Type_2fmCon', 'Bldg Type_Duplex', 'Bldg Type_Twnhs', 'Bldg Type_TwnhsE', 'House Style_1.5Fin', 'House Style_1.5Unf', 'House Style_1Story', 'House Style_2.5Fin', 'House Style_2.5Unf', 'House Style_2Story', 'House Style_SFoyer', 'House Style_SLvl', 'Roof Style_Flat', 'Roof Style_Gable', 'Roof Style_Gambrel', 'Roof Style_Hip', 'Roof Style_Mansard', 'Roof Style_Shed', 'Roof Matl_ClyTile', 'Roof Matl_CompShg', 'Roof Matl_Membran', 'Roof Matl_Tar&Grv', 'Roof Matl_WdShake', 'Roof Matl_WdShngl', 'Exterior 1st_AsbShng', 'Exterior 1st_AsphShn', 'Exterior 1st_BrkComm', 'Exterior 1st_BrkFace', 'Exterior 1st_CBlock', 'Exterior 1st_CemntBd', 'Exterior 1st_HdBoard', 'Exterior 1st_ImStucc', 'Exterior 1st_MetalSd', 'Exterior 1st_Plywood', 'Exterior 1st_Stone', 'Exterior 1st_Stucco', 'Exterior 1st_VinylSd', 'Exterior 1st_Wd Sdng', 'Exterior 1st_WdShing', 'Exterior 2nd_AsbShng', 'Exterior 2nd_AsphShn', 'Exterior 2nd_Brk Cmn', 'Exterior 2nd_BrkFace', 'Exterior 2nd_CBlock', 'Exterior 2nd_CmentBd', 'Exterior 2nd_HdBoard', 'Exterior 2nd_ImStucc', 'Exterior 2nd_MetalSd', 'Exterior 2nd_Plywood', 'Exterior 2nd_Stone', 'Exterior 2nd_Stucco', 'Exterior 2nd_VinylSd', 'Exterior 2nd_Wd Sdng', 'Exterior 2nd_Wd Shng', 'Mas Vnr Type_BrkCmn', 'Mas Vnr Type_BrkFace', 'Mas Vnr Type_NA', 'Mas Vnr Type_None', 'Mas Vnr Type_Stone', 'Exter Qual_Ex', 'Exter Qual_Fa', 'Exter Qual_Gd', 'Exter Qual_TA', 'Exter Cond_Ex', 'Exter Cond_Fa', 'Exter Cond_Gd', 'Exter Cond_Po', 'Exter Cond_TA', 'Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'Foundation_Stone', 'Foundation_Wood', 'Bsmt Qual_Ex', 'Bsmt Qual_Fa', 'Bsmt Qual_Gd', 'Bsmt Qual_NA', 'Bsmt Qual_Po', 'Bsmt Qual_TA', 'Bsmt Cond_Ex', 'Bsmt Cond_Fa', 'Bsmt Cond_Gd', 'Bsmt Cond_NA', 'Bsmt Cond_Po', 'Bsmt Cond_TA', 'Bsmt Exposure_Av', 'Bsmt Exposure_Gd', 'Bsmt Exposure_Mn', 'Bsmt Exposure_NA', 'Bsmt Exposure_No', 'BsmtFin Type 1_ALQ', 'BsmtFin Type 1_BLQ', 'BsmtFin Type 1_GLQ', 'BsmtFin Type 1_LwQ', 'BsmtFin Type 1_NA', 'BsmtFin Type 1_Rec', 'BsmtFin Type 1_Unf', 'BsmtFin Type 2_ALQ', 'BsmtFin Type 2_BLQ', 'BsmtFin Type 2_GLQ', 'BsmtFin Type 2_LwQ', 'BsmtFin Type 2_NA', 'BsmtFin Type 2_Rec', 'BsmtFin Type 2_Unf', 'Heating_GasA', 'Heating_GasW', 'Heating_Grav', 'Heating_OthW', 'Heating_Wall', 'Heating QC_Ex', 'Heating QC_Fa', 'Heating QC_Gd', 'Heating QC_Po', 'Heating QC_TA', 'Central Air_N', 'Central Air_Y', 'Electrical_FuseA', 'Electrical_FuseF', 'Electrical_FuseP', 'Electrical_Mix', 'Electrical_SBrkr', 'Kitchen Qual_Ex', 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA', 'Functional_Maj1', 'Functional_Maj2', 'Functional_Min1', 'Functional_Min2', 'Functional_Mod', 'Functional_Sal', 'Functional_Sev', 'Functional_Typ', 'Fireplace Qu_Ex', 'Fireplace Qu_Fa', 'Fireplace Qu_Gd', 'Fireplace Qu_NA', 'Fireplace Qu_Po', 'Fireplace Qu_TA', 'Garage Type_2Types', 'Garage Type_Attchd', 'Garage Type_Basment', 'Garage Type_BuiltIn', 'Garage Type_CarPort', 'Garage Type_Detchd', 'Garage Type_NA',  'Garage Yr Blt_NA', 'Garage Finish_Fin', 'Garage Finish_NA', 'Garage Finish_RFn', 'Garage Finish_Unf', 'Garage Qual_Ex', 'Garage Qual_Fa', 'Garage Qual_Gd', 'Garage Qual_NA', 'Garage Qual_Po', 'Garage Qual_TA', 'Garage Cond_Ex', 'Garage Cond_Fa', 'Garage Cond_Gd', 'Garage Cond_NA', 'Garage Cond_Po', 'Garage Cond_TA', 'Paved Drive_N', 'Paved Drive_P', 'Paved Drive_Y',]
cum_columns = []

def get_all_scores(column):
    X = df_with_dummies[column]
    y = df_with_dummies['SalePrice']
    X_train,X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(lr, X_train, y_train, cv=kf)
    if scores.mean() > .0:
        print(f"{column} score: {scores.mean()}")
        cum_columns.append(column[0])


        
for column in all_features:
    get_all_scores([column])

print(len(cum_columns))
print(len(all_features))
print(len(cum_columns) == len(all_features))
    
get_all_scores(cum_columns)

['PID'] score: 0.06166466832075255
['Year Built'] score: 0.31096176711654994
['Lot Frontage'] score: 0.02183930882506886
['Lot Area'] score: 0.09959312926392964
['Overall Qual'] score: 0.6323404401002144
['Overall Cond'] score: 0.000761730298327068
['Year Remod/Add'] score: 0.29407707577336684
['Mas Vnr Area'] score: 0.2593470887937806
['Total Bsmt SF'] score: 0.3572504825895565
['1st Flr SF'] score: 0.346219005622826
['2nd Flr SF'] score: 0.05699853271146547
['Gr Liv Area'] score: 0.46583119617471314
['Full Bath'] score: 0.28004458555095013
['Half Bath'] score: 0.07028209160251395
['Bedroom AbvGr'] score: 0.009743018594787
['Kitchen AbvGr'] score: 0.005461697807779398
['TotRms AbvGrd'] score: 0.2651453191160885
['Fireplaces'] score: 0.2172376514544402
['Garage Cars'] score: 0.4179820273885719
['Garage Area'] score: 0.4065826594651907
['Wood Deck SF'] score: 0.08769797429872526
['Open Porch SF'] score: 0.0995420638352464
['Enclosed Porch'] score: 0.009189757684276
['Screen Porch'] scor

In [564]:
all_features = ['PID', 'Year Built', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Remod/Add', 'Mas Vnr Area', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', 'Screen Porch', 'Sale Type_WD ', 'Sale Type_COD', 'Sale Type_New', 'MS Zoning_C (all)', 'MS Zoning_FV', 'MS Zoning_RL', 'MS Zoning_RM', 'Alley_Grvl', 'Alley_NA', 'Lot Shape_IR1', 'Lot Shape_IR2', 'Lot Shape_Reg', 'Land Contour_Bnk', 'Land Contour_HLS', 'Lot Config_CulDSac', 'Lot Config_Inside', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_Edwards', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_NAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_Sawyer', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Condition 1_Artery', 'Condition 1_Feedr', 'Condition 1_Norm', 'Condition 2_PosA', 'Bldg Type_1Fam', 'Bldg Type_2fmCon', 'Bldg Type_Duplex', 'Bldg Type_Twnhs', 'House Style_1.5Fin', 'House Style_2Story', 'Roof Style_Gable', 'Roof Style_Hip', 'Roof Matl_WdShngl', 'Exterior 1st_AsbShng', 'Exterior 1st_CemntBd', 'Exterior 1st_HdBoard', 'Exterior 1st_MetalSd', 'Exterior 1st_VinylSd', 'Exterior 1st_Wd Sdng', 'Exterior 2nd_AsbShng', 'Exterior 2nd_CmentBd', 'Exterior 2nd_HdBoard', 'Exterior 2nd_MetalSd', 'Exterior 2nd_VinylSd', 'Exterior 2nd_Wd Sdng', 'Mas Vnr Type_BrkFace', 'Mas Vnr Type_None', 'Mas Vnr Type_Stone', 'Exter Qual_Ex', 'Exter Qual_Fa', 'Exter Qual_Gd', 'Exter Qual_TA', 'Exter Cond_Fa', 'Exter Cond_TA', 'Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'Bsmt Qual_Ex', 'Bsmt Qual_Fa', 'Bsmt Qual_Gd', 'Bsmt Qual_NA', 'Bsmt Qual_TA', 'Bsmt Cond_Fa', 'Bsmt Cond_Gd', 'Bsmt Cond_NA', 'Bsmt Exposure_Av', 'Bsmt Exposure_Gd', 'Bsmt Exposure_NA', 'Bsmt Exposure_No', 'BsmtFin Type 1_BLQ', 'BsmtFin Type 1_GLQ', 'BsmtFin Type 1_NA', 'BsmtFin Type 1_Rec', 'BsmtFin Type 1_Unf', 'BsmtFin Type 2_NA', 'BsmtFin Type 2_Unf', 'Heating_GasA', 'Heating QC_Ex', 'Heating QC_Fa', 'Heating QC_Gd', 'Heating QC_TA', 'Central Air_N', 'Central Air_Y', 'Electrical_FuseA', 'Electrical_FuseF', 'Electrical_SBrkr', 'Kitchen Qual_Ex', 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA', 'Functional_Typ', 'Fireplace Qu_Ex', 'Fireplace Qu_Gd', 'Fireplace Qu_NA', 'Fireplace Qu_TA', 'Garage Type_Attchd', 'Garage Type_BuiltIn', 'Garage Type_Detchd', 'Garage Type_NA', 'Garage Yr Blt_NA', 'Garage Finish_Fin', 'Garage Finish_NA', 'Garage Finish_RFn', 'Garage Finish_Unf', 'Garage Qual_Fa', 'Garage Qual_NA', 'Garage Qual_TA', 'Garage Cond_Fa', 'Garage Cond_NA', 'Garage Cond_TA', 'Paved Drive_N', 'Paved Drive_P', 'Paved Drive_Y']

for column in (set(removed) & set(all_features)):
    all_features.remove(column)

best_features = ['Overall Qual','Year Built','Year Remod/Add','Mas Vnr Area','Total Bsmt SF','1st Flr SF','Gr Liv Area']

X = df_with_dummies[all_features]
y = df_with_dummies['SalePrice']


X_train,X_test, y_train, y_test = train_test_split(X, y, random_state=42)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(lr, X_train, y_train, cv=kf)
print(scores)
print(scores.mean())

[0.8956168  0.9016187  0.61531086 0.89172457 0.89266766 0.90466114
 0.86944902 0.78830428 0.90752887 0.87990853]
0.854679042862229


In [565]:
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.fit_transform(X_test)
lr.fit(X_train, y_train)
print(lr.score(X_test, y_test))
lr.fit(X_train_scaled, y_train)
lr.score(X_test_scaled, y_test)
mean_squared_error(y_test, lr.predict(X_test))
all_features_score = mean_squared_error(y_test, lr.predict(X_test_scaled))

0.8994878220295363


In [566]:
clf = Lasso(alpha=0.1,normalize=True, max_iter=1e5)
clf.fit(X_train_scaled,y_train)
clf.score(X_test_scaled,y_test)
mean_squared_error(y_test, clf.predict(X_test_scaled))

613416016.2462058

In [567]:
rr = Ridge(alpha=0.1,max_iter=1e5)
rr.fit(X_train_scaled,y_train)
rr.score(X_test_scaled,y_test)
mean_squared_error(y_test, rr.predict(X_test_scaled))

613157041.1885523

In [568]:
en = ElasticNet(alpha=0.1,max_iter=1e5)
en.fit(X_train_scaled,y_train)
en.score(X_test_scaled,y_test)
mean_squared_error(y_test, en.predict(X_test_scaled))

604003243.0770907

In [562]:
# https://www.kaggle.com/dansbecker/submitting-from-a-kernel
# Read the test data
test = pd.read_csv('./test.csv')
clean_data(test)
test_with_dummies = pd.get_dummies(test)
# Treat the test data in the same way as training data. In this case, pull same columns.
test_X = test_with_dummies[all_features]
scaled_test_X = ss.fit_transform(test_X)


# # Use the model to make predictions
predicted_prices = en.predict(scaled_test_X)
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})

file_name = './submissions/ninth_submission.csv'
my_submission.to_csv(file_name, index=False)

In [70]:
import subprocess, webbrowser
result = subprocess.check_output(f'kaggle competitions submit -f {file_name} -m "uploading a new set" dsi-us-5-project-2-regression-challenge')
if result == b'Successfully submitted to DSI-US-5 Project 2 Regression Challenge':
    webbrowser.open("https://www.kaggle.com/c/dsi-us-5-project-2-regression-challenge/leaderboard")