In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [3]:
# Load the training data from the working directory and preview the first rows.
df = pd.read_csv('./train.csv')
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [59]:
# Null audit: print the frame's dimensions, then collect every column that has
# at least one missing value together with its null count and dtype.
print(df.shape)
print()

null_counts = df.isnull().sum()

null_columns_with_counts = {
    name: {"count": n_missing, "type": str(df[name].dtype)}
    for name, n_missing in null_counts.items()
    if n_missing > 0
}

# Handling of the null-bearing columns, as implemented by clean_data:
#
#   filled with 0:
#     'Lot Frontage', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
#     'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
#     'Garage Cars', 'Garage Area'
#
#   filled with "NA":
#     'Alley', 'Mas Vnr Type', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
#     'BsmtFin Type 1', 'BsmtFin Type 2', 'Fireplace Qu', 'Garage Type',
#     'Garage Finish', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Fence',
#     'Misc Feature'
#
#   filled with "NA", then every value converted to a string:
#     'Garage Yr Blt'
null_columns_with_counts

(2051, 81)



{'Lot Frontage': {'count': 330, 'type': 'float64'},
 'Alley': {'count': 1911, 'type': 'object'},
 'Mas Vnr Type': {'count': 22, 'type': 'object'},
 'Mas Vnr Area': {'count': 22, 'type': 'float64'},
 'Bsmt Qual': {'count': 55, 'type': 'object'},
 'Bsmt Cond': {'count': 55, 'type': 'object'},
 'Bsmt Exposure': {'count': 58, 'type': 'object'},
 'BsmtFin Type 1': {'count': 55, 'type': 'object'},
 'BsmtFin SF 1': {'count': 1, 'type': 'float64'},
 'BsmtFin Type 2': {'count': 56, 'type': 'object'},
 'BsmtFin SF 2': {'count': 1, 'type': 'float64'},
 'Bsmt Unf SF': {'count': 1, 'type': 'float64'},
 'Total Bsmt SF': {'count': 1, 'type': 'float64'},
 'Bsmt Full Bath': {'count': 2, 'type': 'float64'},
 'Bsmt Half Bath': {'count': 2, 'type': 'float64'},
 'Fireplace Qu': {'count': 1000, 'type': 'object'},
 'Garage Type': {'count': 113, 'type': 'object'},
 'Garage Yr Blt': {'count': 114, 'type': 'float64'},
 'Garage Finish': {'count': 114, 'type': 'object'},
 'Garage Cars': {'count': 1, 'type': 'floa

In [94]:
def clean_data(data_frame):
    """Fill the known null-bearing columns of ``data_frame`` in place.

    Numeric columns are filled with 0, categorical columns with the string
    "NA", and 'Garage Yr Blt' is filled with "NA" and then has every value
    converted to a string. Columns that are not present in ``data_frame``
    are skipped.

    Args:
        data_frame: pandas DataFrame to clean. It is modified in place.

    Returns:
        None.
    """
    zero_fill_columns = [
        'Lot Frontage', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
        'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
        'Garage Cars', 'Garage Area',
    ]
    na_fill_columns = [
        'Alley', 'Mas Vnr Type', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
        'BsmtFin Type 1', 'BsmtFin Type 2', 'Fireplace Qu', 'Garage Type',
        'Garage Finish', 'Garage Qual', 'Garage Cond', 'Fence',
        'Misc Feature', 'Pool QC',
    ]

    # Assign the filled column back to the frame instead of calling
    # fillna(inplace=True) on data_frame[column]: the chained in-place form
    # acts on a temporary Series and does not update the frame when pandas
    # copy-on-write is active.
    for column in zero_fill_columns:
        if column in data_frame.columns:
            data_frame[column] = data_frame[column].fillna(0)

    for column in na_fill_columns:
        if column in data_frame.columns:
            data_frame[column] = data_frame[column].fillna("NA")

    if 'Garage Yr Blt' in data_frame.columns:
        # Missing values become "NA"; every other value becomes its str() form
        # (a float year such as 1976.0 becomes '1976.0').
        data_frame['Garage Yr Blt'] = data_frame['Garage Yr Blt'].map(
            lambda value: "NA" if pd.isna(value) else str(value)
        )

# Fill the null-bearing columns of the training frame in place (clean_data has no return value).
clean_data(df)

In [97]:
# One-hot encode the categorical columns of the cleaned training frame.
df_with_dummies = pd.get_dummies(df)

In [112]:
# Correlation of every encoded column with the target column.
sp_corr = df_with_dummies.corr().loc[:, 'SalePrice']

# Preview only the columns whose correlation with SalePrice exceeds 0.5.
df_with_dummies.loc[:, sp_corr.index[sp_corr > .5]].head()



Unnamed: 0,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,Total Bsmt SF,1st Flr SF,Gr Liv Area,Full Bath,TotRms AbvGrd,Garage Cars,Garage Area,SalePrice,Foundation_PConc,Bsmt Qual_Ex,Kitchen Qual_Ex
0,6,1976,2005,289.0,725.0,725,1479,2,6,2.0,475.0,130500,0,0,0
1,7,1996,1997,132.0,913.0,913,2122,2,8,2.0,559.0,220000,1,0,0
2,5,1953,2007,0.0,1057.0,1057,1057,1,5,1.0,246.0,109000,0,0,0
3,5,2006,2007,0.0,384.0,744,1444,2,7,2.0,400.0,174000,1,0,0
4,6,1900,1993,0.0,676.0,831,1445,2,6,2.0,484.0,138500,1,0,0


In [125]:
from sklearn import linear_model

def get_r_scores(data_frame, column_name, target='price'):
    """Fit a linear regression on the given columns and report its score.

    The model is fitted and scored on the same rows of ``data_frame``, so the
    returned value is an in-sample score.

    Args:
        data_frame: frame holding both the predictor columns and the target.
        column_name: list of predictor column names.
        target: name of the column to predict.

    Returns:
        The value returned by the fitted model's ``score`` method.
    """
    predictors = data_frame[column_name]
    response = data_frame[[target]]

    fitted_model = linear_model.LinearRegression().fit(predictors, response)
    score = fitted_model.score(predictors, response)

    print(f"{', '.join(column_name)} r2_score: {score}")
    return score

# Fit one single-feature regression per strongly correlated column and rank
# the columns by the score each regression achieves.
scores = {}

for column in sp_corr[sp_corr > .5].keys():
    # Skip the target by name. Filtering on `score < 1` relied on the
    # SalePrice-on-SalePrice fit scoring exactly 1.0, which floating-point
    # arithmetic does not guarantee, and would also discard any real feature
    # that happened to score 1.
    if column == 'SalePrice':
        continue
    score = get_r_scores(df_with_dummies, [column], target='SalePrice')
    scores[column] = score

# https://stackoverflow.com/a/20948781/2548452
# List of (column, score) pairs, best score first.
s = sorted(scores.items(), key=lambda item: item[1], reverse=True)
s

Overall Qual r2_score: 0.6403310352003229
Year Built r2_score: 0.3270111597575761
Year Remod/Add r2_score: 0.30290675034176306
Mas Vnr Area r2_score: 0.25359139141207065
Total Bsmt SF r2_score: 0.39602238690537106
1st Flr SF r2_score: 0.38252510667511996
Gr Liv Area r2_score: 0.48586259284005934
Full Bath r2_score: 0.2894107548273267
TotRms AbvGrd r2_score: 0.254030432823128
Garage Cars r2_score: 0.4196206721721161
Garage Area r2_score: 0.42236591520046085
SalePrice r2_score: 1.0
Foundation_PConc r2_score: 0.27989057265271455
Bsmt Qual_Ex r2_score: 0.34397900016466043
Kitchen Qual_Ex r2_score: 0.30391459442452795


[('Overall Qual', 0.6403310352003229),
 ('Gr Liv Area', 0.48586259284005934),
 ('Garage Area', 0.42236591520046085),
 ('Garage Cars', 0.4196206721721161),
 ('Total Bsmt SF', 0.39602238690537106),
 ('1st Flr SF', 0.38252510667511996),
 ('Bsmt Qual_Ex', 0.34397900016466043),
 ('Year Built', 0.3270111597575761),
 ('Kitchen Qual_Ex', 0.30391459442452795),
 ('Year Remod/Add', 0.30290675034176306),
 ('Full Bath', 0.2894107548273267),
 ('Foundation_PConc', 0.27989057265271455),
 ('TotRms AbvGrd', 0.254030432823128),
 ('Mas Vnr Area', 0.25359139141207065)]

In [143]:
# Score a single regression that uses the hand-picked feature subset together (fitted and scored on the same rows).
get_r_scores(df_with_dummies, ['Overall Qual','Gr Liv Area','Garage Area','Garage Cars','Total Bsmt SF','Foundation_PConc','Year Built'],target="SalePrice")


Overall Qual, Gr Liv Area, Garage Area, Garage Cars, Total Bsmt SF, Foundation_PConc, Year Built r2_score: 0.785654193560582


0.785654193560582

In [144]:
# Unfitted linear model; later cells cross-validate it, fit it, and use it for prediction.
lr = LinearRegression()

In [182]:
# Predictor columns used by the final model.
features = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Area',
    'Garage Cars',
    'Total Bsmt SF',
    'Foundation_PConc',
    'Year Built',
]

# Design matrix and target taken from the one-hot encoded training frame.
X = df_with_dummies[features]
y = df_with_dummies['SalePrice']

In [177]:
# Hold out part of the data for evaluation; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [178]:
# 3-fold shuffled cross-validation of the linear model on the training split.
# No `scoring` argument is passed, so the estimator's own score method is used.
# Note: `scores` is rebound here from the earlier per-feature dict to the per-fold results.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
scores = cross_val_score(lr, X_train, y_train, cv=kf)
scores.mean()

0.7394256069454556

In [181]:
# Fit on the training split, then report the model's score on the held-out split.
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.7931501013915809

In [None]:
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification

# Create the RFE object and compute a cross-validated score.
# SalePrice is a continuous target, so this uses a regression estimator,
# plain K-fold splits and R^2 scoring. A classifier with stratified folds and
# accuracy scoring would treat every distinct price as its own class.
rfecv = RFECV(estimator=LinearRegression(), step=1,
              cv=KFold(n_splits=3, shuffle=True, random_state=42),
              scoring='r2')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Newer scikit-learn releases expose the per-subset mean score through
# cv_results_ and no longer provide grid_scores_; fall back to grid_scores_
# only when cv_results_ is not available.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (R^2)")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()



In [180]:
# https://www.kaggle.com/dansbecker/submitting-from-a-kernel
# Read the test data
test = pd.read_csv('./test.csv')
clean_data(test)
test_with_dummies = pd.get_dummies(test)
# Treat the test data in the same way as training data. In this case, pull same columns.
# reindex is used instead of plain [] selection so that a dummy column whose
# category never occurs in the test file is created as all zeros rather than
# raising KeyError.
test_X = test_with_dummies.reindex(columns=features, fill_value=0)
# Use the model to make predictions
predicted_prices = lr.predict(test_X)
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
# my_submission.to_csv('second_submission.csv', index=False)