In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [3]:
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')

In [4]:
train.shape, test.shape

((2051, 81), (878, 80))

In [5]:
train.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [6]:
len(train['Neighborhood'].unique())

28

In [7]:
len(test['Neighborhood'].unique())

26

In [8]:
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [9]:
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [11]:
train['Lot Frontage'].isnull().sum()

330

In [12]:
test['Lot Frontage'].isnull().sum()

160

In [13]:
train['Lot Frontage'] = train['Lot Frontage'].fillna(train['Lot Frontage'].mean())

In [14]:
train['Lot Frontage'].isnull().sum()

0

In [15]:
#  filling testing data with train mean, 
# cause we want the test data to allign with the  training set as much as possible
test['Lot Frontage'] = test['Lot Frontage'].fillna(train['Lot Frontage'].mean())
test['Lot Frontage'].isnull().sum()

0

Fit model on training data:

In [19]:
X = train[['Overall Qual', 'Lot Frontage']]
y = train['SalePrice']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 11)

In [23]:
demo = LinearRegression()

In [26]:
demo.fit(X_train, y_train)

LinearRegression()

In [27]:
demo.score(X_train, y_train), demo.score(X_test, y_test)

(0.668783195462278, 0.6811008350432646)

In [28]:
kaggle_preds = demo.predict(test[['Overall Qual', 'Lot Frontage']])

In [29]:
kaggle_preds[:5]

array([176595.79225559, 135282.32389481, 211066.24274027, 129620.74019438,
       176630.30524658])

In [30]:
kaggle_output = {'Id': test['Id'], 'SalePrice': kaggle_preds}

In [31]:
kaggle_output

{'Id': 0      2658
 1      2718
 2      2414
 3      1989
 4       625
        ... 
 873    1662
 874    1234
 875    1373
 876    1672
 877    1939
 Name: Id, Length: 878, dtype: int64,
 'SalePrice': array([ 1.76595792e+05,  1.35282324e+05,  2.11066243e+05,  1.29620740e+05,
         1.76630305e+05,  6.38887858e+04,  8.32709182e+04,  1.35282324e+05,
         1.99186871e+05,  1.80347173e+05,  1.64716421e+05,  1.70968722e+05,
         1.80347173e+05,  2.17978287e+05,  1.76595792e+05,  1.29620740e+05,
         1.70968722e+05,  1.34622581e+05,  1.83473323e+05,  2.91803719e+05,
         1.35282324e+05,  1.35282324e+05,  1.64716421e+05,  1.76630305e+05,
         2.24821304e+05,  1.29620740e+05,  9.39343425e+04,  1.23368439e+05,
         1.83473323e+05,  4.12977068e+04,  9.39343425e+04,  8.32709182e+04,
         2.15442853e+05,  1.42125342e+05,  2.21695154e+05,  2.11066243e+05,
         8.82727588e+04,  9.39343425e+04,  1.38373961e+05,  2.20444694e+05,
         1.58464120e+05,  2.17978287e+05

In [32]:
dataframe = pd.DataFrame(kaggle_output)

In [33]:
dataframe.head()

Unnamed: 0,Id,SalePrice
0,2658,176595.792256
1,2718,135282.323895
2,2414,211066.24274
3,1989,129620.740194
4,625,176630.305247


In [34]:
dataframe.to_csv('demo_submission.csv', index = False)

In [None]:
#  download the csv file on the computer and upload in the kaggle comp