# Housing Prices

### Import Libraries

In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler, normalize

### Analysing the Data

In [2]:
# Read the training data from disk and preview the first rows
train_csv = 'Data/train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Names of all columns in the dataset, as an array
df.columns.values

array(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'Pav

In [4]:
# Dimensions of the dataset as (rows, columns)
df.shape

(1460, 81)

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,57.623288,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,34.664304,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,0.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,42.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,63.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


### Data preprocessing

#### Handle the null rows

In [5]:
# Number of missing values in each column, shown as a one-column table
missing_per_column = df.isna().sum()
missing_per_column.to_frame()

Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotFrontage,259
LotArea,0
...,...
MoSold,0
YrSold,0
SaleType,0
SaleCondition,0


In [6]:
# Replace every missing value, in every column, with 0
df = df.fillna(0)

In [7]:
# Re-count missing values per column after filling
df.isna().sum().to_frame()

Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotFrontage,0
LotArea,0
...,...
MoSold,0
YrSold,0
SaleType,0
SaleCondition,0


#### Data Splitting

In [23]:
# One-hot encode the string columns into 0/1 float indicator columns
tdf = pd.get_dummies(df, dtype=float)

# Features are every column except the target; the target is SalePrice.
# NOTE(review): the Id column is kept as a feature here — confirm that is intended.
target_column = "SalePrice"
X = tdf.drop(columns=[target_column]).to_numpy()
Y = tdf[target_column].to_numpy()

In [26]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

#### Scaling data

In [67]:
# Use one scaler per array: refitting a single scaler on the target would
# discard the statistics it had just learned from the features.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
scale_std = y_scaler  # legacy name; as before, it ends up fitted on the target

# Features: standardise each column, then rescale each row to unit L2 norm.
x_train_scaled = normalize(x_scaler.fit_transform(x_train))

# Target: standardise only. Row-wise normalize() on a single-column array
# divides each value by its own magnitude, which collapses every target to
# +1 or -1 and throws away the price information the model should learn.
y_train_scaled = y_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1))


In [68]:
# Summary statistics of the scaled training features, one column per feature
scaled_feature_frame = pd.DataFrame(x_train_scaled)
scaled_feature_frame.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,294,295,296,297,298,299,300,301,302,303
count,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,...,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0
mean,-1e-05,-0.005332,0.00071,-0.003163,0.003514,-0.001374,0.009327,0.005932,0.001495,-0.00014,...,-0.001694,0.000188,-0.001373,0.002339,-0.003003,-0.002236,-0.002362,-0.001173,0.003067,0.000135
std,0.074537,0.066529,0.068942,0.040589,0.067649,0.064614,0.068414,0.070907,0.071026,0.066858,...,0.039977,0.073552,0.027268,0.068463,0.064175,0.036042,0.03839,0.063388,0.068773,0.073392
min,-0.204781,-0.110479,-0.1796,-0.063293,-0.208092,-0.189334,-0.174829,-0.178668,-0.077371,-0.119866,...,-0.006899,-0.038047,-0.003979,-0.270215,-0.033944,-0.00797,-0.008915,-0.016536,-0.233008,-0.038293
25%,-0.056422,-0.060285,-0.031656,-0.020071,-0.050488,-0.042897,-0.036973,-0.055937,-0.042882,-0.055871,...,-0.004651,-0.025158,-0.002683,0.018093,-0.022709,-0.005373,-0.00601,-0.011148,0.0194,-0.02532
50%,0.000448,-0.007995,0.008397,-0.006646,-0.00629,-0.030724,0.003117,0.024541,-0.027805,-0.008239,...,-0.003869,-0.020599,-0.002231,0.025938,-0.018791,-0.004469,-0.004999,-0.009254,0.03061,-0.020732
75%,0.055287,0.016278,0.043578,0.005511,0.053131,0.03195,0.074869,0.067987,0.027229,0.04753,...,-0.003054,-0.015092,-0.001762,0.031952,-0.014167,-0.003528,-0.003947,-0.007197,0.038327,-0.015174
max,0.199367,0.237668,0.271934,0.559733,0.201829,0.296885,0.140909,0.133214,0.433238,0.233559,...,0.744698,0.343195,0.869224,0.048323,0.402165,0.700641,0.629489,0.614412,0.058586,0.340988


### Training the model

In [None]:
# SGD shuffles the training data between epochs, so seed the estimator to
# make the search (and the reported best score) repeatable across runs.
reg = SGDRegressor(random_state=42)

# Hyper-parameter combinations to evaluate with cross-validation
p_grids = {
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'max_iter': [1000, 5000, 10000],
}
model = GridSearchCV(estimator=reg, param_grid=p_grids)

# ravel() flattens the target to the 1-D shape the regressor expects
model.fit(x_train_scaled, y_train_scaled.ravel())



In [None]:
# The model was fitted on standardised, row-normalised features, so the test
# rows need the same treatment before prediction. The scaler is fitted on
# x_train only, so no test-set statistics leak into the transform.
x_test_scaled = normalize(StandardScaler().fit(x_train).transform(x_test))

# Predictions are in the units of the scaled training target, not raw prices.
model.predict(x_test_scaled)

In [None]:
# Mean cross-validated score of the best parameter combination found by the
# grid search (no `scoring` was passed, so the estimator's default score is used)
model.best_score_