In [186]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [189]:
df = pd.read_csv("AMES_Final_DF.csv")

In [190]:
df.head()

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,...,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,0,0,0,0,1,0,0,0,1,0
1,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,1,0,0,0,1,0
2,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,1,0,0,0,1,0
3,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,1,0,0,0,1,0
4,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,1,0,0,0,1,0


In [191]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2925 entries, 0 to 2924
Columns: 274 entries, Lot Frontage to Sale Condition_Partial
dtypes: float64(11), int64(263)
memory usage: 6.1 MB


## Missing Data

In [192]:
df.isnull().sum()

Lot Frontage              0
Lot Area                  0
Overall Qual              0
Overall Cond              0
Year Built                0
                         ..
Sale Condition_AdjLand    0
Sale Condition_Alloca     0
Sale Condition_Family     0
Sale Condition_Normal     0
Sale Condition_Partial    0
Length: 274, dtype: int64

## Converting Columns Names

In [193]:
df.columns = [label.lower().replace(' ','_') for label in df.columns]

In [194]:
df.columns

Index(['lot_frontage', 'lot_area', 'overall_qual', 'overall_cond',
       'year_built', 'year_remod/add', 'mas_vnr_area', 'bsmtfin_sf_1',
       'bsmtfin_sf_2', 'bsmt_unf_sf',
       ...
       'sale_type_conlw', 'sale_type_new', 'sale_type_oth', 'sale_type_vwd',
       'sale_type_wd_', 'sale_condition_adjland', 'sale_condition_alloca',
       'sale_condition_family', 'sale_condition_normal',
       'sale_condition_partial'],
      dtype='object', length=274)

In [195]:
df.head()

Unnamed: 0,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,...,sale_type_conlw,sale_type_new,sale_type_oth,sale_type_vwd,sale_type_wd_,sale_condition_adjland,sale_condition_alloca,sale_condition_family,sale_condition_normal,sale_condition_partial
0,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,...,0,0,0,0,1,0,0,0,1,0
1,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,...,0,0,0,0,1,0,0,0,1,0
2,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,...,0,0,0,0,1,0,0,0,1,0
3,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,...,0,0,0,0,1,0,0,0,1,0
4,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,...,0,0,0,0,1,0,0,0,1,0


## Training Data

In [196]:
X = df.drop('saleprice',axis=1)
y = df['saleprice']

## Splitting Data

In [197]:
from sklearn.model_selection import train_test_split

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=101)

## Scalling Data

In [199]:
from sklearn.preprocessing import  StandardScaler


In [200]:
scaler = StandardScaler()

In [201]:
scaled_X_train = scaler.fit_transform(X_train)


In [202]:
scaled_X_test = scaler.transform(X_test)

## Elastic Net model

In [203]:
from sklearn.linear_model import ElasticNet

In [204]:
base_model = ElasticNet()

In [229]:
param_grid = {'alpha':[200,300,400,500,800,900],'l1_ratio':[.1, .5, .7, .9, .95, .99, 1],}

## Grid Search For model

In [230]:
from sklearn.model_selection import GridSearchCV

In [231]:
grid_model = GridSearchCV(estimator=base_model,
                          param_grid=param_grid,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          verbose=1)

In [232]:
grid_model.fit(scaled_X_train,y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [200, 300, 400, 500, 800, 900],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]},
             scoring='neg_mean_squared_error', verbose=1)

In [233]:
grid_model.best_params_

{'alpha': 200, 'l1_ratio': 1}

## Prediction Model

In [225]:
y_pred = grid_model.predict(scaled_X_test)

## Evaluation Model

In [226]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [227]:
mean_absolute_error(y_test,y_pred)

14246.271894286561

In [228]:
np.sqrt(mean_squared_error(y_test,y_pred))

20621.7074578704

In [234]:
np.mean(df['saleprice'])

180815.53743589742