In [None]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings; consider a narrower filter.
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
!pip install matplotlib
import matplotlib.pyplot as plt
!pip install seaborn
import seaborn as sns
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

# **Step 1: Import Dataset and Inspect Dataset**

In [None]:
# Load the training data (path is relative to the notebook's working directory)
# and preview the first rows.
housing=pd.read_csv("train.csv")
housing.head()

In [None]:
# (rows, columns) of the loaded frame.
housing.shape

In [None]:
# Summary statistics of the columns.
housing.describe()

In [None]:
# Column dtypes and non-null counts.
housing.info()

In [None]:
# Percentage of missing values in each column.
housing.isnull().sum()/housing.shape[0]*100

# **Step 2: Data Cleaning**

In [None]:
# Categorical columns whose missing values are replaced by the literal label "None"
# (presumably because a missing entry means the house lacks that feature - verify
# against the data description).
cols=['Alley','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature']
# Assign the filled columns back to the frame. Calling fillna(inplace=True) on
# housing[col] operates on an intermediate object and is not guaranteed to update
# `housing` itself (it does not under pandas copy-on-write).
housing[cols]=housing[cols].fillna("None")

In [None]:
# Re-check dtypes and non-null counts after the "None" fill.
housing.info()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Distribution of the target. sns.distplot is deprecated; histplot with a KDE
# overlay and density normalisation is its documented replacement.
plt.figure(figsize=(6,6))
sns.histplot(housing['SalePrice'],kde=True,stat='density')
plt.title("Distribution of SalePrice")
plt.show()

In [None]:
# Shape statistics of the target: asymmetry (skew) and tail weight (kurtosis).
sale_price=housing['SalePrice']
print(f"Skewness: {sale_price.skew():f}")
print(f"Kurtosis: {sale_price.kurt():f}")

We can observe that the target variable's skewness is > 1 (right-skewed), with the highest density around a sale price of 160k.

In [None]:
# Replace SalePrice with its natural logarithm.
# NOTE: the column is overwritten, so re-running this cell applies the log a second
# time; every later use of SalePrice (plots, model target) is in log units.
housing['SalePrice']=np.log(housing['SalePrice'])

In [None]:
# Distribution of the log-transformed target. sns.distplot is deprecated; histplot
# with a KDE overlay and density normalisation is its documented replacement.
plt.figure(figsize=(6,6))
sns.histplot(housing['SalePrice'],kde=True,stat='density')
plt.title("Distribution of log(SalePrice)")
plt.show()

In [None]:
# Skewness and kurtosis of the target column as it stands at this point.
sale_price=housing['SalePrice']
print(f"Skewness: {sale_price.skew():f}")
print(f"Kurtosis: {sale_price.kurt():f}")

After the log transform the distribution looks approximately normal, and both skewness and kurtosis are reduced.

- convert 'MSSubClass','OverallQual','OverallCond' to object datatype
- convert 'LotFrontage','MasVnrArea' to Numeric datatype

In [None]:
# Remove the Id column before modelling.
# errors="ignore" keeps the cell re-runnable: a second execution no longer raises
# KeyError because the column is already gone.
housing=housing.drop(columns="Id",errors="ignore")

In [None]:
# Cast these three columns to object dtype so later dtype-based selection
# (select_dtypes / get_dummies) handles them as categorical.
housing[['MSSubClass','OverallQual','OverallCond' ]] = housing[['MSSubClass','OverallQual','OverallCond' ]].astype('object')

In [None]:
# Force both columns to numeric; values that cannot be parsed become NaN
# (errors='coerce') and are imputed further down.
housing['LotFrontage']=pd.to_numeric(housing['LotFrontage'],errors='coerce')
housing['MasVnrArea']=pd.to_numeric(housing['MasVnrArea'],errors='coerce')

In [None]:
# Confirm the dtype changes made above.
housing.info()

In [None]:
# Names of the columns that still contain at least one missing value.
null_cols=housing.columns[housing.isnull().any()]
null_cols

In [None]:
# Impute the remaining gaps: column mean for numeric columns, most frequent value
# (mode) for everything else.
for col in null_cols:
    if pd.api.types.is_numeric_dtype(housing[col]):
        fill_value=housing[col].mean()
    else:
        fill_value=housing[col].mode()[0]
    # Assign back instead of fillna(inplace=True) on housing[col]: the in-place call
    # acts on an intermediate object and does not update `housing` under pandas
    # copy-on-write.
    housing[col]=housing[col].fillna(fill_value)

In [None]:
# Missing-value count per column after imputation.
housing.isna().sum()

# **Step 3: Exploratory Data Analysis on the Dataset**

In [None]:
# Names of the object-dtype (categorical) columns.
cat_cols=housing.select_dtypes(include='object').columns
cat_cols

In [None]:
# Names of the int64/float64 (numeric) columns; this includes the SalePrice target.
num_cols=housing.select_dtypes(include=['int64','float64']).columns
num_cols

**Univariate Analysis**

In [None]:
# One box plot per numeric column to expose outliers.
for col in num_cols:
    plt.figure(figsize=[8,5])
    # Pass the values as x= explicitly: a bare positional argument is interpreted
    # as `x` by older seaborn releases and as `data` by newer ones, which changes
    # the orientation of the plot.
    sns.boxplot(x=housing[col])
    plt.title(col)
    plt.show()

We can see outliers in LotFrontage, LotArea, YearBuilt, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, etc.

In [None]:
# For every categorical column: print the relative frequency of each level, then
# draw the same shares as a pie chart with a legend.
for col in cat_cols:
  shares=housing[col].value_counts(normalize=True)
  print(shares)
  plt.figure(figsize=[5,5])
  shares.plot.pie(labeldistance=None,autopct ='%1.2f%%')
  plt.legend()
  plt.show()
  print("------------------------------------------------------")


**Bivariate/Multivariate Analysis on Dataset**

In [None]:
# LotFrontage aggregated per zoning class.
sns.barplot(data=housing,x='MSZoning',y='LotFrontage')
plt.show()

In [None]:
# LotFrontage aggregated per MSSubClass code.
sns.barplot(data=housing,x='MSSubClass',y='LotFrontage')
plt.show()

In [None]:
# SalePrice (log-transformed earlier) per house style, split by street type.
sns.barplot(data=housing,x='HouseStyle',y='SalePrice',hue='Street')
plt.show()

In [None]:
# SalePrice (log-transformed earlier) per building type.
sns.barplot(x='BldgType',y='SalePrice',data=housing)
# Render explicitly, like the neighbouring plot cells, so the cell does not echo
# the Axes repr.
plt.show()

In [None]:
# SalePrice (log-transformed earlier) per basement-quality level.
sns.barplot(data=housing,x='BsmtQual',y='SalePrice')
plt.show()

In [None]:
# Derived feature: Age = year sold minus year built.
housing["Age"]=housing["YrSold"]-housing["YearBuilt"]
housing["Age"].head()

In [None]:
# Drop the two source columns now that Age has been derived from them.
# NOTE: in-place drop - re-running this cell (or the Age cell above) raises KeyError.
housing.drop(columns=["YearBuilt","YrSold",],inplace=True)

In [None]:
# Preview the frame after adding Age and dropping the year columns.
housing.head()

**Correlation between Numerical Columns**

In [None]:
# Pairwise correlation of all numeric columns, annotated with the coefficients.
plt.figure(figsize=[25,25])
sns.heatmap(housing.corr(numeric_only=True),annot=True,cmap='BuPu')
plt.title("Correlation of Numerical Columns")
# Render explicitly so the cell output is the figure rather than the Text repr
# returned by plt.title.
plt.show()

**Get top 10 correlated columns**

In [None]:
# Heatmap restricted to the k numeric columns most positively correlated with
# SalePrice (SalePrice itself is part of that set).
k=10
top_corr=housing.corr(numeric_only=True).nlargest(k,'SalePrice')
cols=top_corr.index
# Correlation matrix of just those columns (variables as rows after the transpose).
cm=np.corrcoef(housing[cols].values.T)

plt.figure(figsize=[15,15])
sns.heatmap(
    cm,
    annot=True,
    square=True,
    fmt='.2f',
    cbar=True,
    annot_kws={'size':10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

**PairPlot for numeric Columns**

In [None]:
# Pairwise scatter plots / histograms for the target and a few strongly related columns.
cols=["SalePrice","OverallQual","GrLivArea","GarageCars","GarageArea","TotalBsmtSF","1stFlrSF"]
# pairplot builds its own figure, so no plt.figure() call precedes it (one would
# only produce an extra empty figure). Use the `height=` argument to resize the panels.
sns.pairplot(housing[cols])
plt.show()

# **Step 4: Data Preparation**

**Dummy Encoding**

In [None]:
# Split the frame by dtype: int64/float64 columns (including SalePrice) and
# object columns, which are dummy-encoded next.
housing_num=housing.select_dtypes(include=['int64','float64'])
housing_cat=housing.select_dtypes(include='object')

In [None]:
# Display the categorical subset.
# NOTE(review): display.max_rows is None, so this renders every row; consider .head().
housing_cat

In [None]:
# One-hot encode the categorical columns as integer 0/1 indicators; drop_first=True
# omits the first level of each column.
housing_cat_dm=pd.get_dummies(housing_cat,drop_first=True,dtype=int)

In [None]:
# Display the encoded indicators.
# NOTE(review): display.max_rows is None, so this renders every row; consider .head().
housing_cat_dm

In [None]:
# Combine the numeric columns and the dummy indicators column-wise into the modelling frame.
house=pd.concat([housing_num,housing_cat_dm],axis=1)
house.head()

In [None]:
# (rows, columns) of the modelling frame after encoding.
house.shape

In [None]:
# Separate the predictors from the target (SalePrice, log-transformed earlier).
# Copies are taken so later edits to X / y do not touch `house`.
y=house['SalePrice'].copy()
X=house.drop(columns=['SalePrice']).copy()

In [None]:
# Preview the predictor matrix.
X.head()

In [None]:
# Preview the target vector.
y.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
# Size of the training predictors.
X_train.shape

In [None]:
# Size of the training target.
y_train.shape

**Scaling the dataset with StandardScaler**

In [None]:
# Standardise the int64/float64 predictor columns. The scaler is fitted on the
# training split only and the same transform is then applied to the test split.
# NOTE(review): the dummy columns were created with dtype=int, so where that maps to
# int64 they are selected and scaled here as well - confirm this is intended.
num_cols=list(X_train.select_dtypes(include=['int64','float64']).columns)
scalar=StandardScaler()
X_train[num_cols]=scalar.fit_transform(X_train[num_cols])
X_test[num_cols]=scalar.transform(X_test[num_cols])

**Building a function to calculate evaluation metrics**

In [None]:
def eval_metrics(y_train,y_train_pred,y_test,y_test_pred):
  """Print R^2 and RMSE for the train and test splits.

  Parameters
  ----------
  y_train, y_test : true target values of the train / test split.
  y_train_pred, y_test_pred : model predictions for the same rows.

  Returns
  -------
  None; the four metrics are written to stdout.

  Relies on `r2_score` and `mean_squared_error` being in scope when called.
  """
  # Test metrics are computed from the y_test_pred argument. Reading a notebook-level
  # `y_pred` here would report another model's test scores.
  # '%2f' is a width-2 float format with default precision, i.e. six decimals.
  print("r2 score (train)=",'%2f' %r2_score(y_train,y_train_pred))
  print("r2 score (test)=",'%2f' %r2_score(y_test,y_test_pred))

  mse_train=mean_squared_error(y_train,y_train_pred)
  mse_test=mean_squared_error(y_test,y_test_pred)
  # RMSE is the square root of the mean squared error.
  rmse_train=mse_train**0.5
  rmse_test=mse_test**0.5

  print("RMSE (train)=",'%.2f' % rmse_train)
  print("RMSE (test)=",'%.2f' % rmse_test)

# **Step 5: Build ML Model**

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

In [None]:
# Candidate regularisation strengths; `params` is reused by the Lasso search below.
alpha_grid=[
    0.0001,0.001,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,
    2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10,20,50,100,500,1000,
]
params={'alpha':alpha_grid}

# 5-fold grid search over alpha for Ridge, scored by negative mean absolute error,
# using all available cores and keeping the training scores as well.
ridge=Ridge()
ridgeCV=GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
ridgeCV.fit(X_train,y_train)

In [None]:
# Alpha selected by the Ridge grid search.
ridgeCV.best_params_

In [None]:
# Raw cross-validation results (a dictionary of arrays, one entry per alpha).
ridgeCV.cv_results_

In [None]:
# Refit Ridge on the full training split with the alpha chosen by the grid search.
# Taking it from best_params_ (instead of a hand-copied constant) keeps the model in
# step with the search if the data or the alpha grid changes.
ridge=Ridge(alpha=ridgeCV.best_params_['alpha'])
ridge.fit(X_train,y_train)

In [None]:
# Fitted Ridge coefficients, one per column of X_train.
ridge.coef_

In [None]:
# Ridge predictions for both splits (in the same log units as the target).
y_train_pred=ridge.predict(X_train)
y_pred=ridge.predict(X_test)

In [None]:
# Report train/test R^2 and RMSE for the Ridge model.
eval_metrics(y_train,y_train_pred,y_test,y_pred)

In [None]:
# Tabulate the Ridge cross-validation results and preview the first rows.
ridgeCV_res=pd.DataFrame(ridgeCV.cv_results_)
# .head must be called; without the parentheses the cell only shows the
# bound-method repr instead of the table.
ridgeCV_res.head()

In [None]:
# Mean cross-validated score against alpha (log-scaled x axis) for Ridge.
plt.plot(ridgeCV_res['param_alpha'],ridgeCV_res['mean_test_score'],label='test')
plt.plot(ridgeCV_res['param_alpha'],ridgeCV_res['mean_train_score'],label='train')
plt.xlabel('alpha')
# The search was scored with neg_mean_absolute_error, so the curve is negative MAE
# (closer to zero is better), not R^2.
plt.ylabel('Negative MAE')
plt.title('Ridge: cross-validated score vs alpha')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# 5-fold grid search over the same alpha grid (`params`) for Lasso, scored by
# negative mean absolute error, using all available cores.
lasso=Lasso()
lassoCV=GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
lassoCV.fit(X_train,y_train)

In [None]:
# Alpha selected by the Lasso grid search.
lassoCV.best_params_

In [None]:
# Refit Lasso on the full training split with the alpha chosen by the grid search.
# Taking it from best_params_ (instead of a hand-copied constant) guarantees the
# model matches the search result; check the alpha quoted in the Conclusion against it.
lasso=Lasso(alpha=lassoCV.best_params_['alpha'])
lasso.fit(X_train,y_train)

In [None]:
# Fitted Lasso coefficients, one per column of X_train.
lasso.coef_

In [None]:
# Lasso predictions for both splits (in the same log units as the target).
y_train_pred1=lasso.predict(X_train)
y_pred1=lasso.predict(X_test)

In [None]:
# Report train/test R^2 and RMSE for the Lasso model.
eval_metrics(y_train,y_train_pred1,y_test,y_pred1)

In [None]:
# Tabulate the Lasso cross-validation results and preview the first rows.
lassoCV_res=pd.DataFrame(lassoCV.cv_results_)
lassoCV_res.head()

In [None]:
# Mean cross-validated score against alpha (log-scaled x axis) for Lasso.
plt.plot(lassoCV_res['param_alpha'],lassoCV_res['mean_test_score'],label='test')
plt.plot(lassoCV_res['param_alpha'],lassoCV_res['mean_train_score'],label='train')
plt.xlabel('alpha')
# The search was scored with neg_mean_absolute_error, so the curve is negative MAE
# (closer to zero is better), not R^2.
plt.ylabel('Negative MAE')
plt.title('Lasso: cross-validated score vs alpha')
plt.xscale('log')
plt.legend()
plt.show()

**Feature Extraction / Elimination**


In [None]:
# Coefficient table: one row per predictor, one column per model.
# Built in a single step. Assigning to `betas.rows` would only set a stray attribute
# on the DataFrame object; it has no effect on the index or the data.
betas=pd.DataFrame({'Ridge':ridge.coef_,'Lasso':lasso.coef_},index=X.columns)
betas

In [None]:
# Predictors whose Lasso coefficient is exactly zero, i.e. dropped by the model.
is_zero=betas['Lasso']==0
lasso_cols_removed=betas.index[is_zero].tolist()
print(lasso_cols_removed)

In [None]:
# Predictors that keep a non-zero Lasso coefficient.
is_nonzero=betas['Lasso']!=0
lasso_cols_selected=betas.index[is_nonzero].tolist()
print(lasso_cols_selected)

In [None]:
# Number of predictors removed by Lasso, then number retained.
print(len(lasso_cols_removed))
print(len(lasso_cols_selected))

**Top 10 features significant in predicting the value of a house, according to both the Ridge model and the Lasso model**

In [None]:
# Ten largest (most positive) Ridge coefficients. Sorting is by signed value, so
# large negative coefficients do not appear in this list.
betas['Ridge'].sort_values(ascending=False).head(10)

In [None]:
# Exponentiate the Lasso coefficients (the target is log SalePrice) and show the
# ten largest resulting values.
lasso_coeffs=np.exp(betas['Lasso'])
lasso_ranked=lasso_coeffs.sort_values(ascending=False)
lasso_ranked.iloc[:10]

# **Conclusion**
**Below are the top 10 features significant in predicting the value of a house, according to the Ridge model**


OverallQual_9	0.105410

Neighborhood_StoneBr	0.088724

OverallQual_8	0.084013

Neighborhood_Crawfor	0.082484

Exterior1st_BrkFace	0.080016

Neighborhood_NridgHt	0.069636

BsmtQual_Ex	0.066404

Functional_Typ	0.065846

OverallCond_9	0.064157

GrLivArea	0.060228


**Below are the top 10 features significant in predicting the value of a house, according to the Lasso model (values are the exponentiated coefficients, i.e. multiplicative effects on price)**

GarageCond_Po	1.179230

OverallQual_10	1.154648

OverallQual_9	1.146612

SaleType_Oth	1.137107

Neighborhood_StoneBr	1.135340

Neighborhood_Crawfor	1.116512

Exterior1st_BrkFace	1.113052

GarageQual_Ex	1.109625

SaleCondition_Alloca	1.103749

OverallCond_9	1.099452



**Few Inferences are :**


- The price of the house increases by a factor of about 1.11 with an increase in GrLivArea.

- The price of the house can increase by about 1.08 times if the finish of the house is Very Good.

- If the house has central air conditioning, the price can increase by about 1.08 times.

- If the basement condition is Typical, the house price may increase by up to 1.06 times.

- The price of the house may increase if it is located in the Crawford, Stone Brook or Northridge Heights neighbourhoods, which are physical locations within the Ames city limits.


**Optimal value of lambda for Ridge Regression = 9**

**Optimal value of lambda for Lasso = 0.001**