In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas_profiling
import warnings

# Practice Skills
* Creative feature engineering 
* Advanced regression techniques like random forest and gradient boosting

In [None]:
df_train=pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
pd.options.display.max_columns = None

# Exploratory Data Analysis(EDA)

In [None]:
df_train

In [None]:
df_test

There are 1460 instances of training data and 1459 of test data. The training set has 81 attributes, of which 36 are quantitative and 43 are categorical, plus Id and SalePrice (the test set has no SalePrice column).

* Quantitative: 1stFlrSF, 2ndFlrSF, 3SsnPorch, BedroomAbvGr, BsmtFinSF1, BsmtFinSF2, BsmtFullBath, BsmtHalfBath, BsmtUnfSF, EnclosedPorch, Fireplaces, FullBath, GarageArea, GarageCars, GarageYrBlt, GrLivArea, HalfBath, KitchenAbvGr, LotArea, LotFrontage, LowQualFinSF, MSSubClass, MasVnrArea, MiscVal, MoSold, OpenPorchSF, OverallCond, OverallQual, PoolArea, ScreenPorch, TotRmsAbvGrd, TotalBsmtSF, WoodDeckSF, YearBuilt, YearRemodAdd, YrSold

* Qualitative: Alley, BldgType, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, BsmtQual, CentralAir, Condition1, Condition2, Electrical, ExterCond, ExterQual, Exterior1st, Exterior2nd, Fence, FireplaceQu, Foundation, Functional, GarageCond, GarageFinish, GarageQual, GarageType, Heating, HeatingQC, HouseStyle, KitchenQual, LandContour, LandSlope, LotConfig, LotShape, MSZoning, MasVnrType, MiscFeature, Neighborhood, PavedDrive, PoolQC, RoofMatl, RoofStyle, SaleCondition, SaleType, Street, Utilities,

In [None]:
df_train.columns

In [None]:
df_test.columns

In [None]:
# Drop the 'Id' column from the training frame: it is a row identifier, not a predictor.
# Reassigning with errors="ignore" keeps the cell safe to re-run (an in-place drop
# raises KeyError the second time because the column is already gone).
df_train = df_train.drop(columns="Id", errors="ignore")


In [None]:
#Correlation map to see how features are correlated with SalePrice
# Correlate the non-object columns only: DataFrame.corr() raises on string
# columns in pandas >= 2.0 instead of silently skipping them.
corrmat = df_train.select_dtypes(exclude=["object"]).corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax);

In [None]:
print("\nThe train data size after dropping Id feature is : {} ".format(df_train.shape)) 
# Id is dropped from the training frame only; df_test still carries it
# (it is needed later for the submission file), so the message must not claim otherwise.
print("The test data size (Id column kept) is : {} ".format(df_test.shape))


In [None]:
 #most correlated features
# Correlate the non-object columns only: DataFrame.corr() raises on string
# columns in pandas >= 2.0.
corrmat = df_train.select_dtypes(exclude=["object"]).corr()
# Keep the features whose absolute correlation with SalePrice exceeds 0.5
# (SalePrice itself is included, with correlation 1).
top_corr_features = corrmat.index[corrmat["SalePrice"].abs() > 0.5]
plt.figure(figsize=(10,10))
g = sns.heatmap(df_train[top_corr_features].corr(),annot=True,cmap="RdYlGn")

Most of these features are also correlated with each other, for example GarageCars and GarageArea.

OverallQual is highly correlated with the target feature SalePrice (0.79), as the heatmap shows.

In [None]:
# Mean SalePrice per OverallQual level. x/y are passed by keyword because
# current seaborn releases no longer accept them positionally.
sns.barplot(x="OverallQual", y="SalePrice", data=df_train);

Here we can see how OverallQual is correlated with SalePrice.

In [None]:
# Mean SalePrice per distinct GrLivArea value. x/y are passed by keyword because
# current seaborn releases no longer accept them positionally.
sns.barplot(x="GrLivArea", y="SalePrice", data=df_train);

In [None]:
# Pairwise scatter plots of SalePrice and a hand-picked set of strongly correlated features.
sns.set()
cols = [ 'SalePrice','OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']

# `size` was renamed to `height` in seaborn 0.9 and is no longer accepted.
sns.pairplot(df_train[cols], height = 2.5)
plt.show()

In [None]:
from scipy import stats
from scipy.stats import norm, skew #for some statistics

# Histogram of SalePrice with a fitted normal curve overlaid.
sns.distplot(df_train['SalePrice'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df_train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: "\m" and "\s" are invalid escape sequences in a normal string literal
# (a warning today, an error in future Python versions). The rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
print("skewness of salesprice is :", df_train["SalePrice"].skew())
# Q-Q plot of SalePrice against the normal distribution, on a separate figure.
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()

In [None]:
# Replace SalePrice by log(1 + SalePrice) and keep the transformed column as the target `y`.
# Note: re-running this cell applies the transform a second time.
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])
y = df_train["SalePrice"]
y

In [None]:
# Re-check the target after the log1p transform: skewness, histogram with fitted normal, Q-Q plot.
print("skewness of salesprice is :", df_train["SalePrice"].skew())
# SalePrice was already log-transformed in the previous cell, so no transform is applied here.
sns.distplot(df_train['SalePrice'],fit=norm)
(mu, sig) = norm.fit(df_train['SalePrice'])
plt.ylabel('Frequency')
plt.legend(["Skewness: {:.2f}".format(df_train['SalePrice'].skew())])
plt.show()
# Q-Q plot against the normal distribution, drawn on a new figure.
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
plt.show()

# Data Preprocessing and Data Cleaning

In [None]:
# Scatter of SalePrice (log-transformed in an earlier cell) against GrLivArea.
plt.scatter(y =df_train.SalePrice,x = df_train.GrLivArea,c = 'black')
plt.show()
# The outliers are visible in the plot rendered below this cell.

In [None]:
# Missing-value count per training column, restricted to columns that have any; shown largest first.
train_nas = df_train.isnull().sum().loc[lambda counts: counts > 0]
train_nas.sort_values(ascending=False)

In [None]:
# Missing-value count per test column, restricted to columns that have any; shown largest first.
test_nas = df_test.isnull().sum().loc[lambda counts: counts > 0]
test_nas.sort_values(ascending=False)

In [None]:
print("Find most important features relative to target")
# Correlate the non-object columns only: DataFrame.corr() raises on string
# columns in pandas >= 2.0.
corr = df_train.select_dtypes(exclude=["object"]).corr()
# Rank rows by their correlation with SalePrice, strongest positive first.
corr = corr.sort_values("SalePrice", ascending=False)
print(corr.SalePrice)
#this you can see at the time of heatmap also

1. HERE SalePrice is mostly related to 
   OverallQual      
   GrLivArea        
   GarageCars       
   GarageArea       
   TotalBsmtSF      
   1stFlrSF         
   FullBath        
   YearBuilt       
   YearRemodAdd     

In [None]:
# Differentiate numerical features (minus the target) and categorical features
categorical_features = df_train.select_dtypes(include=['object']).columns
categorical_features

In [None]:
#numerical data
numerical_features = df_train.select_dtypes(exclude = ["object"]).columns
numerical_features

In [None]:
#numerical_features = numerical_features.drop("SalePrice")
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
#FOR NUMERIC
train_num = df_train[numerical_features]
#FOR CATEGORICAL
train_cat = df_train[categorical_features]

In [None]:
# Bar chart of missing-value counts for the training columns that have any, ascending.
sns.set_style("whitegrid")
missing = df_train.isnull().sum()
missing = missing[missing > 0].sort_values()
missing.plot.bar()

In [None]:
# Percentage of missing values per training column: zero-percent columns removed,
# sorted largest first, at most 30 columns kept.
all_data_na = df_train.isnull().sum().div(len(df_train)).mul(100)
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False).iloc[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(20)


In [None]:
# Handle remaining missing values for numerical features by using median as replacement
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))

In [None]:
# Handle remaining missing values for categorical features by using the most frequent value as replacement
print("NAs for catergorical features in train : " + str(train_cat.isnull().values.sum()))
# The original line filled train_num a second time (copy-paste slip), leaving train_cat untouched.
# Strings have no median, so each categorical column is filled with its mode instead.
train_cat = train_cat.fillna(train_cat.mode().iloc[0])
print("Remaining NAs for catergorical features in train : " + str(train_cat.isnull().values.sum()))

* LotFrontage : Since the area of each street connected to the house property most likely have a similar area to other houses in its neighborhood , we can fill in missing values by the median LotFrontage of the neighborhood.
* GarageType, GarageFinish, GarageQual and GarageCond : Replacing missing data with None
* GarageYrBlt, GarageArea and GarageCars : Replacing missing data with 0 (Since No garage = no cars in such garage.)
* BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath : missing values are likely zero for having no basement
* BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2 : For all these categorical basement-related features, NaN means that there is no basement.
* MasVnrArea and MasVnrType : NA most likely means no masonry veneer for these houses. We can fill 0 for the area and None for the type.
* MSZoning (The general zoning classification) : 'RL' is by far the most common value. So we can fill in missing values with 'RL'
* Utilities : For this categorical feature all records are "AllPub", except for one "NoSeWa" and 2 NA . Since the house with 'NoSewa' is in the training set, this feature won't help in predictive modelling. We can then safely remove it.
* Functional : data description says NA means typical
* Electrical : It has one NA value. Since this feature has mostly 'SBrkr', we can set that for the missing value.
* KitchenQual: Only one NA value, and same as Electrical, we set 'TA' (which is the most frequent) for the missing value in KitchenQual.
* Exterior1st and Exterior2nd : Again Both Exterior 1 & 2 have only one missing value. We will just substitute in the most common string
* SaleType : Fill in again with most frequent which is "WD"
* MSSubClass : Na most likely means No building class. We can replace missing values with None


In [None]:
# Impute missing values in the training set, one column group at a time.

# LotFrontage: fill with the median LotFrontage of the house's neighborhood.
df_train["LotFrontage"] = df_train.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# Categorical columns where a missing value means the feature is absent -> 'None':
#   Garage* (no garage), Bsmt* (no basement), MasVnrType (no masonry veneer),
#   PoolQC, MiscFeature ("no misc feature"), Alley ("no alley access"),
#   Fence ("no fence"), FireplaceQu ("no fireplace").
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
            'MasVnrType', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'):
    df_train[col] = df_train[col].fillna('None')

# Numeric columns where a missing value means the feature is absent -> 0
# (no garage, no basement, no masonry veneer).
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars',
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
    df_train[col] = df_train[col].fillna(0)

# Columns with only a few missing values -> most frequent value of the column
# (e.g. 'RL' for MSZoning, 'SBrkr' for Electrical, 'TA' for KitchenQual, 'WD' for SaleType).
for col in ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType'):
    df_train[col] = df_train[col].fillna(df_train[col].mode()[0])

# Utilities: all records are "AllPub" except one "NoSeWa" and 2 NA, so it carries no
# predictive signal and is removed. errors="ignore" keeps the cell re-runnable.
df_train = df_train.drop(columns=['Utilities'], errors='ignore')

# Functional: data description says NA means typical.
df_train["Functional"] = df_train["Functional"].fillna("Typ")

# MSSubClass: NA most likely means no building class.
df_train['MSSubClass'] = df_train['MSSubClass'].fillna("None")


In [None]:
# MSSubClass and MoSold are numeric codes that really denote categories:
# recode them as strings ("SC<code>" and three-letter month names).
df_train = df_train.replace({
    "MSSubClass": {code: "SC" + str(code)
                   for code in (20, 30, 40, 45, 50, 60, 70, 75,
                                80, 85, 90, 120, 150, 160, 180, 190)},
    "MoSold": dict(enumerate(("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), start=1)),
})

In [None]:
 #Encode some categorical features as ordered numbers when there is information in the order
# Missing values in these columns were filled with the string "None" earlier, while the
# original mappings only knew "No". The "None" placeholder was therefore never encoded and
# the columns stayed a mix of integers and strings. "None" is now mapped to 0 wherever the
# feature can be absent; the existing "No" keys are kept.
df_train = df_train.replace({"Alley" : {"None" : 0, "Grvl" : 1, "Pave" : 2},
                       "BsmtCond" : {"None" : 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "BsmtExposure" : {"None" : 0, "No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
                       "BsmtFinType1" : {"None" : 0, "No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtFinType2" : {"None" : 0, "No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtQual" : {"None" : 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                       "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "FireplaceQu" : {"None" : 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, 
                                       "Min2" : 6, "Min1" : 7, "Typ" : 8},
                       "GarageCond" : {"None" : 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "GarageQual" : {"None" : 0, "No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                       "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                       "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
                       "PoolQC" : {"None" : 0, "No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "Street" : {"Grvl" : 1, "Pave" : 2},
                       "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}}
                     )

In [None]:
# Split the training columns by dtype: object columns are categorical, everything else
# is numerical. The target SalePrice is removed from the numerical list.
categorical_features = df_train.select_dtypes(include = ["object"]).columns
numerical_features = df_train.select_dtypes(exclude = ["object"]).columns.drop("SalePrice")
print("Numerical features : " + str(numerical_features.size))
print("Categorical features : " + str(categorical_features.size))
train_cat = df_train[categorical_features]
train_num = df_train[numerical_features]

In [None]:
# Remove four columns from the test set. Reassigning with errors="ignore" keeps the
# cell safe to re-run; the in-place drop raised KeyError the second time.
df_test = df_test.drop(columns=['Alley', 'MiscFeature', 'Fence', 'PoolQC'], errors='ignore')

In [None]:
# Impute missing values in the test set.
# Each filled column is assigned back to the frame. Calling fillna(inplace=True) on a
# column selection is chained assignment: pandas warns about it and, with copy-on-write
# enabled, it no longer modifies df_test at all.

# Numeric columns: replace NaN with the column mean.
for col in ("LotFrontage", "MasVnrArea", "BsmtFullBath", "GarageYrBlt", "GarageCars",
            "GarageArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
            "BsmtHalfBath"):
    df_test[col] = df_test[col].fillna(df_test[col].mean())

# Categorical columns: replace NaN with the placeholder category "unknown".
for col in ("MasVnrType", "Utilities", "Exterior1st", "Exterior2nd", "BsmtQual",
            "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish",
            "GarageQual", "GarageCond", "SaleType", "BsmtFinType1", "BsmtFinType2",
            "BsmtExposure", "MSZoning", "BsmtCond"):
    df_test[col] = df_test[col].fillna("unknown")

In [None]:
df_test.columns[df_test.isnull().any()]

In [None]:
sns.heatmap(df_test.isnull(), cbar=False)

In [None]:
#for train set
# Percentage of missing values per column: zero-percent columns removed,
# sorted largest first, at most 30 columns kept.
all_data_na = df_train.isnull().sum().div(len(df_train)).mul(100)
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False).iloc[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(20)

NO MISSING RATIO

In [None]:
#for test set
# Percentage of missing values per column: zero-percent columns removed,
# sorted largest first, at most 30 columns kept.
all_data_na = df_test.isnull().sum().div(len(df_test)).mul(100)
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False).iloc[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(20)

NO MISSING RATIO

In [None]:
df_train.isnull().sum().sort_values(ascending = False)

In [None]:
 #Create dummy features for categorical values via one-hot encoding
train_cat.shape

In [None]:
train_cat.head()
#HERE IS THE VALUES CATEGORICAL

In [None]:
train_num.head()
#HERE IS THE VALUES NUMERIC

In [None]:
str(train_cat.isnull().values.sum())

In [None]:
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.ensemble import ExtraTreesRegressor  
from sklearn.metrics import mean_squared_error

In [None]:

df_train = pd.concat([train_cat,train_num],axis=1)
df_train.shape

In [None]:
df_train.tail()

In [None]:
 ## Combining train and test datasets together so that we can do all the work at once. 
# Id was removed from the training frame but not from df_test. Stacking them as-is adds an
# "Id" feature that is NaN on every training row and a row counter on the test rows.
# It is dropped only for the concat; df_test keeps its Id column for the submission file.
all_data = pd.concat((df_train, df_test.drop(columns="Id", errors="ignore"))).reset_index(drop = True)

In [None]:
# Remove Street and PoolQC from the combined frame. errors="ignore" keeps the cell
# re-runnable: all_data is reassigned, so a second run would otherwise raise KeyError.
all_data = all_data.drop(columns=['Street', 'PoolQC'], errors='ignore')
all_data.shape

In [None]:
## Creating dummy variable 
final_features = pd.get_dummies(all_data).reset_index(drop=True)
final_features.shape

In [None]:
#filling NA's with the mean of the column:
final_features = final_features.fillna(final_features.mean())


In [None]:
X = final_features.iloc[:len(y), :]

X_sub = final_features.iloc[len(y):]

In [None]:
# Remove the rows at the positions listed in `outliers` from both X and y, so that
# features and target stay aligned.
# NOTE(review): not idempotent - X and y are reassigned, so re-running this cell
# removes five more rows by position.
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X=sc.fit_transform(X)


# Train_test split
We have separated the dependent and independent features, and we have separated the train and test data. So why do we still have to split our training data? For this competition, when we train the machine learning algorithms, we use only part of the training set, usually about two-thirds of it. Once the algorithm is trained on that part, we evaluate it on the remaining training data, for which the true prices are known. If the model performs well, we feed the competition test data to the algorithm to predict and submit. The code below splits the train data into 4 parts: **X_train, X_test, y_train, y_test**.

 * X_train, y_train first used to train the algorithm.
 * then, X_test is used in that trained algorithms to predict outcomes.
 * Once we get the outcomes, we compare it with y_test
 
 * By comparing the outcome of the model with y_test, we can determine whether our algorithms are performing well or not.

In [None]:
from sklearn.model_selection import train_test_split
## Train test split follows this distinguished code pattern and helps creating train and test set to build machine learning. 
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.3, random_state = 0)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# NOTE(review): this cell appears to have no effect. The two reshaped arrays below are
# not assigned to anything, and transposing a one-dimensional pandas Series returns the
# same Series - assuming y_train / y_test are Series here, as the `.values` access suggests.
y_train.values.reshape((-1,1))
y_test.values.reshape((-1,1))
y_train=y_train.transpose()
y_test=y_test.transpose()

In [None]:
y_train.shape, y_test.shape

# TIME FOR MODELING
* Before modeling each algorithm, I would like to discuss them for a better understanding. This way I would review what I know and at the same time help out the community. 
* Introducing **Linear Regression**, one of the most basic and straightforward models. Many of us may have learned to show the relationship between two variables using something called "y equals mX plus b." Let's refresh our memory and call upon that equation.
# >                   **y=mX+b**
* m = slope of the regression line. It represents the relationship between X and y. In another word, it gives weight as to for each x(horizontal space) how much y(vertical space) we have to cover. In machine learning, we call it coefficient.
* b = y-intercept.
* x and y are the data points located in x_axis and y_axis respectively.

*MSE (Mean Squared Error)*
# > $\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)^2$
*MAE (Mean Absolute Error)*
# > $\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert \hat{y}_i - y_i \rvert$

In [None]:
## importing necessary models.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Call in the LinearRegression object
# `normalize` was removed from LinearRegression in scikit-learn 1.2 and now raises TypeError.
# The features were already standardized with StandardScaler, and for ordinary least
# squares feature scaling does not change the predictions.
lin_reg = LinearRegression(n_jobs=-1)
## Fit on the training split.
lin_reg.fit(X_train, y_train)
## Predict the held-out validation split.
y_pred = lin_reg.predict(X_test)

In [None]:
y_pred

In [None]:
# get average squared error(MSE) by comparing predicted values with real values. 
print ('%.2f'%mean_squared_error(y_test, y_pred))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
lin_reg = LinearRegression()
cv = KFold(shuffle=True, random_state=2, n_splits=8)
scores = cross_val_score(lin_reg, X,y,cv = cv, scoring = 'neg_mean_absolute_error')

In [None]:
print ('%.8f'%scores.mean())

# Regularization Models
What makes regression model more effective is its ability of regularizing. The term "regularizing" stands for models ability to structurally prevent overfitting by imposing a penalty on the coefficients.

There are three types of regularizations.

* Ridge
* Lasso
* Elastic Net
These regularization methods work by penalizing the magnitude of the coefficients of features and at the same time minimizing the error between the predicted value and actual observed values. This minimization becomes a balance between the error (the difference between the predicted value and observed value) and the size of the coefficients. The only difference between Ridge and Lasso is the way they penalize the coefficients. Elastic Net is the combination of these two: it adds both the sum of the squared coefficients (the Ridge penalty) and the sum of the absolute values of the coefficients (the Lasso penalty). To get more in-depth, let us review the least squares loss function.

Ordinary least squared loss function minimizes the residual sum of the square(RSS) to fit the data:

# $$\text{minimize: } RSS = \sum_{i=1}^{n}(y_i-\hat{y}_i)^2 = \sum_{i=1}^{n}\Big(y_i-\big(\beta_0+\sum_{j=1}^{p}\beta_j x_j\big)\Big)^2$$
 
Let's review this equation once again, Here:

$y_i$ is the observed value.
$\hat{y}_i$ is the predicted value.
The error = $y_i - \hat{y}_i$
The square of the error = $(y_i-\hat{y}_i)^2$
The sum of the square of the error = $\sum_{i=1}^{n}(y_i-\hat{y}_i)^2$, that's the equation on the left.
The only difference between the left-side equation and the right-side one above is the replacement of $\hat{y}_i$; it is replaced by $\big(\beta_0+\sum_{j=1}^{p}\beta_j x_j\big)$, which simply follows the slope equation, y = mx+b, where,
$\beta_0$ is the intercept.
$\beta_j$ is the coefficient of the feature ($x_j$).
Let's describe the effect of regularization and then we will learn how we can use loss function in Ridge.

One of the benefits of regularization is that it deals with multicollinearity(high correlation between predictor variables) well, especially Ridge method. Lasso deals with multicollinearity more brutally by penalizing related coefficients and force them to become zero, hence removing them. However, Lasso is well suited for redundant variables.
Ridge:
Ridge regression adds penalty equivalent to the square of the magnitude of the coefficients. This penalty is added to the least square loss function above and looks like this...

# $$\text{minimize: } RSS + Ridge = \sum_{i=1}^{n}\Big(y_i-\big(\beta_0+\sum_{j=1}^{p}\beta_j x_j\big)\Big)^2 + \lambda_2\sum_{j=1}^{p}\beta_j^2$$
Here,

$\lambda_2$ is a constant; a regularization parameter. It is also known as $\alpha$. The higher the value of this constant, the greater its impact on the loss function.
When $\lambda_2$ is 0, the loss function becomes the same as simple linear regression.
When $\lambda_2$ is $\infty$, the coefficients become 0.
When $\lambda_2$ is between 0 and $\infty$ ($0 < \lambda_2 < \infty$), the $\lambda_2$ parameter decides the magnitude given to the coefficients. The coefficients will be somewhere between 0 and the values they take in simple linear regression.
$\sum_{j=1}^{p}\beta_j^2$ is the sum of the squares of all coefficients.
Now that we know every nitty-gritty details about this equation, let's use it for science, but before that a couple of things to remember.

It is essential to standardize the predictor variables before constructing the models.
It is important to check for multicollinearity,

In [None]:
# Cross-validate a set of baseline regressors on the training split and print, for each,
# the mean and standard deviation of the negated MSE over the folds.
num_folds = 10 
seed = 8
scoring = 'neg_mean_squared_error'
t = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVR', SVR()),
]

# random_state is only meaningful together with shuffle=True. Current scikit-learn raises
# ValueError when random_state is set while shuffle is False. With an integer seed every
# model is scored on the same folds, so the splitter is built once outside the loop.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

results = []
names = []
for name, model in t: 
    cv_results = cross_val_score(model, X_train,y_train, cv=kfold, scoring=scoring) 
    results.append(cv_results)
    names.append(name) 
    print(name, cv_results.mean(), cv_results.std()) 

In [None]:
import xgboost as xgb


In [None]:
# The label must match the rows of X_train. The original passed `y` (every training row),
# whose length differs from the 70% split held in X_train.
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test)

# Cross-validated boosting run; returns the per-round train/test RMSE table.
params = {"max_depth":2, "eta":0.1}
model = xgb.cv(params, dtrain,  num_boost_round=500, early_stopping_rounds=100)

In [None]:
model.loc[30:,["test-rmse-mean", "train-rmse-mean"]].plot()

# L1 Regularisation (Lasso)(Least Absolute Shrinkage and Selection Operator)


* Having a large number of samples (n) with respect to the number of dimensionality (d) increases the quality of our model.
* One way to reduce the eﬀective number of dimensions is to use those that most contribute to the signal and ignore those that mostly act as noise.
* L1 regularization achieves this by adding a penalty that results in the weight for the dimensions that act as noise becoming 0. 
* L1 regularisation encourages a sparse vector of weights in which few are non-zero and many are zero. 


In [None]:
from sklearn.linear_model import Lasso
# `normalize` was removed from Lasso in scikit-learn 1.2 and now raises TypeError.
# The features were already standardized with StandardScaler earlier in the notebook.
# NOTE(review): without `normalize` the effective strength of alpha differs from the
# old behavior; re-tune alpha if results are compared against earlier runs.
lasso = Lasso(alpha=0.1) 
lasso.fit(X_train, y_train) 
lasso_pred = lasso.predict(X_test) 

In [None]:
print(mean_squared_error(y_test,lasso_pred ))

In [None]:
 lasso.score(X_test, y_test) 

# L2 Regularisation (Ridge)

* Ridge regression is an L2 penalized model where we simply add the squared sum of the weights to our least-squares cost function:



* By increasing the value of the hyperparameter λ , we increase the regularization strength and shrink the weights of our model.


In [None]:
from sklearn.linear_model import Ridge
# `normalize` was removed from Ridge in scikit-learn 1.2 and now raises TypeError.
# The features were already standardized with StandardScaler earlier in the notebook.
# NOTE(review): without `normalize` the effective strength of alpha differs from the
# old behavior; re-tune alpha if results are compared against earlier runs.
ridge = Ridge(alpha=0.1) 
ridge.fit(X_train, y_train) 
ridge_pred = ridge.predict(X_test)
# Validation MSE, then R^2 as the cell output.
print(mean_squared_error(y_test,ridge_pred ))
ridge.score(X_test, y_test) 


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so its validation score is reproducible across runs
# (`seed` is the notebook-wide seed defined in the model-comparison cell).
my_model = RandomForestRegressor(random_state=seed)
my_model.fit(X_train, y_train)

In [None]:
random_predicted_prices = my_model.predict(X_test)
print(mean_squared_error(y_test,random_predicted_prices ))

In [None]:
print(random_predicted_prices)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
scaler = StandardScaler().fit(X_train) 
rescaledX = scaler.transform(X_train)
model = GradientBoostingRegressor(random_state=seed, n_estimators=400)
model.fit(rescaledX, y_train)

In [None]:
# The model was fitted on scaler-transformed features, so the validation features must go
# through the same scaler. The original computed rescaledValidationX but then predicted
# on the untransformed X_test.
rescaledValidationX = scaler.transform(X_test)
gradient_predictions = model.predict(rescaledValidationX) 
print(mean_squared_error(y_test, gradient_predictions))

In [None]:
print(gradient_predictions)

In [None]:
# Build the submission frame from the Ridge model - the model whose predictions the
# original loop ended on, since every iteration overwrote submit_test.
# The original paired predictions for the validation split (X_test) with the competition
# Ids, so the rows neither matched in count nor referred to the same houses, and the
# values were still on the log1p scale.
# Here the competition rows (X_sub) are scaled with the scaler fitted on the training
# features, predicted, and mapped back to prices with expm1.
submission_prices = np.expm1(ridge.predict(sc.transform(X_sub)))
submit_test = pd.DataFrame({'Id': df_test['Id'].values, 'SalePrice': submission_prices})

In [None]:
submit_test

In [None]:
# `submit_test1` was never defined (NameError); the submission frame is `submit_test`.
submit_test.to_csv('submission.csv', index=False )

In [None]:
def create_download_link(title = "Download CSV file", filename = "submission.csv"):  
    """Return an IPython ``HTML`` object rendering a link to ``filename``.

    Parameters
    ----------
    title : str
        Text (or HTML markup) shown as the link label; inserted unchanged.
    filename : str
        Path the link points to, relative to the notebook.

    Returns
    -------
    IPython.display.HTML
        An ``<a href="...">`` element that displays as a clickable link in the notebook.
    """
    # Imported locally so the function does not depend on a later cell having run first.
    from html import escape
    from IPython.display import HTML

    # Quote and escape the href so file names containing spaces or quotes stay one attribute.
    html = '<a href="{filename}">{title}</a>'
    html = html.format(title=title, filename=escape(filename, quote=True))
    return HTML(html)

In [None]:
from IPython.display import HTML
create_download_link(filename='submission.csv')