In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _, names in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silences every warning category for the rest of the session, so any
# pandas / scikit-learn warnings later cells may raise will not be shown.
warnings.filterwarnings("ignore")


In [None]:
# Load the train and test CSVs, using the 'Id' column as the row index of each frame.
X_train, X_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Print column names, non-null counts and dtypes of the raw training frame.
X_train.info()

In [None]:
# Preview the first rows of the raw training frame.
X_train.head()

In [None]:
# Correlation of every numeric feature with the label, sorted ascending.
# Non-numeric columns are dropped explicitly first: on pandas >= 2.0,
# DataFrame.corr() raises when the frame still contains object (string) columns.
X_train.select_dtypes(include='number').corr()['SalePrice'].sort_values()

## Numerical Features

In [None]:
# Training-set numeric features chosen for their correlation (> 0.5) with
# SalePrice; the label column itself is kept and split off later.
train_numeric_cols = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
    'SalePrice',
]
X_train_num = X_train[train_numeric_cols]
X_train_num.info()

In [None]:
# Same numeric features for the test set (no SalePrice column there).
test_numeric_cols = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
]
X_test_num = X_test[test_numeric_cols]
X_test_num.info()

## Categorical Features

In [None]:
# Pull the object-dtype (string) columns out of the training frame.
X_train_cat = X_train.select_dtypes(include='object')
X_train_cat.info()

In [None]:
# Preview the categorical training columns as loaded (missing values not yet filled).
X_train_cat.head()

In [None]:
#transform NaN in Alley to 'No alley access', [BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2] to 'No Basement'
#FireplaceQu to 'No Fireplace'
#[GarageType,GarageFinish,GarageQual,GarageCond] to 'No Garage'
#PoolQC to 'No Pool', Fence to 'No Fence', MiscFeature to 'None'
def replaceNaN(df):
    """Fill missing values of selected categorical columns, in place.

    Each listed column gets a descriptive placeholder string wherever its
    value is missing. ``df`` is modified directly; columns that are not
    listed are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values should be filled.
    """
    # Placeholder label -> columns that receive it.
    fill_groups = {
        'No alley access': ['Alley'],
        'No Basement': ['BsmtQual', 'BsmtCond', 'BsmtExposure',
                        'BsmtFinType1', 'BsmtFinType2'],
        'No Fireplace': ['FireplaceQu'],
        'No Garage': ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'],
        'No Pool': ['PoolQC'],
        'No Fence': ['Fence'],
        'None': ['MiscFeature'],
    }
    # Flatten to the column -> fill-value mapping that DataFrame.fillna accepts.
    replacements = {}
    for label, columns in fill_groups.items():
        for column in columns:
            replacements[column] = label
    df.fillna(replacements, inplace=True)


In [None]:
# Preview again: replaceNaN has only been defined at this point, not applied,
# so the frame is unchanged from the previous preview.
X_train_cat.head()


In [None]:
#replace NaN in train set
# replaceNaN modifies X_train_cat in place; info() then shows the updated non-null counts.
replaceNaN(X_train_cat)
X_train_cat.info()

In [None]:
# Pull the object-dtype (string) columns out of the test frame.
X_test_cat = X_test.select_dtypes(include='object')
X_test_cat.info()

In [None]:
# replace NaN in test set
# replaceNaN modifies X_test_cat in place; info() then shows the updated non-null counts.
replaceNaN(X_test_cat)
X_test_cat.info()

In [None]:
# Stack the train and test categoricals so both pass through the same encoders.
# Each split's Id index is remembered so the rows can be separated again later.
train_id, test_id = X_train_cat.index, X_test_cat.index
data = pd.concat([X_train_cat, X_test_cat], axis=0)
data.head()

In [None]:
# ordinal encoding
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
colsOrdinal = ['LotShape', 'Utilities', 'LandSlope', 'HouseStyle', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC']
ordinal = OrdinalEncoder()
# fit_transform returns a bare array. The wrapping DataFrame must carry data's
# own Id index: column assignment aligns on index labels, and a default
# 0..n-1 index would shift every encoded row by one Id and leave the last row NaN.
# NOTE(review): with default settings the encoder derives category order from the
# sorted unique values, so codes follow alphabetical rather than quality order — confirm this is intended.
data[colsOrdinal] = pd.DataFrame(
    ordinal.fit_transform(data[colsOrdinal]),
    columns=colsOrdinal,
    index=data.index,
)
data.head()

In [None]:
#one hot encoding
# Every column that was not ordinal-encoded, kept in data's column order.
# (A set difference would give an order that can change between kernel
# restarts, which changes the feature order seen by the models.)
cols = [col for col in data.columns if col not in colsOrdinal]
oneHot = OneHotEncoder()
# The encoder returns a sparse matrix; densify it to build a regular DataFrame.
oneHotArr = oneHot.fit_transform(data[cols]).toarray()
labels = np.array(oneHot.get_feature_names_out()).ravel()
# Keep data's Id index so the one-hot rows line up with the ordinal columns.
OneHotData = pd.DataFrame(oneHotArr, columns=labels, index=data.index)
OneHotData.head()

In [None]:
#concat one hot with ordinal 
# Column-wise concat (aligned on the row index); `data` is rebound to the encoded frame.
data=pd.concat([data[colsOrdinal],OneHotData],axis=1)
data.head()

In [None]:
#separate train and test sets
# Select the encoded rows whose index labels belong to the training split.
X_train_cat=data.loc[train_id]
X_train_cat

In [None]:
# Select the encoded rows whose index labels belong to the test split.
X_test_cat=data.loc[test_id]
X_test_cat

## Concatenate numerical with categorical

In [None]:
#train set
# Index labels are overwritten positionally with those of the numeric frame
# before the column-wise concat, which aligns rows on the index.
X_train_cat.index=X_train_num.index
X_train_prepared = pd.concat([X_train_cat,X_train_num],axis=1)
X_train_prepared.head()

In [None]:
#test set
# Index labels are overwritten positionally with those of the numeric frame
# before the column-wise concat, which aligns rows on the index.
X_test_cat.index=X_test_num.index
X_test_prepared = pd.concat([X_test_cat,X_test_num],axis=1)
X_test_prepared.head()

In [None]:
# Split the label off the training features.
# train_labels is kept as a single-column DataFrame named 'SalePrice'.
train_labels = X_train_prepared[['SalePrice']].copy()
print(train_labels.head())
X_train_prepared = X_train_prepared.drop(columns=['SalePrice'])
X_train_prepared.info()

In [None]:
#minmax normalization
def normalize(train,test):
    """Min-max scale every column of ``train`` and ``test`` onto a shared range.

    For each column of ``train`` the minimum and maximum are taken over the
    train and test values together, and both frames are mapped with
    ``(x - min) / (max - min)``. The inputs are not modified; scaled copies
    are returned.

    A column whose combined minimum equals its maximum has no range to scale.
    It is mapped to 0.0 (missing values stay missing) instead of dividing by
    zero, which would turn the whole column into NaN.

    NOTE(review): the bounds include test-set values, so test statistics
    influence the scaled training features — confirm this is intended.

    Parameters
    ----------
    train, test : pandas.DataFrame
        ``test`` must contain every column of ``train``; columns that exist
        only in ``test`` are returned unscaled.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The scaled copies of ``train`` and ``test``, in that order.
    """
    train_normalized = train.copy()
    test_normalized = test.copy()

    for column in train_normalized.columns:
        mini = min(train_normalized[column].min(), test_normalized[column].min())
        maxi = max(train_normalized[column].max(), test_normalized[column].max())
        span = maxi - mini

        if span == 0:
            # Constant column: only shift it, so every present value becomes 0.0.
            train_normalized[column] = (train_normalized[column] - mini).astype(float)
            test_normalized[column] = (test_normalized[column] - mini).astype(float)
        else:
            train_normalized[column] = (train_normalized[column] - mini) / span
            test_normalized[column] = (test_normalized[column] - mini) / span
    return train_normalized,test_normalized

In [None]:
#Apply data normalization 
cols = X_train_prepared.columns
train,test = normalize(X_train_prepared,X_test_prepared)
# Re-wrapping with columns=cols selects the training column set, in training
# order, for both frames.
X_train_prepared = pd.DataFrame(train,columns=cols)
X_test_prepared = pd.DataFrame(test,columns=cols)
print(X_train_prepared.head())
print(X_test_prepared.head())


In [None]:
#create imputer with median from training set 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')
# Fit on the (already normalized) training features only.
imputer.fit(X_train_prepared)

In [None]:
#fill null values in train set
# imputer.transform returns a bare array, so the column labels and row index
# are captured first and re-attached to the imputed values.
cols = X_train_prepared.columns
inds = X_train_prepared.index
X_train_prepared = pd.DataFrame(imputer.transform(X_train_prepared),columns=cols,index=inds)


In [None]:
# Check non-null counts of the training features after imputation.
X_train_prepared.info()

In [None]:
#fill null values in test set
# Uses the imputer fitted on the training features; `cols` is the training
# column index captured in the previous imputation cell.
inds=X_test_prepared.index
X_test_prepared = pd.DataFrame(imputer.transform(X_test_prepared),columns=cols,index=inds)

In [None]:
# Check non-null counts of the test features after imputation.
X_test_prepared.info()

## Regression Models 

In [None]:
# Divide data into train and validation sets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed. Note: this rebinds X_train, which until now
# held the raw training frame read from train.csv.
X_train, X_val, y_train, y_val = train_test_split(X_train_prepared, train_labels, test_size = 0.2, random_state=42)


#### Linear Regression 

In [None]:
#Linear Regression 
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
# Fitted on the full prepared training set (not the train/validation split).
lin_reg.fit(X_train_prepared, train_labels)

In [None]:
# Predict SalePrice for the prepared test features.
y_pred = lin_reg.predict(X_test_prepared)

In [None]:
# Wrap the predictions in a frame indexed like the test features; y_pred is rebound.
y_pred = pd.DataFrame(y_pred,index= X_test_prepared.index,columns=['SalePrice'])

In [None]:
# Display the linear-regression predictions.
y_pred

In [None]:
# Write the linear-regression predictions (index plus SalePrice) to the working directory.
y_pred.to_csv('predictions_LR.csv')

#### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Validation-set search over the number of trees (1..29); `scores` holds the
# validation RMSE for the matching entry of `parameters`.
parameters = []
scores = []
# y_train is a single-column frame; flatten it once so fit() gets the 1-D
# target it expects rather than a column vector.
y_train_1d = np.ravel(y_train)
for i in range(1, 30):
    parameters.append(i)
    forest_reg = RandomForestRegressor(n_estimators=i, random_state=42)
    forest_reg.fit(X_train, y_train_1d)
    predictions = forest_reg.predict(X_val)
    # mean_squared_error takes (y_true, y_pred), in that order.
    forest_mse = mean_squared_error(y_val, predictions)
    forest_rmse = np.sqrt(forest_mse)
    scores.append(forest_rmse)

In [None]:
# Validation RMSE as a function of the number of trees.
fig, ax = plt.subplots()
ax.plot(parameters, scores)
ax.set_xlabel('n_estimators')
ax.set_ylabel('Score')


In [None]:
# Report the lowest validation RMSE and the tree count that produced it.
best_idx = scores.index(min(scores))
print("Minimum score is ", scores[best_idx])
print("with number of estimators = ", parameters[best_idx])

In [None]:
# Refit on the full prepared training set with the tree count that gave the
# lowest validation RMSE, then predict the test set.
best_n_estimators = parameters[scores.index(min(scores))]
forest_reg = RandomForestRegressor(n_estimators=best_n_estimators, random_state=42)
forest_reg.fit(X_train_prepared, train_labels)
predictions = forest_reg.predict(X_test_prepared)


In [None]:
# Wrap the random-forest predictions in a frame indexed like the test features, then display it.
y_pred = pd.DataFrame(predictions,index= X_test_prepared.index,columns=['SalePrice'])
y_pred

In [None]:
# Write the random-forest predictions (index plus SalePrice) to the working directory.
y_pred.to_csv('predictions_RFR.csv')

#### Support Vector Machine Regressor

In [None]:
from sklearn.svm import SVR

# Validation-set search over C in 1, 11, ..., 91; `scores` holds the validation
# RMSE for the matching entry of `parameters`.
parameters = []
scores = []
# y_train is a single-column frame; flatten it once so fit() gets a 1-D target.
y_train_1d = np.ravel(y_train)
for i in range(1, 101, 10):
    # Record the C value actually used by the model (previously i-1 was
    # stored, so the plot and the reported best C were off by one).
    parameters.append(i)
    svr = SVR(C=i, epsilon=0.2)
    svr.fit(X_train, y_train_1d)
    predictions = svr.predict(X_val)
    # mean_squared_error takes (y_true, y_pred), in that order.
    svr_mse = mean_squared_error(y_val, predictions)
    svr_rmse = np.sqrt(svr_mse)
    scores.append(svr_rmse)

In [None]:
# Validation RMSE as a function of the recorded C values.
fig, ax = plt.subplots()
ax.plot(parameters, scores)
ax.set_xlabel('C')
ax.set_ylabel('Score')

In [None]:
# Report the lowest validation RMSE and the recorded C value at that position.
best_idx = scores.index(min(scores))
print("Minimum score is ", scores[best_idx])
print("with C = ", parameters[best_idx])

In [None]:
# Refit SVR on the full prepared training set and predict the test set.
# NOTE(review): C is hard-coded rather than read from the search results. The
# search cell fits models with C taken from range(1,101,10), so 90 itself is
# not one of the evaluated values — confirm the intended C.
svr = SVR(C=90, epsilon=0.2)
svr.fit(X_train_prepared, train_labels)
predictions = svr.predict(X_test_prepared)

In [None]:
# Wrap the SVR predictions in a frame indexed like the test features, then display it.
y_pred = pd.DataFrame(predictions,index= X_test_prepared.index,columns=['SalePrice'])
y_pred

In [None]:
# Write the SVR predictions (index plus SalePrice) to the working directory.
y_pred.to_csv('predictions_SVR.csv')