# Predict sales prices

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.model_selection import RepeatedKFold,cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the train and test CSV files from the competition input folder.
data_dir = '/kaggle/input/house-prices-advanced-regression-techniques'

train = pd.read_csv(data_dir + '/train.csv')
test = pd.read_csv(data_dir + '/test.csv')

In [None]:
# Preview the first five rows of the train set.
train.head(n=5)

In [None]:
# train.shape is (rows, columns): columns are reported as features, rows as entries.
n_entries, n_features = train.shape
print(f'Number of features: {n_features}')
print(f'Number of entries: {n_entries}')

### Plotting heatmap of missing values

In [None]:
# Heatmap of the boolean missing-value mask of the train set (rows x columns).
fig, ax = plt.subplots(figsize=(17, 5))
sns.heatmap(train.isnull(), cbar=True, cmap='Set3', ax=ax)
ax.set_xlabel("Column_Name", size=14, weight="bold")
ax.set_title("Places of missing values in column", fontweight="bold", size=14)
plt.show()

### Percentage of missing values in each column of train dataset


In [None]:
# Missing data per column of the train set, sorted from most to least affected.
train_nulls = train.isnull()
# Number of missing values in each column.
total = train_nulls.sum().sort_values(ascending=False)
# Missing values divided by the number of rows (a fraction between 0 and 1).
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

In [None]:
# Dealing with missing data in the train set.
# Step 1: drop every column that has more than one missing value.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally (`drop(labels, 1)` raises a TypeError there).
sparse_columns = missing_data[missing_data['Total'] > 1].index
train = train.drop(sparse_columns, axis=1)
# Step 2: drop the rows in which 'Electrical' is missing.
train = train.drop(train.loc[train['Electrical'].isnull()].index)
# Largest per-column count of missing values that is still left in the frame.
train.isnull().sum().max()

We no longer have any missing values in the train data. Now let us do the same for the test data.

### Percentage of missing values in each column of test dataset


In [None]:
# Missing data per column of the test set, sorted from most to least affected.
test_null_counts = test.isnull().sum()
test_row_counts = test.isnull().count()
# Number of missing values in each column.
total_ = test_null_counts.sort_values(ascending=False)
# Missing values divided by the number of rows (a fraction between 0 and 1).
percent_ = (test_null_counts / test_row_counts).sort_values(ascending=False)
missing_data_ = pd.concat([total_, percent_], axis=1, keys=['Total', 'Percent'])
missing_data_.head(20)

In [None]:
# Dealing with missing data in the test set.
# Step 1: drop every column that has more than one missing value in the test set.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally (`drop(labels, 1)` raises a TypeError there).
sparse_test_columns = missing_data_[missing_data_['Total'] > 1].index
test = test.drop(sparse_test_columns, axis=1)
# Step 2: drop the rows in which 'Electrical' is missing.
# NOTE(review): removing rows from the test set reduces the number of rows
# that can be predicted - confirm this is intended.
test = test.drop(test.loc[test['Electrical'].isnull()].index)
# Largest per-column count of missing values that is still left in the frame
# (columns with exactly one missing value are not dropped above).
test.isnull().sum().max()

### Check statistics of the train dataset


In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
train_summary = train.describe(include='all')
train_summary

The sale price ranges between 34900 and 755000.

In [None]:
# Histogram of the sale prices strictly below 500000.
below_threshold = train["SalePrice"] < 500000
hist_price1 = train.loc[below_threshold, "SalePrice"].hist()

In [None]:
# Histogram of the sale prices strictly above 500000.
above_threshold = train["SalePrice"] > 500000
hist_price2 = train.loc[above_threshold, "SalePrice"].hist()

Handling outliers by keeping only the entries with a sale price below 500000 (entries priced at 500000 or more are removed)


In [None]:
# Keep only the rows whose sale price is strictly below 500000.
# .copy() makes the result an independent frame, so the column assignments
# done on `train` further down do not operate on a slice of the previous frame.
train = train[train["SalePrice"] < 500000].copy()
train

### Remove duplicates

In [None]:
## Data cleaning
# Remove fully duplicated rows, if any.
# The count is printed explicitly: as a bare expression in the middle of the
# cell it was computed but never shown.
n_duplicates = train.duplicated().sum()
print('Duplicate rows removed: {}'.format(n_duplicates))
# Rebind instead of mutating in place so the cell produces a fresh frame.
train = train.drop_duplicates()

### Finding all columns with Categorical & Numerical values

In [None]:
def _columns_of_dtype(frame, dtype_names):
    """Return the column names of `frame` whose dtype equals one of `dtype_names`, in column order."""
    return [col for col in frame.columns if frame[col].dtype in dtype_names]


_CATEGORICAL_DTYPES = ('object',)
_NUMERICAL_DTYPES = ('int64', 'float64')

# For train data
categorical_train = _columns_of_dtype(train, _CATEGORICAL_DTYPES)
numerical_train = _columns_of_dtype(train, _NUMERICAL_DTYPES)

# For test data
categorical_test = _columns_of_dtype(test, _CATEGORICAL_DTYPES)
numerical_test = _columns_of_dtype(test, _NUMERICAL_DTYPES)

In [None]:
# Show the categorical column names of train and test, separated by a blank line.
print(f"{categorical_train} \n\n {categorical_test}")

### Using scikit-learn's StandardScaler to standardize the numeric values

In [None]:
# Standardize the numeric feature columns.
#
# 'Id' (row identifier) and 'SalePrice' (regression target) are deliberately
# left unscaled: they are not features, and scaling them would change the ids
# extracted from `test` later on and put the target on a scale that cannot be
# mapped back to prices.
non_feature_columns = ('Id', 'SalePrice')
train_features = [col for col in numerical_train if col not in non_feature_columns]

# The scaler is fitted on the train set only.
ss = StandardScaler()
train[train_features] = ss.fit_transform(train[train_features])

# The test set is scaled with the statistics learned from train (not re-fitted),
# so both frames share the same scale. The statistics are applied per column
# because train and test do not necessarily contain the same numeric columns;
# numeric test columns that are absent from train are left as they are.
train_stats = pd.DataFrame(
    {'mean': ss.mean_, 'scale': ss.scale_},
    index=train_features,
)
shared_features = [col for col in numerical_test if col in train_stats.index]
if shared_features:
    test[shared_features] = (
        (test[shared_features] - train_stats.loc[shared_features, 'mean'])
        / train_stats.loc[shared_features, 'scale']
    )
test.head()

### Handling Categorical Data using Get_Dummies()

In [None]:
# One-hot encode the categorical columns; the first level of each column is dropped.
# Train and test are encoded independently of each other.
dummy_options = {'drop_first': True}
train1 = pd.get_dummies(train, columns=categorical_train, **dummy_options)
test1 = pd.get_dummies(test, columns=categorical_test, **dummy_options)
train1


### Concatenating the original dataset and the one produced by get_dummies() (note: with `columns=...`, get_dummies() returns the non-categorical columns together with the dummies, so those columns appear twice after this step)

In [None]:
# Place each frame side by side with its dummy-encoded version.
# The encoded frames already carry the non-categorical columns, so those
# column labels occur twice in the result; the `.iloc[:, 1]` selections on
# 'Id' and 'SalePrice' further down depend on that.
train2 = pd.concat((train, train1), axis='columns')
test2 = pd.concat((test, test1), axis='columns')

### Dropping the original categorical columns, which get_dummies() has already encoded

In [None]:
# Remove the original categorical columns from the combined frames.
train = train2.drop(columns=categorical_train)
test = test2.drop(columns=categorical_test)
# The 'Id' label occurs more than once in the combined frame, so test['Id'] is
# a DataFrame; its second column is taken.
# NOTE: `id` shadows the Python built-in of the same name.
id = test['Id'].iloc[:, 1]
id

In [None]:
# Display the test frame after the categorical columns have been dropped.
test

In [None]:
# Target: the 'SalePrice' label occurs more than once in `train`, so
# train['SalePrice'] is a DataFrame; its second column is taken.
sale_price_columns = train['SalePrice']
y = sale_price_columns.iloc[:, 1]
# Features: everything except the identifier and the target.
X = train.drop(columns=['Id', 'SalePrice'])

### Splitting the dataset into test and training data

In [None]:
# Splitting the dataset into test and training data (25% held out).
# A fixed random_state makes the split - and therefore every score computed
# from it - identical on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Report the shape of each part of the split.
split_parts = (
    ('training feature matrix', X_train),
    ('training target vector', y_train),
    ('test feature matrix', X_test),
    ('test target vector', y_test),
)
for part_name, part in split_parts:
    print(f'Dimensions of the {part_name}: {part.shape}')

### Building a regression model

In [None]:
# Gradient Boosting Regressor
# r2_score is imported here because the notebook's import cell does not provide it.
from sklearn.metrics import r2_score

# A fixed random_state makes the fitted model, and the scores below, repeatable.
gbreg = GradientBoostingRegressor(random_state=42)
gbreg.fit(X_train, y_train)

y_pred_gb = gbreg.predict(X_test)

# R2 is printed multiplied by 100, i.e. as a percentage.
print("R2 score: ", r2_score(y_test, y_pred_gb) * 100)
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred_gb)))

# Error: actual and predicted values side by side.
error_diff = pd.DataFrame({'Actual Values': np.array(y_test), 'Predicted Values': y_pred_gb})
print(error_diff.head(5))

# Visualize the error for the first 25 held-out rows.
df1 = error_diff.head(25)
ax = df1.plot(kind='bar', figsize=(10, 7))
ax.set_title('Gradient boosting: actual vs. predicted values')
ax.set_xlabel('Held-out row (first 25)')
ax.set_ylabel('Target value')
ax.grid(which='major', linestyle='-', linewidth=0.5, color='green')
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
plt.show()

In [None]:
# Random forest regression
# r2_score is imported here because the notebook's import cell does not provide it.
from sklearn.metrics import r2_score

# A fixed random_state makes the fitted forest, and the scores below, repeatable.
regrRM2 = RandomForestRegressor(
    n_estimators=200,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=4,
    random_state=42,
)
regrRM2.fit(X_train, y_train)

y_pred_rf = regrRM2.predict(X_test)

# R2 is printed multiplied by 100, i.e. as a percentage.
print("R2 score: ", r2_score(y_test, y_pred_rf) * 100)
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred_rf)))

# Error: actual and predicted values side by side.
error_diff = pd.DataFrame({'Actual Values': np.array(y_test), 'Predicted Values': y_pred_rf})
print(error_diff.head(5))

# Visualize the error for the first 25 held-out rows.
df1 = error_diff.head(25)
ax = df1.plot(kind='bar', figsize=(10, 7))
ax.set_title('Random forest: actual vs. predicted values')
ax.set_xlabel('Held-out row (first 25)')
ax.set_ylabel('Target value')
ax.grid(which='major', linestyle='-', linewidth=0.5, color='green')
ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
plt.show()