## Import required libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

## Read and Understand training dataset

In [None]:
# read dataset
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
# show head of dataset
train_df.head()

In [None]:
# shape of dataset
train_df.shape

This dataset has 1460 rows and 81 columns, one of which is the target column, `SalePrice`.

In [None]:
# check the features/columns in the dataset
train_df.columns

In [None]:
# get details of each features/columns
train_df.info()

From the info above we can see that the dataset contains columns of int, float and object dtype.
Now check for null values in the dataset.

In [None]:
#show null values using heatmap
plt.figure(figsize=(16,6))
sns.heatmap(train_df.isnull())

In this plot, the white lines show the null values.
Now we drop the features that have more than 50% null values.

## Handling Missing data

Now, missing values in int and float features are replaced by the mean of that feature, and missing values in object features are replaced by the mode.

In [None]:
# find sum of null values in each features
pd.set_option('display.max_rows',100)
train_df.isnull().sum()[:10]

In [None]:
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(train_df['LotFrontage'].mean())

In [None]:
train_df.drop(['Alley'],axis=1,inplace=True)

In [None]:
# find sum of null values in each features
pd.set_option('display.max_rows',100)
train_df.isnull().sum()[9:20]

In [None]:
# find sum of null values in each features
pd.set_option('display.max_rows',100)
train_df.isnull().sum()[19:30]

In [None]:
train_df['MasVnrType'] = train_df['MasVnrType'].fillna(train_df['MasVnrType'].mode()[0])

In [None]:
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(train_df['MasVnrArea'].mean())

In [None]:
train_df['BsmtQual'] = train_df['BsmtQual'].fillna(train_df['BsmtQual'].mode()[0])

In [None]:
# find sum of null values in each features
pd.set_option('display.max_rows',100)
train_df.isnull().sum()[29:40]

In [None]:
# Fill the remaining basement categorical columns with their most frequent value.
for bsmt_col in ['BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    most_common = train_df[bsmt_col].mode()[0]
    train_df[bsmt_col] = train_df[bsmt_col].fillna(most_common)

In [None]:
# find sum of null values in each features
pd.set_option('display.max_rows',100)
train_df.isnull().sum()[39:50]

In [None]:
train_df['Electrical'] = train_df['Electrical'].fillna(train_df['Electrical'].mode()[0])

In [None]:
# find sum of null values in each features
pd.set_option('display.max_rows',100)
train_df.isnull().sum()[55:70]

In [None]:
train_df.drop(['FireplaceQu'],axis=1,inplace=True)

In [None]:
# Fill every garage column (including the numeric GarageYrBlt) with its most frequent value.
for garage_col in ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']:
    most_common = train_df[garage_col].mode()[0]
    train_df[garage_col] = train_df[garage_col].fillna(most_common)

In [None]:
# find sum of null values in each features
pd.set_option('display.max_rows',100)
train_df.isnull().sum()[69:]

In [None]:
train_df.drop(['PoolQC','Fence', 'MiscFeature'],axis=1,inplace=True)

In [None]:
train_df.isnull().sum().max()

In [None]:
#drop ID column
train_df.drop(['Id'],axis=1,inplace=True)

In [None]:
train_df.shape

In [None]:
train_df.columns

In [None]:
#show null values using heatmap
plt.figure(figsize=(16,6))
sns.heatmap(train_df.isnull())

From this plot you can see that there are no null values left in our training dataset.

## Read and Understand testing dataset

In [None]:
#read dataset
test_dff = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# show head
test_dff.head()

In [None]:
# find feature of dataset
test_dff.columns

In [None]:
# shape of datset
test_dff.shape

In [None]:
# get details of dataset
test_dff.info()

In [None]:
# finf null values count in each feature
test_dff.isnull().sum()

In [None]:
test_dff.isnull().sum()[:20]

In [None]:
test_df = test_dff.copy()

In [None]:
# Categorical gaps are filled with the most frequent value of the column.
for cat_col in ('MSZoning', 'Utilities'):
    test_df[cat_col] = test_df[cat_col].fillna(test_df[cat_col].mode()[0])

# Numeric gap is filled with the column mean.
lot_frontage_mean = test_df['LotFrontage'].mean()
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(lot_frontage_mean)

# Alley is removed entirely rather than imputed.
test_df.drop(columns=['Alley'], inplace=True)

In [None]:
test_df.isnull().sum()[20:45]

In [None]:
# Columns filled with their most frequent value.
for mode_col in ['Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtQual',
                 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    test_df[mode_col] = test_df[mode_col].fillna(test_df[mode_col].mode()[0])

# Columns filled with their mean.
for mean_col in ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']:
    test_df[mean_col] = test_df[mean_col].fillna(test_df[mean_col].mean())

In [None]:
test_df.isnull().sum()[45:65]

In [None]:
# Columns filled with their mean.
for mean_col in ['BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
    test_df[mean_col] = test_df[mean_col].fillna(test_df[mean_col].mean())

# Columns filled with their most frequent value (GarageYrBlt is imputed by mode as well).
for mode_col in ['KitchenQual', 'Functional', 'GarageType', 'GarageYrBlt',
                 'GarageFinish', 'GarageQual', 'GarageCond']:
    test_df[mode_col] = test_df[mode_col].fillna(test_df[mode_col].mode()[0])

# FireplaceQu is removed entirely rather than imputed.
test_df.drop(columns=['FireplaceQu'], inplace=True)

In [None]:
test_df.isnull().sum()[65:]

In [None]:
# SaleType gaps are filled with the most frequent sale type.
sale_type_mode = test_df['SaleType'].mode()[0]
test_df['SaleType'] = test_df['SaleType'].fillna(sale_type_mode)

# These three columns are removed entirely rather than imputed.
test_df.drop(columns=['PoolQC', 'Fence', 'MiscFeature'], inplace=True)


In [None]:
test_df.isnull().sum().max()

In [None]:
test_df.shape

In [None]:
test_df.columns

In [None]:
# show null values of the cleaned *test* set using a heatmap
# (this section validates test_df, so test_df is the frame that must be plotted)
plt.figure(figsize=(16,6))
sns.heatmap(test_df.isnull())

In [None]:
test_df.to_csv('newtest.csv')

# Now our data is clean let's start to understand data

In [None]:
# get columns which is having object data type
train_obj_col = train_df.select_dtypes(include=['object'])

In [None]:
train_obj_col.columns

Create a function that handles the categorical features and converts them into one-hot (dummy) encoded features.

In [None]:
def category_onehot_encoding(multicolumns, df=None):
    """One-hot encode the given columns and return the encoded frame.

    Parameters
    ----------
    multicolumns : iterable of column labels
        Columns to encode. Iterating a DataFrame yields its column labels,
        so a frame of the categorical columns may be passed directly.
    df : pandas.DataFrame, optional
        Frame to encode. When omitted, the notebook-level ``final_df`` is
        used, which keeps existing single-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns followed by the dummy columns. The first
        level of every encoded column is dropped (``drop_first=True``).

    Notes
    -----
    * Dummy columns are named ``<column>_<level>``. Without the prefix,
      features that share level labels produce identically named columns,
      and a later duplicate-column filter would silently discard them.
    * The input frame is not modified; use the returned frame.
    * An empty ``multicolumns`` returns an unchanged copy of the frame.
    """
    frame = final_df if df is None else df
    columns = list(multicolumns)

    # Keep the progress output of the original implementation.
    for fields in columns:
        print(fields)

    if not columns:
        return frame.copy()

    # A single get_dummies call encodes every requested column at once
    # instead of growing the result with one concat per column.
    return pd.get_dummies(frame, columns=columns, drop_first=True)

In [None]:
main_df = train_df.copy()

In [None]:
#combine test data
test_data = pd.read_csv('newtest.csv')

In [None]:
test_data.shape

In [None]:
train_data = train_df.copy()

In [None]:
train_data.shape

In [None]:
test_data.drop([test_data.columns[0]],axis=1,inplace=True)

In [None]:
test_data.shape

In [None]:
#now combine both data row wise 
final_df = pd.concat([train_data,test_data],axis=0)

In [None]:
final_df.shape

In [None]:
final_df = category_onehot_encoding(train_obj_col)

In [None]:
final_df.shape

In [None]:
# remove all duplicate columns: for every repeated column label only the
# first occurrence is kept, later columns with the same label are dropped
final_df = final_df.loc[:,~final_df.columns.duplicated()]

In [None]:
final_df.shape

In [None]:
# divide the combined frame back into train and test rows.
# train_data was concatenated first, so the first len(train_data) rows are the
# training rows; deriving the boundary avoids a hard-coded row count.
n_train_rows = len(train_data)
# .copy() gives independent frames, so later in-place edits (e.g. dropping
# SalePrice from df_test) do not act on a slice of final_df.
df_train = final_df.iloc[:n_train_rows,:].copy()
df_test = final_df.iloc[n_train_rows:,:].copy()

In [None]:
# the test split now has a SalePrice column, so we drop it
df_test.drop(['SalePrice'],axis=1,inplace=True)

In [None]:
df_train.shape

In [None]:
df_test.shape

# Building ML model  

In [None]:
x_train = df_train.drop(['SalePrice','Id'],axis=1)# create a new dataframe without SalePrice and Id; this is our training data
y_train = df_train['SalePrice']  # target values

In [None]:
x_train.head()


In [None]:
y_train.head()

In [None]:
demo = df_test.drop(['Id'],axis=1)

## Apply Xgboost 

In [None]:
import xgboost as xgb

In [None]:
# NOTE: despite the variable name, this is a regressor (XGBRegressor), created
# with its default hyperparameters and fitted on the training features/target.
Classifier  = xgb.XGBRegressor()
Classifier.fit(x_train,y_train)

In [None]:
#create a pickle file and save fit model if we do this we no need to train model again and again

In [None]:
import pickle

# Persist the fitted model. The context manager guarantees that the file
# handle is flushed and closed, even if pickling raises.
filename = 'trainedmodel.pkl'
with open(filename,'wb') as model_file:
    pickle.dump(Classifier,model_file)

In [None]:
y_pred = Classifier.predict(demo)

In [None]:
y_pred

In [None]:
# Build the submission frame: the Id column of the cleaned test set paired
# with the model's predicted SalePrice values.
submission2 = pd.DataFrame({
    'Id':test_df['Id'],
    'SalePrice':y_pred
})

In [None]:
submission2.to_csv('house_price_prediction_submition_2.csv',index=False)

In [None]:
show_sub = pd.read_csv('house_price_prediction_submition_2.csv')
show_sub.tail()

### [I'm](https://www.linkedin.com/in/rushikesh-lavate/) a newcomer to kaggle please deliver me honest feedback so I can improve my self, give me upvote if you like. 
#### Thank You, everyone