# Importing all required libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split , cross_val_score
from matplotlib import pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading train data as df

In [None]:
# Load the competition's training set into `df` and preview the first rows.
train_csv_path = "../input/house-prices-advanced-regression-techniques/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Pie chart whose two wedges are the number of rows and the number of
# columns of `df` (the two entries of df.shape).
label_style = {"weight": "bold"}
wedge_style = {'edgecolor': "black", "linewidth": 2}
plt.pie(
    df.shape,
    labels=["rows", "columns"],
    colors=["silver", "orange"],
    pctdistance=0.7,
    autopct="%.1f%%",
    textprops=label_style,
    wedgeprops=wedge_style,
)
plt.title("Shape of dataframe")
plt.show()

In [None]:
# Display pandas' descriptive statistics table for the training dataframe.
df.describe()

In [None]:
# Display the dtype of every column in the training dataframe.
df.dtypes

In [None]:
# Display the total number of cells in the training dataframe (rows x columns).
df.size

In [None]:
# Print every column label of the training dataframe, one per line.
for column_name in df.columns:
    print(column_name)

# Let's take all columns we need for our model

**Let us also check for NaN values in those columns**

In [None]:
# Columns of the training dataframe that are fed to the model as inputs.
features = [
    "Id",
    "MSSubClass",
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "FullBath",
    "HalfBath",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
    "MoSold",
    "YrSold",
]

# Print the number of missing values in each selected column.
# `.sum` has to be *called*: without the parentheses the loop prints the
# bound-method repr instead of the NaN count, so the check shows nothing useful.
for column in features:
    print(f"{column}: {df[column].isna().sum()}")

**So we don't have any NaN values in the selected features**

# Selecting the features (x) and the target (y)

In [None]:
# x: the selected feature columns; y: the sale price the model should predict.
x, y = df[features], df["SalePrice"]

# Split x and y into training and testing data

In [None]:
# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the shuffle deterministic, so the split (and
# every score computed from it) is the same each time the notebook is run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Sanity-check the sizes of the four resulting pieces.
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

# Building a regression model

In [None]:
# Random forest with 50 trees. random_state pins the forest's internal
# randomness so the fitted model and its score are reproducible between runs.
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(x_train, y_train)

# Score of the fitted model on the held-out rows (R^2 for sklearn regressors).
print(model.score(x_test, y_test))

# Predicted sale prices for the first ten held-out rows.
print(model.predict(x_test[:10]))

In [None]:
# Cross-validate a fresh 50-tree random forest on the training portion and
# display the per-fold scores.
cross_val_score(
    estimator=RandomForestRegressor(n_estimators=50),
    X=x_train,
    y=y_train,
)

# Making submission file

In [None]:
# Load the competition's test set (the rows to predict) and display it.
test_csv_path = "../input/house-prices-advanced-regression-techniques/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data

In [None]:
# Columns to pull from the test set; the model is asked to predict on exactly
# these columns, in this order.
features = [
    "Id",
    "MSSubClass",
    "LotArea",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "FullBath",
    "HalfBath",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
    "MoSold",
    "YrSold",
]

# Restrict the test set to the model's input columns and display the result.
new_data = test_data[features]
new_data

# Description of each column in features

**MSSubClass   :   Identifies the type of dwelling involved in the sale.**

**LotArea   :   Lot size in square feet**

**OverallQual   :   Rates the overall material and finish of the house**

**OverallCond   :   Rates the overall condition of the house**

**YearBuilt   :   Original construction date**

**YearRemodAdd   :   Remodel date (same as construction date if no remodeling or additions)**

**1stFlrSF   :   First Floor square feet**

**2ndFlrSF   :   Second floor square feet**

**LowQualFinSF   :   Low quality finished square feet (all floors)**

**GrLivArea   :   Above grade (ground) living area square feet**

**FullBath   :   Full bathrooms above grade**

**HalfBath   :   Half baths above grade**

**BedroomAbvGr   :   Bedrooms above grade (does NOT include basement bedrooms)**

**KitchenAbvGr   :   Kitchens above grade**

**TotRmsAbvGrd   :   Total rooms above grade (does not include bathrooms)**

**Fireplaces   :   Number of fireplaces**

**WoodDeckSF   :   Wood deck area in square feet**

**OpenPorchSF   :   Open porch area in square feet**

**EnclosedPorch   :   Enclosed porch area in square feet**

**3SsnPorch   :   Three season porch area in square feet**

**ScreenPorch   :   Screen porch area in square feet**

**PoolArea   :   Pool area in square feet**

**MiscVal   :   Value of miscellaneous feature in dollars**

**MoSold   :   Month Sold (MM)**

**YrSold   :   Year Sold (YYYY)**

In [None]:
# Predict a sale price for every test row, pair each prediction with the
# row's Id, and write the result as the submission CSV (without the index).
predicted_prices = model.predict(new_data)
output = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "SalePrice": predicted_prices,
    }
)
output.to_csv("house_prediction_submission.csv", index=False)
print("completed")

# Please upvote if you like my work