# **House Prices - Advanced Regression Techniques**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Do not truncate columns when a DataFrame is displayed.
# `pd.set_option` is the public entry point; `pd.pandas` only resolved by accident.
pd.set_option('display.max_columns', None)

import os

# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## **Loading Dataset**

In [1]:
# Read the training CSV into `train` and preview its first rows.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [1]:
# Show the (rows, columns) tuple of the training frame.
train.shape

## **Finding missing values**

In [1]:
# Null count per column, restricted to columns that have at least one null
# and ordered ascending so the bar chart reads left to right.
null_counts = train.isnull().sum()
missing = null_counts[null_counts > 0].sort_values()
plt.figure(figsize=(9,8))
missing.plot.bar()
plt.show()

In [1]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

## **Visualization of the Data**

In [1]:
# A column is treated as numerical when its dtype is anything other than object.
num_feature = [column for column, dtype in train.dtypes.items() if dtype != 'O']
print('Total nunmber of numerical feature in the dataset:', len(num_feature))
train[num_feature].head()

In [1]:
# Numerical columns whose name contains 'Yr' or 'Year'.
year_feature = [name for name in num_feature if any(tag in name for tag in ('Yr', 'Year'))]
year_feature

In [1]:
plt.figure(figsize=(9,8))
train.groupby('YrSold')['SalePrice'].median().plot()
plt.show()

In [1]:
# For every year column except YrSold, scatter SalePrice against the number of
# years between that column and the sale (YrSold - column).
for feature in year_feature:
    if feature == 'YrSold':
        continue
    # Computed directly from `train`; no copy is needed because nothing is mutated.
    years_before_sale = train['YrSold'] - train[feature]
    plt.figure(figsize=(9,8))
    plt.scatter(years_before_sale, train['SalePrice'])
    # The x axis carries the year difference and the y axis the price
    # (the labels were previously swapped).
    plt.xlabel(f'YrSold - {feature}')
    plt.ylabel('SalePrice')
    plt.show()

In [1]:
# Numerical columns with fewer than 25 distinct values (NaN counted as a value),
# leaving out the year columns and 'Id'.
not_discrete = year_feature + ['Id']
discrete_num = [
    column
    for column in num_feature
    if train[column].nunique(dropna=False) < 25 and column not in not_discrete
]
print('Total  number of discrete_num:', len(discrete_num))

In [1]:
# One bar chart per discrete feature: median SalePrice for each distinct value.
# `train` is only read here, so the per-iteration copy of the whole frame was dropped.
for feature in discrete_num:
    median_price = train.groupby(feature)['SalePrice'].median()
    plt.figure(figsize=(9,8))
    median_price.plot.bar(color='turquoise')
    plt.xlabel(feature)
    plt.ylabel('Salesprice')
    plt.title(feature)
    plt.show()

In [1]:
# Continuous features: numerical columns that are neither discrete, nor a year column, nor 'Id'.
non_continuous = set(discrete_num) | set(year_feature) | {'Id'}
conti_num = [column for column in num_feature if column not in non_continuous]
conti_num

In [1]:
# Histogram (30 bins) of every continuous feature.
# The frame is only read, so it is no longer copied on every iteration.
for feature in conti_num:
    plt.figure(figsize=(9,8))
    train[feature].hist(bins=30)
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.title(feature)
    plt.show()

In [1]:
# Scatter log(feature) against log(SalePrice) for continuous features that contain no zero.
# The log of the target is computed once from `train`; previously, when `feature`
# was 'SalePrice' itself, the column was logged twice before plotting.
log_sale_price = np.log(train['SalePrice'])
for feature in conti_num:
    if (train[feature] == 0).any():
        # log(0) is undefined; features containing a zero are skipped.
        continue
    plt.figure(figsize=(9,8))
    plt.scatter(np.log(train[feature]), log_sale_price, color='greenyellow')
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

In [1]:
# Box plot of log(feature) for every continuous feature that contains no zero.
for feature in conti_num:
    contains_zero = 0 in train[feature].unique()
    if not contains_zero:
        logged = train.copy()
        logged[feature] = np.log(logged[feature])
        plt.figure(figsize=(9,8))
        logged.boxplot(column=feature)
        plt.title(feature)
        plt.show()

In [1]:
# Categorical features are the columns stored with object dtype.
cat_feature = [column for column, dtype in train.dtypes.items() if dtype == 'O']
cat_feature

In [1]:
# One bar chart per categorical feature: median SalePrice for each category.
# Nothing is mutated, so the per-iteration copy of the whole frame was removed.
for feature in cat_feature:
    median_price = train.groupby(feature)['SalePrice'].median()
    plt.figure(figsize=(9,8))
    median_price.plot.bar(color='thistle')
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

## **Feature Engineering**

In [1]:
# Print the per-column null counts computed in the "Finding missing values" section.
print(missing)

In [1]:
# Object-dtype columns of `train` that contain at least one missing value.
# The earlier `> 1` threshold skipped columns with exactly one missing entry.
cat_missing = [
    feature
    for feature in train.columns
    if train[feature].dtype == 'O' and train[feature].isnull().any()
]
train[cat_missing].isnull().sum()

In [1]:
def miss_cat_feature(dataset, features_nan):
    """Return a copy of `dataset` with NaNs in `features_nan` replaced by 'Missing'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to fill; it is not modified.
    features_nan : list of str
        Column labels whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        A new frame; columns outside `features_nan` are left as they were.
    """
    filled = dataset.copy()
    replacement = filled[features_nan].fillna('Missing')
    filled[features_nan] = replacement
    return filled

# Replace missing categorical values in `train` with the 'Missing' label,
# then show the remaining null count for those columns.
train = miss_cat_feature(dataset=train, features_nan=cat_missing)

train[cat_missing].isna().sum()

In [1]:
# Non-object columns of `train` that contain at least one missing value.
# The earlier `> 1` threshold would skip a column with exactly one missing entry.
num_missing = [
    feature
    for feature in train.columns
    if train[feature].dtype != 'O' and train[feature].isnull().any()
]
train[num_missing].isnull().sum()

In [1]:
# Fill each numerical column listed in `num_missing` with its own median,
# then show the remaining null count for those columns.
column_medians = train[num_missing].median()
train[num_missing] = train[num_missing].fillna(column_medians)
train[num_missing].isnull().sum()

In [1]:
# Replace each year column by its distance to the sale year (YrSold - year).
# NOTE: this overwrites the columns in `train`, so running the cell twice undoes the change.
age_columns = ['YearBuilt','YearRemodAdd','GarageYrBlt']
sale_year = train['YrSold']
for column in age_columns:
    train[column] = sale_year - train[column]

train[age_columns].head()

In [1]:
# Natural-log transform, in place, every continuous feature of `train` that
# contains no zero. Columns holding a zero are left unchanged.
for feature in conti_num:
    if 0 not in train[feature].unique():
        train[feature] = np.log(train[feature])

train.head()

In [1]:
# Recompute the list of object-dtype columns of `train`.
cat_feature = [column for column, dtype in train.dtypes.items() if dtype == 'O']
cat_feature

In [1]:
# Ordinal-encode each categorical column: categories are ranked by their mean
# SalePrice (lowest mean -> 0) and every label is replaced by its rank.
for feature in cat_feature:
    mean_price = train.groupby([feature])['SalePrice'].mean()
    ranked_labels = mean_price.sort_values().index
    label_to_rank = dict(zip(ranked_labels, range(len(ranked_labels))))
    train[feature] = train[feature].map(label_to_rank)

In [1]:
# Print dtypes and non-null counts again, now that the categorical columns were mapped.
train.info()

In [1]:
# Replace every NaN still present in `train` with 0.
train = train.fillna(0)

In [1]:
# Columns to scale: everything except the identifier and the target.
feature_scale = [column for column in train.columns if column not in ('Id', 'SalePrice')]

from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on the training features; the fitted scaler is the cell output.
scaler = MinMaxScaler().fit(train[feature_scale])
scaler

In [1]:
# Display the scaled training feature matrix; the result is not stored.
scaler.transform(train[feature_scale])

In [1]:
# Assemble `data`: the unscaled 'Id' and 'SalePrice' columns next to the scaled features.
scaled_features = pd.DataFrame(scaler.transform(train[feature_scale]), columns=feature_scale)
id_and_target = train[['Id', 'SalePrice']].reset_index(drop=True)
data = pd.concat([id_and_target, scaled_features], axis=1)

## **Splitting Data**

In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Hold out 30% of `train` for evaluation (fixed seed for reproducibility).
# The target is dropped by name instead of by position, so the split no longer
# depends on 'SalePrice' being the last column.
# NOTE: 'Id' stays in the feature matrix because the prediction input built
# later (`X_pred = test`) also contains it. The split uses `train`, not the
# min-max scaled `data` frame.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['SalePrice']),
    train['SalePrice'],
    test_size=0.3,
    random_state=0,
)

## **LinearRegression**

In [1]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split and report the
# mean squared error on the hold-out split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

## **RandomForestRegressor**

In [1]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 200-tree random forest (seeded) on the training split and report the
# mean squared error on the hold-out split.
rfr = RandomForestRegressor(n_estimators=200, random_state=1).fit(X_train, y_train)
y_pred = rfr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

## **DecisionTreeRegressor**

In [1]:
from sklearn.tree import DecisionTreeRegressor
# Fit a single decision tree (seeded) on the training split and report the
# mean squared error on the hold-out split.
dtr = DecisionTreeRegressor(random_state=1).fit(X_train, y_train)
y_pred = dtr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

## **Test Data Feature Engineering**

In [1]:
# Read the competition test CSV into `test` and preview its first rows.
test_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [1]:
# Object-dtype columns of `test` that contain at least one missing value.
# The earlier `> 1` threshold skipped columns with exactly one missing entry.
cat_missing = [
    feature
    for feature in test.columns
    if test[feature].dtype == 'O' and test[feature].isnull().any()
]
test[cat_missing].isnull().sum()

In [1]:
# Reuse `miss_cat_feature` from the training feature-engineering section instead
# of redefining an identical copy here: it returns a copy of the frame with
# NaNs in the given columns replaced by 'Missing'.
test = miss_cat_feature(test, cat_missing)

# Remaining null count for the columns that were just filled.
test[cat_missing].isnull().sum()

In [1]:
# Non-object columns of `test` that contain at least one missing value.
# With the earlier `> 1` threshold, a column holding a single NaN was never
# median-imputed and only got zero-filled by the later blanket fillna(0).
num_missing = [
    feature
    for feature in test.columns
    if test[feature].dtype != 'O' and test[feature].isnull().any()
]
test[num_missing].isnull().sum()

In [1]:
# Fill each numerical column listed in `num_missing` with that column's median
# computed on `test`, then show the remaining null count for those columns.
column_medians = test[num_missing].median()
test[num_missing] = test[num_missing].fillna(column_medians)
test[num_missing].isnull().sum()

In [1]:
# Replace each year column of `test` by its distance to the sale year (YrSold - year).
# NOTE: the columns are overwritten, so running the cell twice undoes the change.
age_columns = ['YearBuilt','YearRemodAdd','GarageYrBlt']
sale_year = test['YrSold']
for column in age_columns:
    test[column] = sale_year - test[column]

test[age_columns].head()

In [1]:
# Rebuild `num_feature` from `test`: every column whose dtype is not object.
num_feature = [column for column, dtype in test.dtypes.items() if dtype != 'O']
print('Total nunmber of numerical feature in the dataset:', len(num_feature))
test[num_feature].head()

In [1]:
# Numerical columns whose name contains 'Yr' or 'Year'.
year_feature = [name for name in num_feature if any(tag in name for tag in ('Yr', 'Year'))]
year_feature

In [1]:
# Numerical columns of `test` with fewer than 25 distinct values (NaN counted
# as a value), leaving out the year columns and 'Id'.
not_discrete = year_feature + ['Id']
discrete_num = [
    column
    for column in num_feature
    if test[column].nunique(dropna=False) < 25 and column not in not_discrete
]
print('Total  number of discrete_num:', len(discrete_num))

In [1]:
# Continuous features of `test`: numerical columns that are neither discrete,
# nor a year column, nor 'Id'.
non_continuous = set(discrete_num) | set(year_feature) | {'Id'}
conti_num = [column for column in num_feature if column not in non_continuous]
conti_num

In [1]:
# Natural-log transform, in place, every continuous feature of `test` that
# contains no zero. Columns holding a zero are left unchanged.
# The log is taken from the test column itself. The earlier code assigned
# np.log(train[feature]), i.e. values from a different frame that had already
# been log-transformed during training feature engineering.
for feature in conti_num:
    if 0 not in test[feature].unique():
        test[feature] = np.log(test[feature])

test.head()

In [1]:
# Object-dtype (categorical) columns of `test`.
cat_feature = [column for column, dtype in test.dtypes.items() if dtype == 'O']
cat_feature

In [1]:
# Encode the categorical columns of `test` with the same label -> code mapping
# that was applied to `train`.
#
# At this point `train[feature]` already holds integer codes, so grouping `train`
# by it (as the earlier code did) yields integer keys; the string labels in `test`
# then match nothing, map to NaN, and are zero-filled by the later fillna(0).
# The original labels are therefore re-read from the training CSV and paired,
# row by row, with the codes currently stored in `train`.
raw_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
assert len(raw_train) == len(train), 'train rows no longer line up with train.csv'

for feature in cat_feature:
    # Same placeholder label the notebook uses for missing categories.
    raw_labels = raw_train[feature].fillna('Missing')
    label_codes = dict(zip(raw_labels.to_numpy(), train[feature].to_numpy()))
    # Labels never seen in training stay NaN here.
    test[feature] = test[feature].map(label_codes)

In [1]:
# Replace every NaN still present in `test` with 0.
test = test.fillna(0)

In [1]:
# Columns to scale in `test`: everything except the identifier.
feature_scale = [feature for feature in test.columns if feature not in ['Id']]

# No new MinMaxScaler is created or fitted here: `scaler` is the one already
# fitted on the training features, so test data is scaled with the training
# min/max instead of its own.
scaler

In [1]:
# Display the scaled test feature matrix; the result is not stored.
scaler.transform(test[feature_scale])

In [1]:
# Rebind `data` to the unscaled 'Id' column next to the scaled test features.
scaled_features = pd.DataFrame(scaler.transform(test[feature_scale]), columns=feature_scale)
id_column = test[['Id']].reset_index(drop=True)
data = pd.concat([id_column, scaled_features], axis=1)

In [1]:
# Preview the processed test frame.
test.head()

In [1]:
# The whole processed `test` frame (including 'Id') is the prediction input;
# the scaled `data` frame built above is not used.
X_pred=test
# Keep the identifiers for the submission file.
ID=test['Id']

## **Predicting & Submitting**

In [1]:
# Predict with the fitted random forest (`rfr`).
final_pred=rfr.predict(X_pred)

In [1]:
# Build the submission file: one row per test Id with the predicted price.
# The models were fitted on `train['SalePrice']` after it had been passed
# through np.log with the other zero-free continuous features, so
# `final_pred` is in log space; np.exp converts it back to a price.
submit = pd.DataFrame({
    'Id': ID,
    'SalePrice': np.exp(final_pred),
})
submit.to_csv('houseprice_projection.csv', index=False)