In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Mumbai listings CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/housing-prices-in-metropolitan-areas-of-india/Mumbai.csv'
)

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Remove the cap on displayed columns so wide frames are rendered in full,
# then preview the first rows.
pd.options.display.max_columns = None
df.head(n=5)

In [None]:
# Non-object columns of the raw frame, minus the target 'Price'.
numeric_data = (
    df.select_dtypes(exclude=['object'])
      .drop(columns=['Price'])
      .copy()
)
numeric_data.head(n=5)

In [None]:
# Object-dtype (text) columns of the raw frame.
categorical_data = df.select_dtypes(include=['object'])
categorical_data.head(n=5)

In [None]:
#Count plot (categorical, univariate analysis)
import matplotlib.pyplot as plt
import seaborn as sns 

# Work on a copy so `df` keeps the raw values; 'Area' is binned into
# 250-wide intervals (open-ended above 3000) so it can be count-plotted
# like the discrete columns.
df1 = df.copy()
df1['Area'] = pd.cut(df1['Area'], bins=[*range(0, 3001, 250), np.inf])

# Four plots per row. Keep the original 10-row layout, but grow the grid if
# there are more than 40 columns instead of failing in add_subplot.
n_grid_cols = 4
n_grid_rows = max(10, -(-len(numeric_data.columns) // n_grid_cols))

fig = plt.figure(figsize=(20, 30))
for i, col in enumerate(numeric_data):
    fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # Pass the vector as `x=` explicitly: newer seaborn releases take `data`
    # as the first positional argument, so a bare positional Series is no
    # longer read as the x variable.
    sns.countplot(x=df1[col])
    plt.xlabel(col, size=15)
    plt.xticks(rotation=90)
plt.tight_layout(pad=1)
plt.show()
 

In [None]:
# Horizontal count plot of the 50 most frequent locations.
fig, ax = plt.subplots(figsize=(23, 20))
ax.set_title('Houses at each Location', fontsize=20)
sns.countplot(
    y='Location',
    data=df,
    order=df.Location.value_counts().index[:50],
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
# With y='Location' the counts run along the x axis and the location names
# along the y axis, so the labels must be assigned accordingly.
ax.set_xlabel('No. of Houses', fontsize=20)
ax.set_ylabel('Locations', fontsize=20)
plt.show()

In [None]:
#Count plot (categorical, univariate analysis)
# Count of houses per binned 'Area' interval (bins were created on `df1`).
fig = plt.figure(figsize=(18, 20))
# Pass the vector as `x=` explicitly: newer seaborn releases take `data` as
# the first positional argument, so a bare positional Series is no longer
# read as the x variable.
sns.countplot(x=df1['Area'])
plt.xlabel('Area', fontsize=15)
plt.ylabel('No. of Houses', fontsize=15)
plt.xticks(rotation=40)
plt.tight_layout(pad=1)
plt.show()

In [None]:
# The value 9 is treated as a "not available" placeholder. Restrict the
# replacement to the columns that can actually be missing: the five columns
# below are the ones this notebook later uses as its "no missing values"
# subset, so a literal 9 in them (e.g. a bedroom count) is real data and must
# not be blanked out.
always_present = ['Price', 'Area', 'No. of Bedrooms', 'Resale', 'Location']
placeholder_cols = [c for c in df.columns if c not in always_present]

df2 = df.copy()
df2[placeholder_cols] = df2[placeholder_cols].replace(9, np.nan)

# Back-fill each gap from the next row, then zero whatever is still missing
# at the bottom of the frame. `DataFrame.bfill()` replaces the deprecated
# `fillna(method='bfill')` spelling.
df2 = df2.bfill(axis=0).fillna(0)
df2.head()

In [None]:
#Correlation
# Pairwise correlation of the non-object columns of the cleaned frame.
num = df2.select_dtypes(exclude=['object'])
numeric_correlation = num.corr()

# Heatmap of the boolean matrix marking pairs whose correlation exceeds 0.8.
plt.figure(figsize=(10, 10))
plt.title('Correlation')
sns.heatmap(numeric_correlation.gt(0.8), annot=True, square=True)

In [None]:
# Correlation of every numeric column with 'Price', strongest first.
print(numeric_correlation.loc[:, 'Price'].sort_values(ascending=False))

In [None]:
# Dropping features due to high correlation.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df2 = df2.drop(
    columns=['Hospital', 'AC', 'Refrigerator', 'LiftAvailable'],
    errors='ignore',
)

In [None]:
#Missing Values
# Per-column count of null values, largest first (at most 51 rows shown).
(
    df2.isnull()
       .sum()
       .to_frame(name='sum')
       .sort_values(by='sum', ascending=False)
       .head(51)
)

In [None]:
# Distribution of the raw target column.
plt.figure(figsize=(10, 6))
# Title fixed: "Distrubution" was a typo and the plotted column is 'Price'.
plt.title("Distribution of Price")
# NOTE(review): `distplot` is deprecated in recent seaborn releases in favour
# of `histplot`/`displot`; kept here so the cell runs on older versions too.
dist = sns.distplot(df2['Price'], norm_hist=False)

In [None]:
# Distribution of the natural log of the target column.
plt.figure(figsize=(10, 6))
# Title fixed: it previously repeated the raw-price title (with a typo) even
# though the plotted values are log-transformed.
plt.title("Distribution of log(Price)")
# NOTE(review): `distplot` is deprecated in recent seaborn releases in favour
# of `histplot`/`displot`; kept here so the cell runs on older versions too.
dist = sns.distplot(np.log(df2['Price']), norm_hist=False)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is log1p(Price).
x = df2.drop(columns=['Price'])
y = np.log1p(df2['Price'])
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Partition the feature names by dtype, keeping the frame's column order.
categorical_cols = [
    name for name, dtype in x.dtypes.items() if dtype == "object"
]
numerical_cols = [
    name for name, dtype in x.dtypes.items()
    if dtype in ['int64', 'float64', 'uint8']
]

# Re-order both splits as numeric columns first, then categorical ones.
my_cols = [*numerical_cols, *categorical_cols]
X_train, X_val = X_train[my_cols].copy(), X_val[my_cols].copy()
print(categorical_cols, numerical_cols)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numeric branch: constant-value imputation (SimpleImputer's default fill value).
num_transformer = Pipeline([
    ('num_imputer', SimpleImputer(strategy='constant')),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

In [None]:
# Reversing log-transform on y
def inv_y(transformed_y):
    """Map log1p-transformed targets or predictions back to the original scale.

    The target is built with ``np.log1p`` (log(1 + Price)), so the exact
    inverse is ``np.expm1``; plain ``np.exp`` would return Price + 1.

    Parameters
    ----------
    transformed_y : scalar or array-like
        Values on the log1p scale.

    Returns
    -------
    Same shape as the input, on the original (un-logged) scale.
    """
    return np.expm1(transformed_y)


# Number of cross-validation folds passed to LassoCV.
n_folds = 10

In [None]:
from sklearn.linear_model import LassoCV

# Candidate regressors, fitted and scored in this order. Each one is wrapped
# in a pipeline behind the shared `preprocessor`, so imputation and encoding
# are learned from the training split only.
candidates = [
    # XGBoost
    ('XGBoost', XGBRegressor(learning_rate=0.01, n_estimators=3460, max_depth=3,
                             min_child_weight=0, gamma=0, subsample=0.7,
                             colsample_bytree=0.7, objective='reg:squarederror',
                             nthread=-1, scale_pos_weight=1, seed=27,
                             reg_alpha=0.00006)),
    # Lasso -- max_iter must be an integer; the float 1e7 is rejected by
    # scikit-learn's parameter validation in recent releases.
    ('Lasso', LassoCV(max_iter=10000000, random_state=14, cv=n_folds)),
    # GradientBoosting
    ('Gradient', GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                           max_depth=4, random_state=5)),
]

for label, model in candidates:
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)])
    clf.fit(X_train, y_train)
    predict = clf.predict(X_val)
    # Report MAE on the original price scale; arguments follow the
    # (y_true, y_pred) order of the metric's signature.
    print(label + ': ' + str(mean_absolute_error(inv_y(y_val), inv_y(predict))))

# Only using columns with no missing (not available) values

In [None]:
# Reduced frame: the target plus the four columns this notebook treats as
# having no missing values.
df3 = df.loc[:, ['Price', 'Area', 'No. of Bedrooms', 'Resale', 'Location']].copy()
df3.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column of the reduced frame except the target; the
# target is log1p(Price).
x = df3.drop(columns=['Price'])
y = np.log1p(df3['Price'])
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Partition the feature names by dtype, keeping the frame's column order.
categorical_cols = [
    name for name, dtype in x.dtypes.items() if dtype == "object"
]
numerical_cols = [
    name for name, dtype in x.dtypes.items()
    if dtype in ['int64', 'float64', 'uint8']
]

# Re-order both splits as numeric columns first, then categorical ones.
my_cols = [*numerical_cols, *categorical_cols]
X_train, X_val = X_train[my_cols].copy(), X_val[my_cols].copy()
print(categorical_cols, numerical_cols)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numeric branch: constant-value imputation (SimpleImputer's default fill value).
num_transformer = Pipeline([
    ('num_imputer', SimpleImputer(strategy='constant')),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols),
])

In [None]:
from sklearn.linear_model import LassoCV

# Candidate regressors, fitted and scored in this order. Each one is wrapped
# in a pipeline behind the shared `preprocessor`, so imputation and encoding
# are learned from the training split only.
candidates = [
    # XGBoost
    ('XGBoost', XGBRegressor(learning_rate=0.01, n_estimators=3460, max_depth=3,
                             min_child_weight=0, gamma=0, subsample=0.7,
                             colsample_bytree=0.7, objective='reg:squarederror',
                             nthread=-1, scale_pos_weight=1, seed=27,
                             reg_alpha=0.00006)),
    # Lasso -- max_iter must be an integer; the float 1e7 is rejected by
    # scikit-learn's parameter validation in recent releases.
    ('Lasso', LassoCV(max_iter=10000000, random_state=14, cv=n_folds)),
    # GradientBoosting
    ('Gradient', GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                           max_depth=4, random_state=5)),
]

for label, model in candidates:
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)])
    clf.fit(X_train, y_train)
    predict = clf.predict(X_val)
    # Report MAE on the original price scale; arguments follow the
    # (y_true, y_pred) order of the metric's signature.
    print(label + ': ' + str(mean_absolute_error(inv_y(y_val), inv_y(predict))))