# Imports

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, RidgeCV
from yellowbrick.regressor import AlphaSelection
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
TRAIN_CSV_PATH = "../input/house-price-prediction-challenge/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

# EDA

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of missing entries in each column (isnull is pandas' alias of isna).
df.isnull().sum()

There are no missing values in any column.

---

In [None]:
# Horizontal bar chart of the number of listings per POSTED_BY category.
df['POSTED_BY'].value_counts().plot.barh()

**Inference**: In most cases the listing is posted through a middleman.

In [None]:
# Bar chart of the summed last column for each UNDER_CONSTRUCTION group.
# numeric_only=True keeps text columns out of the aggregation: pandas >= 2.0
# no longer drops them by default, and summing them only concatenates strings.
# NOTE(review): assumes the last column is numeric (it is used as the
# regression target `y` further down) -- confirm.
ax = df.groupby('UNDER_CONSTRUCTION').sum(numeric_only=True).iloc[:, -1].plot(kind='bar')

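**Inference**: Furnished houses (i.e. those not under construction) are costlier than non-furnished ones. Note that the chart compares the total, not the average, of the last column per group.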

In [None]:
# Bar chart of the third aggregated column for each UNDER_CONSTRUCTION group.
# numeric_only=True makes the positional pick stable: without it, pandas >= 2.0
# also aggregates the text columns, which shifts what `iloc[:, 2]` refers to
# compared with pandas 1.x (where non-numeric columns were dropped).
ax = df.groupby('UNDER_CONSTRUCTION').sum(numeric_only=True).iloc[:, 2].plot(kind='bar')

**Inference**: Houses are more likely to be put on sale once they are fully furnished.

In [None]:
plt.figure(figsize=(20, 5))
# Reduce every address to the text after its last comma; the notes below
# treat that trailing part as the city.
df['ADDRESS'] = df['ADDRESS'].map(lambda address: address.split(',')[-1])
# value_counts() is sorted by frequency, so the first 20 rows are the 20 most common.
city_counts = df['ADDRESS'].value_counts()
city_counts.head(20).plot.bar()
plt.show()

**Inference**: These are the top 20 cities by number of listed houses.

In [None]:
plt.figure(figsize=(20, 20))
# Total SQUARE_FT per ADDRESS value, then a pie chart of the 10 largest totals.
area_by_address = df.groupby('ADDRESS')['SQUARE_FT'].sum()
area_by_address.nlargest(10).plot.pie()
plt.show()

**Inference**: These are the top 10 cities by total house area. **Bangalore** tops the list.

In [None]:
plt.figure(figsize=(20, 8))
# Sum of the UNDER_CONSTRUCTION column per ADDRESS value; plot the 10 largest sums.
under_construction_by_address = df.groupby('ADDRESS')['UNDER_CONSTRUCTION'].sum()
under_construction_by_address.nlargest(10).plot.bar()
plt.show()

**Inference**: Lalitpur has the most unfurnished (under-construction) houses, followed by Mumbai and Bangalore.

---

In [None]:
# Cap BHK_NO.: values below 5 are kept, everything else is stored as 5.

bhk_no = df['BHK_NO.']
df['BHK_NO.'] = bhk_no.where(bhk_no < 5, 5)
df['BHK_NO.'].value_counts()

In [None]:
# Bucket infrequent cities: the 15 most common ADDRESS values are kept as-is,
# every other value is replaced with the label 'Other'.

top_cities = df['ADDRESS'].value_counts().nlargest(15).index
is_top_city = df['ADDRESS'].isin(top_cities)
df['ADDRESS'] = df['ADDRESS'].where(is_top_city, 'Other')
df['ADDRESS'].value_counts()

In [None]:
# Preview the first five rows after the BHK_NO. cap and ADDRESS bucketing.
df.head(n=5)

In [None]:
def label_encoder(df, column_name):
    """Integer-encode one column of ``df`` in place and print the mapping.

    A fresh ``LabelEncoder`` is fit on ``df[column_name]`` and the column is
    overwritten with the encoded values. The column name, one line per
    code with the label it decodes to, and the value counts of the encoded
    column are printed.

    Parameters
    ----------
    df : DataFrame holding the column; it is modified in place.
    column_name : name of the column to encode.

    Returns
    -------
    tuple
        ``(encoded column, fitted LabelEncoder)``.
    """
    encoder = preprocessing.LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])

    separator = '-' * 10
    print(column_name)
    # The encoded column holds codes 0..k-1, so its distinct-value count is k.
    for code in range(df[column_name].nunique()):
        print("For {} : {}".format(code, encoder.inverse_transform([code])))
    print(separator)
    print(df[column_name].value_counts())
    print(separator)

    return df[column_name], encoder

In [None]:
# Encode the three categorical columns. Each fitted encoder gets its own name
# so that every column's codes can still be mapped back to labels later
# (previously all three were bound to `label_encoder_posted_by`, so only the
# ADDRESS encoder survived, under a misleading name).
df['POSTED_BY'], label_encoder_posted_by = label_encoder(df, 'POSTED_BY')
df['BHK_OR_RK'], label_encoder_bhk_or_rk = label_encoder(df, 'BHK_OR_RK')
df['ADDRESS'], label_encoder_address = label_encoder(df, 'ADDRESS')

---

In [None]:
# Preview the first five rows now that the categorical columns are integer codes.
df.head(n=5)

In [None]:
# Features are every column except the last one; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Two alternative scalings of the full feature matrix X.
# NOTE(review): both transformers are fit on every row of X, i.e. including
# rows that are later held out for testing.
scaler, norm = StandardScaler(), Normalizer()

X_stand = scaler.fit_transform(X)  # column-wise standardisation
X_norm = norm.fit_transform(X)     # row-wise normalisation

In [None]:
# Every split uses the same size and seed, so the raw, standardized and
# normalized variants contain exactly the same rows.
split_options = dict(test_size=0.33, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_norm_train, X_norm_test, y_norm_train, y_norm_test = train_test_split(X_norm, y, **split_options)

# Refit the StandardScaler on the training rows only and reuse those statistics
# for the test rows. Splitting a matrix that was standardized with statistics
# from ALL rows would leak test-set information into training.
X_stand_train = scaler.fit_transform(X_train)
X_stand_test = scaler.transform(X_test)
# Same rows as the raw split, so the targets are shared.
y_stand_train, y_stand_test = y_train, y_test

--- 
# Model

In [None]:
def make_model(name, model):
    """Fit ``model`` on the raw, standardized and normalized splits and report each.

    Each variant gets its own fitted estimator. Previously the same object was
    refit three times, so all three returned names pointed at one estimator
    trained on the normalized data, and the "original" and "stand" scores were
    computed with that wrong model.

    Reads the module-level splits ``X_train``/``X_test``/``y_train``/``y_test``
    and their ``_stand_`` and ``_norm_`` counterparts.

    Parameters
    ----------
    name : label printed above the metrics.
    model : unfitted scikit-learn regressor; the instance itself is fit on the
        normalized split, and unfitted clones are used for the other two.

    Returns
    -------
    tuple
        ``(clf, clf_stand, clf_norm)`` -- estimators fit on the raw,
        standardized and normalized training data respectively.
    """
    from sklearn.base import clone

    # Independent copies for the raw and standardized data; the passed-in
    # instance is fit last on the normalized data, as callers expect.
    clf = clone(model).fit(X_train, y_train)
    clf_stand = clone(model).fit(X_stand_train, y_stand_train)
    clf_norm = model.fit(X_norm_train, y_norm_train)

    y_pred = clf.predict(X_test)
    y_stand_pred = clf_stand.predict(X_stand_test)
    y_norm_pred = clf_norm.predict(X_norm_test)

    print(name)
    print("For orignal:")
    print("R2 : ", r2_score(y_test, y_pred))
    print("MSE : ", mean_squared_error(y_test, y_pred))
    print("-" * 10)
    print("For stand:")
    print("R2 : ", r2_score(y_stand_test, y_stand_pred))
    print("MSE : ", mean_squared_error(y_stand_test, y_stand_pred))
    print("-" * 10)
    print("For normalized:")
    print("R2 : ", r2_score(y_norm_test, y_norm_pred))
    print("MSE : ", mean_squared_error(y_norm_test, y_norm_pred))
    print("=" * 10)
    return clf, clf_stand, clf_norm

In [None]:
# Fit and report every candidate model. Only the third returned estimator
# (the one for the normalized data) is kept under a name.
_, _, Linear = make_model("Linear Regression", LinearRegression())
_, _, Lasso = make_model("Lasso Regression", LassoCV(cv=10))
_, _, Ridge = make_model("Ridge Regression", RidgeCV(alphas=(0.1, 1.0, 5.0, 10)))
# NOTE(review): l1_ratio=0.5 is ElasticNetCV's documented default, so the two
# Elastic Net runs below are configured identically and the second rebinds
# `ElasticNet` -- confirm whether a different l1_ratio was intended.
_, _, ElasticNet = make_model("Elastic Net", ElasticNetCV(l1_ratio=0.5))
_, _, ElasticNet = make_model("Elastic Net", ElasticNetCV())
_, _, Gradient = make_model("Gradient", GradientBoostingRegressor())
_, _, AdaBoost = make_model("Ada", AdaBoostRegressor())


I will be using the Gradient Boosting model, as it has the highest (and a positive) R2 value.

In [None]:
# Stack the two boosting models, with a linear regression combining their
# predictions, and evaluate on the normalized split.
# The second estimator is an AdaBoostRegressor, so it is registered under
# 'AdaBoost' (it was previously mislabelled 'svr').
estimators = [('Gradient', Gradient), ('AdaBoost', AdaBoost)]
reg = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
reg.fit(X_norm_train, y_norm_train)
y_norm_pred = reg.predict(X_norm_test)

In [None]:
# Hold-out metrics of the stacked model on the normalized test split.
stack_r2 = r2_score(y_norm_test, y_norm_pred)
stack_mse = mean_squared_error(y_norm_test, y_norm_pred)

print("For normalized:")
print("R2 : ", stack_r2)
print("MSE : ", stack_mse)
print("=" * 10)