In [None]:
# Show which Python interpreter this kernel is running on.
import sys
print(sys.executable)

In [None]:
## Basics
import numpy as np
import pandas as pd
import re
import datetime as dt
import pickle
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

## Regression using scikit-learn
from sklearn.linear_model import Ridge, Lasso
from sklearn import metrics
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder, scale, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor

## Setup
warnings.filterwarnings('ignore') 
plt.style.use(['seaborn-pastel'])
%matplotlib inline
%autosave 10

In [None]:
# Load the raw listings; `tradeTime` is parsed as a datetime and the file is decoded as GB2312.
beijing = pd.read_csv("../input/lianjia/new.csv", parse_dates=["tradeTime"], encoding='gb2312')
# List the column names, then preview the first rows.
print(beijing.columns.values)
beijing.head()

In [None]:
# Summary statistics of the numeric columns.
beijing.describe()

In [None]:
# Discard the columns that are not referenced anywhere later in the notebook.
unused_columns = ["Cid", "id", "url", "price"]
beijing.drop(columns = unused_columns, inplace=True)

In [None]:
# Column dtypes and non-null counts after dropping the unused columns.
beijing.info()

## Data cleaning

#### Cleaning factor variables

In [None]:
# Print the distinct values of every object-typed column to spot entries that need cleaning.
for col_name, col_values in beijing.items():
    if col_values.dtype == "O":
        print("{}: {}\n".format(col_name, col_values.unique()))

In [None]:
# Keep only the first run of digits from the text column at position 11
# (presumably `floor` — confirm against the printed column list).
# `expand=False` makes `str.extract` return a Series, so no NumPy reshape round-trip is
# needed, and assigning by label replaces the column instead of writing into it.
digits_col = beijing.columns[11]
beijing[digits_col] = beijing[digits_col].str.extract("([0-9]+)", expand=False)

In [None]:
# Convert the columns at these positions to numbers; unparseable values become NaN.
sel = [7,8,10,11,13]
for s in sel:
    col = beijing.columns[s]
    # Assign by label: with pandas >= 2.0, `iloc[:, s] = ...` writes into the existing
    # object column in place, so the numeric dtype produced here would be lost.
    beijing[col] = pd.to_numeric(beijing[col], errors='coerce')

#### Removing missing values

In [None]:
# Percentage of missing values per column.
beijing.isna().mean()*100

Understanding the distribution of the DOM column

In [None]:
# Box plot of DOM to inspect its spread and outliers.
plt.clf()
# DOM values are drawn along the y axis.
sns.boxplot(y = beijing["DOM"])
# NOTE(review): the x axis is labelled "DOM" although the values run along y — confirm the intended labels.
plt.xlabel("DOM")
plt.ylabel("distribution")
plt.show()

In [None]:
# Kernel density estimate of DOM.
plt.clf()
sns.kdeplot(data = beijing.loc[:,"DOM"])
plt.show()

In [None]:
# Quartiles and maximum of DOM.
beijing["DOM"].quantile([0.25,0.5,0.75,1])

The DOM distribution is heavily right-skewed, with many observations lying beyond the upper whisker, so we impute the missing values using the median (which is robust to these outliers).

In [None]:
# Replace missing DOM values with the column median.
beijing["DOM"] = beijing["DOM"].fillna(beijing["DOM"].median())

Removing the remaining records that still have missing values

In [None]:
# Drop every row that still contains at least one missing value (modifies `beijing` in place).
beijing.dropna(axis = 0, how = "any", inplace = True)

In [None]:
# Integer-valued columns become int64; coded categorical columns become object.
# `errors = 'ignore'` leaves a column untouched when its conversion fails.
int_columns = ["DOM", "livingRoom", "drawingRoom", "bathRoom", "floor", "constructionTime"]
object_columns = ["buildingType", "renovationCondition", "buildingStructure",
                  "elevator", "fiveYearsProperty", "subway", "district"]
dtype_map = {**dict.fromkeys(int_columns, "int64"), **dict.fromkeys(object_columns, "O")}
beijing = beijing.astype(dtype_map, errors = 'ignore')

In [None]:
# Check the dtypes after the cast.
beijing.info()

Moving response feature to the right end of the dataframe

In [None]:
# Reorder the columns so the response (`totalPrice`) comes last. Selecting columns
# directly avoids an index join, which is unnecessary work and would duplicate rows
# if the index ever contained repeated labels.
feature_df = beijing.drop("totalPrice", axis = 1)
response_df = beijing["totalPrice"]
beijing = beijing[feature_df.columns.tolist() + ["totalPrice"]]
beijing.head()

#### Correcting factor labels

In [None]:
# Column order and dtypes before the factor codes are relabelled.
beijing.info()

In [None]:
# Report the frame's shape and the distinct values found in each column.
print("Shape of dataframe:    {}".format(beijing.shape))
print(beijing.apply(lambda x: x.unique()))

In [None]:
# Cast every object-typed column to int64 and leave the other columns as they are.
beijing = beijing.apply(lambda x: x.astype("int64") if(x.dtype =="O") else x)

In [None]:
# Lookup tables translating the integer factor codes into readable labels.
# NOTE(review): the label text presumably follows the dataset's data dictionary — verify.
# NOTE(review): "Bunglow" looks like a misspelling of "Bungalow"; the labels become
# category values in the frame, so renaming them changes what later cells see.
buildingType = {
    1:"Tower",
    2:"Bunglow",
    3:"Plate/Tower",
    4:"Plate"
}
renovationCondition = {
    1:"Other",
    2:"Rough",
    3:"Simplicity",
    4:"Hardcover"
}
buildingStructure = {
    1:"Unavailable",
    2:"Mixed",
    3:"Brick/Wood",
    4:"Brick/Concrete",
    5:"Steel",
    6:"Steel/Concrete"
}
elevator = {
    1:"Present",
    0:"Absent"
}
subway = {
    1:"Nearby",
    0:"Far"
}
fiveYearProperty = {
    1:"Ownership<5y",
    0:"Ownership>5y"
}
# NOTE(review): "FaXing" and "FangShang" may be misspelled district names — confirm.
district = {
    1 : "DongCheng",
    2 : "FengTai",
    3 : "DaXing",
    4 : "FaXing",
    5 : "FangShang",
    6 : "ChangPing",
    7 : "ChaoYang",
    8 : "HaiDian",
    9 : "ShiJingShan",
    10 : "XiCheng",
    11 : "TongZhou",
    12 : "ShunYi",
    13 : "MenTouGou"
}
# Maps a column POSITION in `beijing` to the lookup table applied to that column.
# The positions depend on the current column order of the frame.
correct_label = {
    11:buildingType,
    13:renovationCondition,
    14:buildingStructure,
    16:elevator,
    17:fiveYearProperty,
    18:subway,
    19:district
}

In [None]:
# Print each column position together with its code-to-label table.
for key,val in correct_label.items():
    print(key,val)

In [None]:
# Swap the integer codes for their labels, one column position at a time.
for key,val in correct_label.items():
    col = beijing.columns[key]
    # Assign by label so the column is replaced outright: writing strings into an
    # int64 column through `iloc` is an incompatible-dtype in-place set in recent pandas.
    beijing[col] = beijing[col].replace(val)

In [None]:
# Shape and per-column distinct values after relabelling.
print(beijing.shape)
print(beijing.apply(lambda x: x.unique()))

In [None]:
# Store the room-count columns as object dtype; a column whose conversion fails is left unchanged.
beijing = beijing.astype({"livingRoom":"O","drawingRoom":"O","bathRoom":"O","kitchen":"O"}, errors = 'ignore')

In [None]:
# Renumber the rows from 0 and discard the old index (in place).
beijing.reset_index(inplace=True, drop = True)

## Understanding the data

In [None]:
# Split column positions into categorical (object dtype) and numeric; datetime
# columns go into neither list.
names = beijing.columns
cat_idx = []
num_idx = []
# Iterate over every column: a hard-coded count silently skips the trailing columns,
# including the response `totalPrice`, which was moved to the end of the frame.
for i in range(len(names)):
    column = beijing.iloc[:,i]
    if(column.dtype=="O"):
        cat_idx.append(i)
        print("{}\nUnique values:\n{}\n".format(names[i],column.unique()))
    elif(column.dtype!='<M8[ns]'):
        num_idx.append(i)

In [None]:
# Pairwise scatter plots of the numeric columns; `corner=True` draws only the lower triangle.
plt.clf()
sns.pairplot(beijing.iloc[:,num_idx], corner=True)
plt.show()

In [None]:
# Summary statistics of the numeric columns, skipping the first two entries of `num_idx`.
beijing.iloc[:,num_idx[2:]].describe()

In [None]:
# Count, number of distinct values, mode and mode frequency of the categorical columns.
beijing.iloc[:,cat_idx].describe()

In [None]:
# Mean, minimum, maximum and median sale price per district.
# String aliases are used because passing NumPy callables to `agg` is deprecated in
# recent pandas and gives version-dependent column names ("amin"/"amax" vs "min"/"max").
temp = beijing["totalPrice"].groupby(beijing["district"]).agg(["mean", "min", "max", "median"])
temp

## Saving the cleaned dataset

In [None]:
# Remove the coordinate columns before saving (in place).
coordinate_columns = ["Lat", "Lng"]
beijing.drop(columns = coordinate_columns, inplace=True)

In [None]:
# Final column list and dtypes of the cleaned frame.
beijing.info()

In [None]:
# Remember each column's dtype so it can be re-applied after the CSV round trip.
dataClasses = beijing.dtypes.to_dict()

In [None]:
# Write the cleaned frame (index included) to a UTF-8 CSV in the working directory.
beijing.to_csv('cleaned_beijing.csv', encoding='utf-8')

## Preprocessing

In [None]:
# Reload the cleaned data; the first CSV column is used as the index and `tradeTime` is parsed as datetime.
beijing = pd.read_csv("./cleaned_beijing.csv", parse_dates=["tradeTime"], index_col = 0, encoding='utf-8')
beijing.head()

In [None]:
# Re-apply the dtypes captured before saving; a column whose conversion fails is left unchanged.
beijing = beijing.astype(dataClasses, errors = 'ignore')

In [None]:
# Fit a dense one-hot encoder on every object-typed column of the cleaned frame.
cat_mask = beijing.dtypes == object
cat_cols = beijing.columns[cat_mask].tolist()
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old keyword was later
# removed); try the current name first and fall back for older installations.
try:
    l_enc = OneHotEncoder(handle_unknown='ignore', sparse_output = False)
except TypeError:
    l_enc = OneHotEncoder(handle_unknown='ignore', sparse = False)
l_enc.fit(beijing[cat_cols])
def encode_df(df, enc, cat_cols):
    """One-hot encode the categorical columns of ``df`` and make ``tradeTime`` numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tradeTime``, ``totalPrice`` and every column in ``cat_cols``.
    enc : fitted encoder
        Object exposing ``transform`` and ``get_feature_names_out`` (or the older
        ``get_feature_names``), e.g. a fitted ``OneHotEncoder``.
    cat_cols : list of str
        Columns to replace with their encoded indicator columns.

    Returns
    -------
    pandas.DataFrame
        The non-categorical features, then the encoded columns, then ``totalPrice``.
        ``tradeTime`` holds proleptic Gregorian ordinals. The caller's ``df`` is
        not modified.
    """
    encoded_array = enc.transform(df[cat_cols])
    # Tolerate encoders configured to return a sparse matrix.
    if hasattr(encoded_array, "toarray"):
        encoded_array = encoded_array.toarray()
    # `get_feature_names` was removed from scikit-learn; prefer the newer name when present.
    feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
    # Reuse df's index so rows stay aligned even when df is not indexed 0..n-1;
    # a fresh RangeIndex would silently drop or mismatch rows when combined by index.
    encoded_df = pd.DataFrame(encoded_array, columns = feature_namer(cat_cols), index = df.index)
    feature_df = df.drop(columns = ["totalPrice"] + list(cat_cols))
    df = pd.concat([feature_df, encoded_df, df["totalPrice"]], axis = 1)
    # Models need a numeric date: convert each timestamp to its ordinal day number.
    df['tradeTime'] = df['tradeTime'].astype('datetime64[ns]').map(dt.datetime.toordinal)
    return df

In [None]:
# Encoded copy of the full cleaned frame, used for the lasso feature screening.
l_beijing = encode_df(beijing, l_enc, cat_cols)

In [None]:
# Rows and columns after one-hot encoding.
l_beijing.shape

## Modelling

In [None]:
def split_df(df):
    """Separate an encoded frame into model inputs and the response.

    Returns a tuple ``(X, y)``: ``X`` is the array of every column except
    ``totalPrice`` and ``y`` is ``totalPrice`` reshaped to a single column.
    """
    target = df["totalPrice"].values.reshape(-1, 1)
    features = df.drop(columns = ["totalPrice"]).values
    return features, target

In [None]:
# Feature matrix and response built from the fully encoded frame.
X,y = split_df(l_beijing)
print(X.shape,y.shape)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=193)

### Feature selection using lasso 

In [None]:
# Lasso regression with a fixed penalty strength of 0.005.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the scikit-learn version this notebook is pinned to.
lasso = Lasso(alpha = 0.005, normalize = True)

In [None]:
# Fit the lasso on the training split.
lasso.fit(X_train, y_train)

In [None]:
# Predictions for the held-out rows.
lasso_pred = lasso.predict(X_test)

In [None]:
# R^2 of the lasso on the held-out rows.
lasso.score(X_test, y_test)

In [None]:
# Coefficients used for feature screening. Fit on the training split only, so the
# held-out rows used to score the later models do not influence which features are kept.
lasso_coef = lasso.fit(X_train, y_train).coef_

In [None]:
# Feature names in the same order as the columns of X (every column except the response).
# Derived from the frame rather than a hard-coded count so the plot keeps working when
# the number of encoded columns changes.
names_coef = [x for x in l_beijing.columns if x != "totalPrice"]
# One figure only: calling plt.clf() first would leave an extra empty figure behind.
plt.figure(figsize=(20,10))
plt.plot(range(len(names_coef)), lasso_coef)
plt.xticks(range(len(names_coef)), names_coef, rotation = 90)
# Faint vertical guides make it easier to match a coefficient to its tick label.
for i in range(len(names_coef)):
    plt.axvline(x = i, color = "gray", linestyle='--', alpha = 0.4)
plt.axhline(y = 0, color = "skyblue")
plt.ylabel("Coefficients")
plt.show()

`elevator` and `subway` don't seem to be significant features driving the price of houses

In [None]:
# Positions of the features whose lasso coefficient is non-zero (the "significant" ones).
nonzero_mask = np.abs(lasso_coef) > 0
s_vars = np.nonzero(nonzero_mask)[0].tolist()

Important numerical features

In [None]:
# Print each retained feature with its coefficient. The format string needs two
# placeholders; with a single one the coefficient was silently dropped.
for i in s_vars:
    print("{}: {}".format(names_coef[i],lasso_coef[i]))

In [None]:
# Hand-written list of source columns kept for the reduced models.
s_var_names = ['tradeTime', 'square','communityAverage', 'livingRoom', 'drawingRoom',
               'kitchen','bathRoom', 'buildingType', 'renovationCondition', 'buildingStructure', 
               'elevator','fiveYearsProperty', 'subway','district']
# Number of columns kept.
len(s_var_names)

In [None]:
# Restrict the cleaned frame to the selected columns plus the response.
beijing_subset = beijing.loc[:,s_var_names+['totalPrice']]
print(beijing_subset.shape)

In [None]:
# Fit a second dense one-hot encoder on the reduced feature set and encode it.
cat_mask = beijing_subset.dtypes == object
cat_cols = beijing_subset.columns[cat_mask].tolist()
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old keyword was later
# removed); try the current name first and fall back for older installations.
try:
    r_enc = OneHotEncoder(handle_unknown='ignore', sparse_output = False)
except TypeError:
    r_enc = OneHotEncoder(handle_unknown='ignore', sparse = False)
r_enc.fit(beijing_subset[cat_cols])
r_beijing = encode_df(beijing_subset, r_enc, cat_cols)
print("Encoded data:")
print(r_beijing.shape)

In [None]:
# Feature matrix and response for the reduced feature set (overwrites the earlier X and y).
X, y = split_df(r_beijing)
print(X.shape,y.shape)

In [None]:
# Same 70/30 split and seed as before, now on the reduced feature matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=193)

### Using regularized ridge regression to fit the model

Plot to identify best alpha

In [None]:
def display_plot(cv_scores, cv_scores_std, n_folds=5, alphas=None):
    """Plot the mean cross-validation score against alpha with a standard-error band.

    Parameters
    ----------
    cv_scores : sequence of float
        Mean CV score for each alpha.
    cv_scores_std : sequence of float
        Standard deviation of the fold scores for each alpha.
    n_folds : int, default 5
        Number of CV folds behind each score; the standard error is
        ``std / sqrt(n_folds)``. The default matches the ``cv = 5`` used by the
        alpha search in this notebook.
    alphas : sequence of float, optional
        Alpha values on the x axis. Defaults to the notebook-level ``alpha_space``.
    """
    if alphas is None:
        alphas = alpha_space
    # Work on arrays so the band arithmetic does not rely on list/array coercion.
    cv_scores = np.asarray(cv_scores, dtype=float)
    std_error = np.asarray(cv_scores_std, dtype=float) / np.sqrt(n_folds)

    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.plot(alphas, cv_scores)
    ax.fill_between(alphas, cv_scores + std_error, cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    # Dashed reference line at the best mean score.
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alphas[0], alphas[-1]])
    ax.set_xscale('log')
    plt.show()

In [None]:
# Ridge regression with an initial penalty strength of 0.1.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the scikit-learn version this notebook is pinned to.
ridge = Ridge(alpha = 0.1, normalize = True)

In [None]:
# Fit the ridge model on the training split.
ridge.fit(X_train, y_train)

In [None]:
# Predictions for the held-out rows.
ridge_pred = ridge.predict(X_test)

In [None]:
# R^2 of the initial ridge model on the held-out rows.
ridge.score(X_test, y_test)

In [None]:
# Evaluate 50 log-spaced alphas between 1e-4 and 10 with 5-fold cross-validation.
alpha_space = np.logspace(-4, 1, 50)
ridge_scores = []       # mean CV score per alpha
ridge_scores_std = []   # standard deviation of the fold scores per alpha
for alpha in alpha_space:
    ridge.alpha = alpha
    # Cross-validate on the training split only, so the held-out rows used for the
    # final score play no part in choosing alpha.
    ridge_cv_scores = cross_val_score(ridge, X_train, y_train, cv = 5)
    ridge_scores.append(np.mean(ridge_cv_scores))
    ridge_scores_std.append(np.std(ridge_cv_scores))

In [None]:
# Plot the mean CV score and its error band across the alpha grid.
display_plot(ridge_scores, ridge_scores_std)

In [None]:
# The best alpha is the one with the highest MEAN CV score (`ridge_scores`, one entry per
# alpha). `ridge_cv_scores` only holds the fold scores of the last alpha tried, so
# indexing `alpha_space` with it picks an unrelated value.
print("Suggested alpha: {}".format(float(alpha_space[int(np.argmax(ridge_scores))])))

In [None]:
# Use the alpha with the highest mean CV score. `ridge_scores` has one entry per alpha;
# `ridge_cv_scores` only holds the fold scores of the last alpha tried.
ridge.alpha = float(alpha_space[int(np.argmax(ridge_scores))])

In [None]:
# Refit the ridge model with the selected alpha.
ridge.fit(X_train, y_train)

In [None]:
# Predictions of the tuned ridge model for the held-out rows.
ridge_pred = ridge.predict(X_test)

In [None]:
# R^2 of the tuned ridge model on the held-out rows.
ridge.score(X_test, y_test)

### Using Ensemble method - GradientBoostingRegressor

In [None]:
# Gradient boosting configuration: many shallow-ish trees with a small learning rate.
# `loss` is left at its default (squared error). The explicit 'ls' spelling was removed
# from scikit-learn, while the default means the same thing in every version.
# `random_state` pins the estimator's internal randomness so reruns give the same model.
model_parameters = {
    'n_estimators': 500,
    'max_depth': 6,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'random_state': 193
}
gbReg = GradientBoostingRegressor(**model_parameters)

In [None]:
# Train the gradient boosting model on the training split.
gbReg.fit(X_train, y_train)

In [None]:
# R^2 of the gradient boosting model on the held-out rows.
gbReg.score(X_test, y_test)

## Saving the model

In [None]:
# Persist the trained model and the encoder fitted on the reduced feature set.
filename = 'sklearn_model.sav'
encoder_name = 'one_hot_encoder.sav'
# Context managers guarantee the files are flushed and closed before they are read back.
with open(filename, 'wb') as model_file:
    pickle.dump(gbReg, model_file)
with open(encoder_name, 'wb') as encoder_file:
    pickle.dump(r_enc, encoder_file)

### Loading the model 
Creating a pipeline for price prediction app deployed on Heroku: [house-price-ar32-app](https://house-price-ar32.herokuapp.com/)

In [None]:
# Reload the saved artefacts and confirm the model scores the same on the held-out rows.
# NOTE: unpickling executes arbitrary code — only load files written by this notebook.
with open(filename, 'rb') as model_file:
    mdl = pickle.load(model_file)
with open(encoder_name, 'rb') as encoder_file:
    enc = pickle.load(encoder_file)
mdl.score(X_test, y_test)

Simulating data

In [None]:
# Show the row labelled 1 (selected columns plus the price) as a template for the simulated input.
beijing.loc[1,s_var_names+['totalPrice']]

In [None]:
# A single hand-written listing pushed through the same encode -> split -> predict path
# that the deployed app uses.
# Room counts are integers: the training frame stores these columns as object-typed
# integers, so the encoder's categories are ints. With `handle_unknown='ignore'`, string
# values such as '2' would be treated as unseen and silently encoded as all zeros.
df_dict = {
    'tradeTime':'2016-07-28',
    'square':132.38,
    'communityAverage':71539,
    'livingRoom':2,
    'drawingRoom':2,
    'kitchen':1,
    'bathRoom':2,
    'buildingType':'Tower',
    'renovationCondition':'Hardcover',
    'buildingStructure':'Steel/Concrete',
    'elevator': 'Present',
    'fiveYearsProperty':'Ownership<5y',
    'subway': 'Far',
    'district':'ChaoYang',
    'totalPrice': 530
}
# Mirror the training dtypes (object) for the room-count columns before encoding.
count_cols = ['livingRoom', 'drawingRoom', 'kitchen', 'bathRoom']
df = pd.DataFrame(df_dict, index = [0]).astype({c: object for c in count_cols})
df_enc = encode_df(df, enc, cat_cols)
X, y = split_df(df_enc)
print(X.shape,y.shape)
print("Predicted value : {}\nActual value    : {}".format(mdl.predict(X)[0], y[0][0]))

## EDA 
### for application UX implementation

In [None]:
# Selected columns plus the price, for the app-facing exploration (reuses the name `df`).
df = beijing.loc[:,s_var_names+['totalPrice']]

In [None]:
# Quartiles of the sale price.
print(np.quantile(df["totalPrice"], [0.25,0.5,0.75]))