In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column when a frame is displayed (pandas truncates wide frames by default).
pd.set_option('display.max_columns', None)

# Load the raw listings; the file is read as ISO-8859-1.
df1 = pd.read_csv('../input/lianjia/new.csv', encoding="ISO-8859-1")
df1.head()

We inspect the data type of each feature to see which ones need to be converted.
For example, common sense says the number of rooms should be numeric, not object.

In [None]:
# Show the dtype pandas inferred for every column of the raw frame.
df1.dtypes

Drop columns that are irrelevant for house price prediction

We also drop the price column: it is just totalPrice/square, so it is fully determined by the target (totalPrice) and a feature (square) that are already in the dataset

In [None]:
# Copy the raw frame and remove the identifier/URL columns plus 'price'
# (the surrounding notes explain that price is derived from totalPrice and square).
df2 = df1.copy().drop(columns=['id', 'url', 'Cid', 'price'])

Remove outliers for better accuracy and for normalization later on

In [None]:
df3 = df2.copy()

# Three-sigma rule on the target: keep rows whose totalPrice lies within
# mean +/- 3 standard deviations (bounds inclusive).
price_mean = df3['totalPrice'].mean()
price_std = df3['totalPrice'].std()
upper_limit = price_mean + 3*price_std
lower_limit = price_mean - 3*price_std

df3 = df3[df3['totalPrice'].between(lower_limit, upper_limit)]

# Shapes before and after filtering, to see how many rows were removed.
print(df2.shape)
print(df3.shape)

Only get the trade year from tradeTime column

In [None]:
# Work on an independent (deep) copy so df3 is left untouched by this step.
df4 = df3.copy(deep=True)

def tradeTimeMod(x):
    """Return the first four characters of ``x``.

    Used on trade-date strings to keep only the leading year part;
    shorter inputs are returned unchanged.
    """
    return x[:4]

# Replace every tradeTime value with what tradeTimeMod returns for it (element-wise).
df4['tradeTime'] = df4['tradeTime'].map(tradeTimeMod)

df4.head()

Convert some columns to the right data type

In [None]:
df5 = df4.copy()

# tradeTime was reduced to a year string in the previous step; convert it strictly
# so an unexpected value raises instead of silently becoming NaN.
df5['tradeTime'] = pd.to_numeric(df5['tradeTime'])

# Convert each column with a single vectorised pd.to_numeric call rather than
# Series.apply, which would invoke pd.to_numeric once per element.
# errors='coerce' turns entries that cannot be parsed as numbers into NaN.
for col in ['livingRoom', 'drawingRoom', 'bathRoom', 'constructionTime']:
    df5[col] = pd.to_numeric(df5[col], errors='coerce')

Split the floor column into floorType and floorHeight

In [None]:
# Work on an independent (deep) copy so df5 is left untouched by this step.
df6 = df5.copy(deep=True)

def floorType(x):
    """Return the part of ``x`` that precedes the first space.

    If ``x`` contains no space, the whole string is returned.
    """
    head, _sep, _rest = x.partition(' ')
    return head
def floorHeight(x):
    """Return the second space-separated token of ``x`` as an int, or NaN.

    NaN is returned when the value cannot be parsed: ``x`` is not a string,
    it contains no space, or the second token is not an integer.
    Only those parse failures are caught; unrelated exceptions such as
    KeyboardInterrupt propagate instead of being silently turned into NaN.
    """
    try:
        return int(x.split(' ')[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError/TypeError: x is not a str (e.g. a missing value)
        # IndexError: there is no second token
        # ValueError: the second token is not an integer
        return np.nan
    
# Derive floorType and floorHeight from the raw 'floor' column (appended as the
# last two columns, in that order), then discard 'floor' itself.
floor_raw = df6['floor']
df6['floorType'] = floor_raw.map(floorType)
df6['floorHeight'] = floor_raw.map(floorHeight)

df6 = df6.drop('floor', axis=1)

One hot encoding for features we consider nominal data

In [None]:
df7 = df6.copy()

# Keep only rows whose buildingType is at least 1; rows where it is NaN fail
# the comparison and are removed as well.
df7 = df7.loc[df7['buildingType'].ge(1)]

# Expand each nominal column into one indicator column per distinct value.
cols_to_get_dummies = [
    'buildingType',
    'renovationCondition',
    'buildingStructure',
    'elevator',
    'district',
    'floorType',
]
df7 = pd.get_dummies(df7, columns=cols_to_get_dummies)

In [None]:
# Drop every row that still has at least one missing value.
df8 = df7.copy().dropna(axis=0, how='any')

# Final shape versus the raw frame's shape, to see how much data survived cleaning.
for frame in (df8, df1):
    print(frame.shape)

In [None]:
# Target: totalPrice. Features: every other column of the cleaned frame.
y = df8.loc[:, 'totalPrice']
X = df8.drop('totalPrice', axis='columns')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

Normalize the columns of features we consider continuous data

In [None]:
cols_to_norm = ['Lng','Lat','DOM','followers','square','livingRoom','drawingRoom','kitchen','bathRoom',
                'ladderRatio','fiveYearsProperty','subway','communityAverage','floorHeight']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# train_test_split hands back subsets of X; depending on the pandas/scikit-learn
# versions, assigning columns into them can raise SettingWithCopyWarning.
# Taking explicit copies makes it unambiguous that we own the frames we modify.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then reuse the fitted min/max for the
# test rows so no information from the test set enters the scaling.
X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])

X_train.head()

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Print the score on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lr_model.score(features, target))

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with scikit-learn's default settings.
ridge_model = Ridge().fit(X_train, y_train)

# Print the score on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge_model.score(features, target))

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with scikit-learn's default settings.
lasso_model = Lasso().fit(X_train, y_train)

# Print the score on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso_model.score(features, target))

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree: scikit-learn permutes the candidate features at every split, so
# without a fixed random_state the fitted tree (and the scores below) can change
# from run to run. The seed matches the one used for the train/test split.
dtr_model = DecisionTreeRegressor(random_state=69)
dtr_model.fit(X_train, y_train)

# Score on the training split, then on the held-out split.
print(dtr_model.score(X_train, y_train))
print(dtr_model.score(X_test, y_test))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: bootstrap sampling and per-split feature selection are random,
# so without a fixed random_state the scores below differ on every run.
# The seed matches the one used for the train/test split.
rfg_model = RandomForestRegressor(random_state=69)
rfg_model.fit(X_train, y_train)

# Score on the training split, then on the held-out split.
print(rfg_model.score(X_train, y_train))
print(rfg_model.score(X_test, y_test))