# Notebook 2: Modelling, Conclusion and Recommendations

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Read the cleaned training data from CSV and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
train_csv_path = "/Users/eugenia/Desktop/DSI-SG-39/project_2/cleaned_train_data.csv"
clean = pd.read_csv(train_csv_path, low_memory=False)
clean.head()

Unnamed: 0,id,tranc_yearmonth,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,...,vacancy,pri_sch_affiliation,pri_sch_latitude,pri_sch_longitude,sec_sch_nearest_dist,sec_sch_name,cutoff_point,affiliation,sec_sch_latitude,sec_sch_longitude
0,88471,2016-05,KALLANG/WHAMPOA,4 ROOM,3B,UPP BOON KENG RD,10 TO 12,90.0,Model A,2006,...,78,1,1.317659,103.882504,1138.633422,Geylang Methodist School,224,0,1.317659,103.882504
1,122598,2012-07,BISHAN,5 ROOM,153,BISHAN ST 13,07 TO 09,130.0,Improved,1987,...,45,1,1.349783,103.854529,447.894399,Kuo Chuan Presbyterian Secondary School,232,0,1.35011,103.854892
2,170897,2013-07,BUKIT BATOK,EXECUTIVE,289B,BT BATOK ST 25,13 TO 15,144.0,Apartment,1997,...,39,0,1.345245,103.756265,180.074558,Yusof Ishak Secondary School,188,0,1.342334,103.760013
3,86070,2012-04,BISHAN,4 ROOM,232,BISHAN ST 22,01 TO 05,103.0,Model A,1992,...,20,1,1.354789,103.844934,389.515528,Catholic High School,253,1,1.354789,103.844934
4,153632,2017-12,YISHUN,4 ROOM,876,YISHUN ST 81,01 TO 03,83.0,Simplified,1987,...,74,0,1.41628,103.838798,312.025435,Orchid Park Secondary School,208,0,1.414888,103.838335


## 5. Pre-processing

### a. Assemble our predictor variables (X) and our target (y)

> **Note:** The cell below is new code.

In [3]:
# Build the town_proxy feature: the sum of squared latitude/longitude offsets
# of each flat from a fixed central reference point.

# Reference coordinates: the Singapore River, used as a stand-in for the centre of Singapore
town_lat = 1.290160
town_long = 103.852000

# Per-axis offset of every flat from the reference point
lat_offset = clean['latitude'] - town_lat
long_offset = clean['longitude'] - town_long

# Store the summed squared offsets as the new town_proxy column
clean['town_proxy'] = (lat_offset ** 2) + (long_offset ** 2)

> **Note:** Please add `id` to the X variables.

In [4]:
# Predictor columns used by the model, kept in a single named list.
# NOTE(review): 'id' is included as a predictor on request; it looks like a row
# identifier rather than a property of the flat — confirm it should be modelled.
feature_cols = [
    'id',
    'floor_area_sqm',
    'tranc_year',
    'mid_storey',
    'hdb_age',
    'max_floor_lvl',
    'year_completed',
    'total_dwelling_units',
    '4room_sold',
    '5room_sold',
    'exec_sold',
    'latitude',
    'longitude',
    'mall_nearest_distance',
    'mall_within_2km',
    'hawker_within_2km',
    'hawker_nearest_distance',
    'hawker_food_stalls',
    'hawker_market_stalls',
    'mrt_nearest_distance',
    'bus_stop_nearest_distance',
    'pri_sch_nearest_distance',
    'vacancy',
    'sec_sch_nearest_dist',
    'cutoff_point',
    'town_proxy',
]

# X: the feature matrix, restricted to the selected columns in the order listed above
X = clean[feature_cols]

In [5]:
# y: the regression target, taken from the resale_price column
y = clean.loc[:, 'resale_price']

### b. Train-test-split

In [6]:
# Split features and target into train and test parts.
# The fixed random_state makes the split reproducible across runs.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## 6. Modelling

### a. Cross-validation

In [7]:
# Fit a linear regression on the training split; fit() returns the estimator itself
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [8]:
# Print the model's score on the training split first, then on the test split
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lr.score(features, target))

0.8336578011573608
0.832252940128641


In [9]:
# Mean cross-validated score of the linear model on the test split (default cv setting)
test_cv_scores = cross_val_score(lr, X_test, y_test)
test_cv_scores.mean()

0.8320191182566898

In [10]:
# 5-fold cross-validation of the linear model on the train and test splits.
# Every cross_val_score call refits the model once per fold, so the training
# scores are computed a single time and reused for both the per-fold printout
# and the mean.
train_cv_scores = cross_val_score(lr, X_train, y_train, cv=5)
test_cv_scores = cross_val_score(lr, X_test, y_test, cv=5)

print("Cross-validation scores: {}".format(train_cv_scores))
print("Baseline train score (mean): {}".format(train_cv_scores.mean()))
print("Baseline test score (mean): {}".format(test_cv_scores.mean()))

Cross-validation scores: [0.82991912 0.83419811 0.83440242 0.83692832 0.83238096]
Baseline train score (mean): 0.8335657884083826
Baseline test score (mean): 0.8320191182566898


### b. Linear Regression (LR)

In [None]:
# Model predictions for the training split and the test split, in that order
lr_train_pred, lr_test_pred = (lr.predict(split) for split in (X_train, X_test))

In [None]:
# Score and RMSE of the fitted linear model on the train and test splits.
# RMSE is taken as the square root of the MSE rather than via `squared=False`,
# which scikit-learn deprecated in 1.4 and later removed; this form works on
# old and new versions alike. Arguments follow the documented
# (y_true, y_pred) order.
lr_train_rmse = np.sqrt(metrics.mean_squared_error(y_train, lr_train_pred))
lr_test_rmse = np.sqrt(metrics.mean_squared_error(y_test, lr_test_pred))

print("LR train score: {}".format(lr.score(X_train, y_train)))
print("LR test score: {}".format(lr.score(X_test, y_test)))
print("LR train RMSE: {}".format(lr_train_rmse))
print("LR test RMSE: {}".format(lr_test_rmse))

In [None]:
# Check coefficients: display the fitted coefficient array of the linear model.
# The values are unlabeled; presumably they follow the column order of X_train — verify before interpreting.
lr.coef_

In [None]:
# Scatter plot showing the relationship between observed and predicted values
# on the test split.

# Apply the seaborn theme before the figure is created so it styles this plot
sns.set_theme()

fig, ax = plt.subplots(figsize=(7, 7))
sns.scatterplot(x=y_test, y=lr_test_pred, alpha=0.5, ax=ax)

# Line showing perfect predictions: the y = x diagonal. Both endpoints are
# derived from the data range so the line lies exactly on the diagonal and
# spans every plotted point.
axis_lo = min(y_test.min(), lr_test_pred.min())
axis_hi = max(y_test.max(), lr_test_pred.max())
ax.plot([axis_lo, axis_hi], [axis_lo, axis_hi], linestyle='-', color='red')

ax.set_title('Linear Regression', fontsize=15)
ax.set_xlabel('True Values', fontsize=10)
ax.set_ylabel('Predicted Values', fontsize=10);

## Kaggle

In [11]:
# Read the Kaggle test set from CSV.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
test_csv_path = '/Users/eugenia/Desktop/DSI-SG-39/project_2/datasets/test.csv'
test = pd.read_csv(test_csv_path, low_memory=False)

In [12]:
# Normalise the column names to lower case
test.columns = test.columns.str.lower()

# Missing mall / hawker counts within 500m, 1km and 2km are replaced with 0
proximity_cols = ['mall_within_500m', 'mall_within_1km', 'mall_within_2km',
                  'hawker_within_500m', 'hawker_within_1km', 'hawker_within_2km']
test[proximity_cols] = test[proximity_cols].replace(np.nan, 0)

# Mean imputation for the nearest-mall distance, assuming the gaps are MCAR
# (missing completely at random). 'na' markers are converted to NaN first, and
# the fill value is the mean of the same column in the training data (`clean`).
mall_dist = test['mall_nearest_distance'].replace('na', np.nan)
mean_dist = clean['mall_nearest_distance'].mean()
test['mall_nearest_distance'] = mall_dist.fillna(mean_dist)

In [13]:
# Build the town_proxy feature for the Kaggle test set with the same formula
# used on the training data: the sum of squared latitude/longitude offsets
# from a fixed central reference point.

# Reference coordinates (same values as used for the training data)
town_lat = 1.290160
town_long = 103.852000

# Create a new column for town_proxy.
# The result is kept as float: the squared coordinate offsets are fractions far
# below 1, so casting to int would truncate every value to 0 and hand the model
# a constant feature that does not match the float feature it was trained on.
test['town_proxy'] = ((test['latitude'] - town_lat) ** 2) + ((test['longitude'] - town_long) ** 2)

In [14]:
# Columns fed to the model for the Kaggle predictions, kept in a single named list.
# They are listed in the same order as the training feature matrix.
kaggle_feature_cols = [
    'id',
    'floor_area_sqm',
    'tranc_year',
    'mid_storey',
    'hdb_age',
    'max_floor_lvl',
    'year_completed',
    'total_dwelling_units',
    '4room_sold',
    '5room_sold',
    'exec_sold',
    'latitude',
    'longitude',
    'mall_nearest_distance',
    'mall_within_2km',
    'hawker_within_2km',
    'hawker_nearest_distance',
    'hawker_food_stalls',
    'hawker_market_stalls',
    'mrt_nearest_distance',
    'bus_stop_nearest_distance',
    'pri_sch_nearest_distance',
    'vacancy',
    'sec_sch_nearest_dist',
    'cutoff_point',
    'town_proxy',
]

# X_ktest: the Kaggle feature matrix, restricted to the columns above
X_ktest = test[kaggle_feature_cols]

In [15]:
# Make predictions: run the fitted linear model on the Kaggle feature matrix.
# y_pred holds one predicted value per row of X_ktest.
y_pred = lr.predict(X_ktest)

In [17]:
# Assemble the submission table: the test ids alongside the model's predictions
submission_columns = {'Id': test['id'], 'Predicted': y_pred}
submission_df = pd.DataFrame(submission_columns)

# Write the submission to CSV without the DataFrame index
submission_path = 'kaggle_39sir.csv'
submission_df.to_csv(submission_path, index=False)