# Basic linreg model

This notebook assumes you've run `initialisation.ipynb` and `augmentation.ipynb` in that order.

In [30]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from utils.transform_scale import transform_scale_df, TARGET_VARIABLE_COLUMN

# Directory containing the augmented CSVs; relative, so it resolves against the current working directory
DATA_PATH = Path("data")

In [31]:
# Read the augmented train/test CSVs, parsing the "month" column as datetimes
train_augmented, test_augmented = (
    pd.read_csv(DATA_PATH / csv_name, parse_dates=["month"])
    for csv_name in ("train-augmented.csv", "test-augmented.csv")
)

In [32]:
# Preview the first five rows of the augmented training data
train_augmented.head(n=5)

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,...,mean_age_m,std_age_f,std_age_m,pri_sch_dist,pri_sch,sec_sch_dist,sec_sch,mall_dist,mrt_name,mrt_dist
0,2001-08-01,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,...,36.16763,20.331631,19.999478,0.344087,Loyang Primary School,0.428301,Pasir Ris Crest Secondary School,1.033216,Pasir Ris,1.137522
1,2014-10-01,punggol,5 room,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,...,31.967676,20.103889,19.793305,0.160852,Edgefield Primary School,0.312383,Meridian Secondary School,0.80604,Cove,0.118373
2,2020-09-01,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,...,34.164736,20.311337,19.94782,0.184906,Fernvale Primary School,0.55838,Pei Hwa Secondary School,0.452556,Fernvale,0.481153
3,2000-10-01,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,...,40.577282,21.625967,21.440329,0.304561,Pei Tong Primary School,0.619132,Clementi Town Secondary School,0.456499,Clementi,0.42332
4,2013-01-01,bukit batok,3 room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,...,38.318241,20.497124,20.287059,0.233809,Princess Elizabeth Primary School,0.217911,Bukit Batok Secondary School,0.764172,Bukit Batok,0.77422


## Drop non-useful columns 

Lat/lng are not continuous variables useful in linreg (we have already derived the useful features, such as distances, from them), so we can drop lat/lng.

In [33]:
# Separate the target column from the feature columns
y = train_augmented[TARGET_VARIABLE_COLUMN]
X = train_augmented.drop(columns=TARGET_VARIABLE_COLUMN)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Transform and scale both splits (implementation lives in utils/transform_scale.py)
X_train, X_test = transform_scale_df(X_train, X_test)

# Preview the transformed training features
X_train.head()

Unnamed: 0,month,lease_commence_date,floor_area_sqm,elevation,median_storey,distance_to_BN,distance_to_IHL,distance_to_CR,distance_to_IEBP,distance_to_market_hawker,...,sec_sch_Xinmin Secondary School,sec_sch_Yio Chu Kang Secondary School,sec_sch_Yishun Secondary School,sec_sch_Yishun Town Secondary School,sec_sch_Yuan Ching Secondary School,sec_sch_Yuhua Secondary School,sec_sch_Yusof Ishak Secondary School,sec_sch_Yuying Secondary School,sec_sch_Zhenghua Secondary School,sec_sch_Zhonghua Secondary School
424394,24202,23700,-0.594695,2.295101,0.669352,-1.011927,-0.715163,0.121163,0.820218,-0.737495,...,0,0,0,0,0,0,0,0,0,0
120565,24006,23832,1.113372,0.07368,-1.182206,-0.655657,0.259262,0.781332,-1.665585,0.410626,...,0,0,0,0,0,0,0,0,0,0
145559,24198,23640,-1.468591,0.496808,-1.182206,-1.520166,-0.994291,1.135453,-0.973454,-0.785118,...,0,0,0,0,0,0,0,0,0,0
132809,24108,23844,-0.952198,0.60259,0.052166,0.497555,0.087701,-0.070879,0.758776,1.643412,...,0,0,0,0,0,0,0,0,0,0
32978,24110,23952,-0.51525,1.025718,-1.182206,1.602925,1.390263,-1.183845,-0.121179,-0.081635,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Fit a linear regression on the training split (fit() returns the fitted estimator),
# then predict the target for the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [35]:
# Score the held-out predictions with MSE, MAE and R-squared,
# printing each metric to two decimal places in that order
for label_format, metric_fn in (
    ("Mean squared error: %.2f", mean_squared_error),
    ("Mean absolute error: %.2f", mean_absolute_error),
    ("R-squared: %.2f", r2_score),
):
    print(label_format % metric_fn(y_test, y_pred))


Mean squared error: 2664866570.40
Mean absolute error: 40229.30
R-squared: 0.84
