In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [6]:
idtrain = pd.read_csv('./Project 01/housing_train.csv')
idtest = pd.read_csv("./Project 01/housing_test.csv")

In [7]:
print(idtrain.shape)
print(idtest.shape)

(7536, 16)
(1885, 15)


In [34]:
idtrain.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Brunswick,52 Evans St,3,h,1650000,S,Nelson,5.2,3056,3.0,1.0,2.0,495.0,141.0,1920.0,Moreland
1,Reservoir,85 Radford Rd,5,h,791000,S,Ray,11.2,3073,4.0,3.0,1.0,961.0,,,Darebin
2,Newport,99 Anderson St,3,h,785000,S,RT,8.4,3015,3.0,1.0,1.0,185.0,,,Hobsons Bay
3,Brighton East,4/377 South Rd,2,u,755000,SP,Buxton,10.7,3187,,,,,,,
4,Hawthorn East,3 Jaques St,5,h,2500000,VB,RT,7.5,3123,5.0,3.0,3.0,757.0,240.0,1925.0,Boroondara


In [35]:
 for i in idtrain.columns:
     print(i,idtrain[i].dtype, idtrain[i].isnull().sum())

print("*"*30)
 for i in idtest.columns:
     print(i,idtest[i].dtype, idtest[i].isnull().sum())     

     

Suburb object 0
Address object 0
Rooms int64 0
Type object 0
Price int64 0
Method object 0
SellerG object 0
Distance float64 0
Postcode int64 0
Bedroom2 float64 1559
Bathroom float64 1559
Car float64 1559
Landsize float64 1564
BuildingArea float64 4209
YearBuilt float64 3717
CouncilArea object 1564
******************************
Suburb object 0
Address object 0
Rooms int64 0
Type object 0
Method object 0
SellerG object 0
Distance float64 0
Postcode int64 0
Bedroom2 float64 419
Bathroom float64 419
Car float64 419
Landsize float64 421
BuildingArea float64 1060
YearBuilt float64 943
CouncilArea object 421


In [36]:
## councilarea is of object type and has too many null values
## lets delete this column from both test and train dataset

X_train = idtrain.drop(["CouncilArea"],axis=1)
X_valid = idtest.drop("CouncilArea",axis=1)

In [37]:
object_cols = [col for col in X_train.columns if X_train[col].dtype=='object']
print(set(object_cols))

{'Suburb', 'Method', 'Type', 'SellerG', 'Address'}


In [38]:
good_label_cols = [col for col in object_cols if set(X_valid[col]).issubset(set(X_train[col]))]
good_label_cols

['Suburb', 'Type', 'Method']

In [39]:
bad_label_cols = list(set(object_cols) - set(good_label_cols))
bad_label_cols

['Address', 'SellerG']

In [40]:
from sklearn.preprocessing import OrdinalEncoder

# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply ordinal encoder 
# Your code here

ord_train_data = X_train[good_label_cols].copy()
ord_valid_data = X_valid[good_label_cols].copy()

ordinal_encoder = OrdinalEncoder()
ord_train_data = pd.DataFrame(ordinal_encoder.fit_transform(ord_train_data))
ord_valid_data = pd.DataFrame(ordinal_encoder.transform(ord_valid_data))

ord_train_data.index = X_train.index
ord_valid_data.index = X_valid.index
    
label_X_train.drop(good_label_cols,axis=1,inplace=True)
label_X_valid.drop(good_label_cols,axis=1,inplace=True)

label_X_train = pd.concat([label_X_train,ord_train_data],axis=1)
label_X_valid = pd.concat([label_X_valid,ord_valid_data],axis=1)

    


In [41]:
 for i in label_X_train.columns:
     print(i,label_X_train[i].dtype, label_X_train[i].isnull().sum())

print("*"*30)
 for i in label_X_valid.columns:
     print(i,label_X_valid[i].dtype, label_X_valid[i].isnull().sum())     

     

Rooms int64 0
Price int64 0
Distance float64 0
Postcode int64 0
Bedroom2 float64 1559
Bathroom float64 1559
Car float64 1559
Landsize float64 1564
BuildingArea float64 4209
YearBuilt float64 3717
0 float64 0
1 float64 0
2 float64 0
******************************
Rooms int64 0
Distance float64 0
Postcode int64 0
Bedroom2 float64 419
Bathroom float64 419
Car float64 419
Landsize float64 421
BuildingArea float64 1060
YearBuilt float64 943
0 float64 0
1 float64 0
2 float64 0


In [42]:
X_full = label_X_train.copy()
X_test_full = label_X_valid.copy()

In [45]:
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

In [47]:
cols_with_missing_values = [col for col in X.columns if X[col].isnull().any()]
print(cols_with_missing_values)

cols_with_all = [ col for col in X.columns if not  X[col].isnull().any()]
cols_with_all

['Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt']


['Rooms', 'Price', 'Distance', 'Postcode', 0, 1, 2]

In [48]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


def nan_predictor_train(col):
    cols_with_all = [col for col in X.columns if not X[col].isnull().any()]
    cols_used = cols_with_all[:]
    cols_used.append(col)
    data = X[cols_used]
    print(f'before deleting nan values: {data.shape}')
    data_train = data.dropna(axis=0)
    print(f'after deleting nan values: {data_train.shape}')
    target_train = data_train[col]
    features_train = data_train.drop([col],axis=1)
    
    rf_model = RandomForestRegressor(n_estimators=500, random_state=0)
    rf_model.fit(features_train, target_train)
    gmb_model = GradientBoostingRegressor(n_estimators=500, random_state=0)
    gmb_model.fit(features_train, target_train)
    
    features_test = data[cols_with_all]
    pred1 = rf_model.predict(features_test)
    pred2 = gmb_model.predict(features_test)
    pred = (pred1 + pred2) / 2
    X[col] = pred


In [49]:
for col in cols_with_missing_value:
    nan_predictor_train(col)

before deleting nan values: (7536, 8)
after deleting nan values: (5977, 8)
before deleting nan values: (7536, 9)
after deleting nan values: (5977, 9)
before deleting nan values: (7536, 10)
after deleting nan values: (5977, 10)
before deleting nan values: (7536, 11)
after deleting nan values: (5972, 11)
before deleting nan values: (7536, 12)
after deleting nan values: (3327, 12)
before deleting nan values: (7536, 13)
after deleting nan values: (3819, 13)


In [50]:
cols_with_nan = [col for col in X_test.columns if X_test[col].isnull().any()]
print(cols_with_nan)
cols_with_all = [col for col in X_test.columns if not X_test[col].isnull().any()]
print(cols_with_all)

['Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt']
['Rooms', 'Distance', 'Postcode', 0, 1, 2]


In [51]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


def nan_predictor_test(col):
    cols_with_all = [col for col in X_test.columns if not X_test[col].isnull().any()]
    cols_used = cols_with_all[:]
    cols_used.append(col)
    data = X_test[cols_used]
    print(f'before deleting nan values: {data.shape}')
    data_train = data.dropna(axis=0)
    print(f'after deleting nan values: {data_train.shape}')
    target_train = data_train[col]
    features_train = data_train.drop([col],axis=1)
    
    rf_model = RandomForestRegressor(n_estimators=500, random_state=0)
    rf_model.fit(features_train, target_train)
    gmb_model = GradientBoostingRegressor(n_estimators=500, random_state=0)
    gmb_model.fit(features_train, target_train)
    
    features_test = data[cols_with_all]
    pred1 = rf_model.predict(features_test)
    pred2 = gmb_model.predict(features_test)
    pred = (pred1 + pred2) / 2
    X_test[col] = pred


In [52]:
for col in ['Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt']:
    nan_predictor_test(col)

before deleting nan values: (1885, 7)
after deleting nan values: (1466, 7)
before deleting nan values: (1885, 8)
after deleting nan values: (1466, 8)
before deleting nan values: (1885, 9)
after deleting nan values: (1466, 9)
before deleting nan values: (1885, 10)
after deleting nan values: (1464, 10)
before deleting nan values: (1885, 11)
after deleting nan values: (825, 11)
before deleting nan values: (1885, 12)
after deleting nan values: (942, 12)


In [62]:
print(X.shape)
X.isnull().sum()

(7536, 13)


Rooms           0
Price           0
Distance        0
Postcode        0
Bedroom2        0
Bathroom        0
Car             0
Landsize        0
BuildingArea    0
YearBuilt       0
0               0
1               0
2               0
dtype: int64

In [61]:
print(X_test.shape)
X_test.isnull().sum()

(1885, 12)


Rooms           0
Distance        0
Postcode        0
Bedroom2        0
Bathroom        0
Car             0
Landsize        0
BuildingArea    0
YearBuilt       0
0               0
1               0
2               0
dtype: int64

In [63]:
## So now no null values
## lets go on model building

In [64]:
train_target = X.Price
train_features = X.drop(['Price'], axis=1)

In [68]:
X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_target, train_size=0.8, test_size=0.2,random_state=0)

In [69]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

In [70]:
def final_pred(n):
    rf_model = RandomForestRegressor(random_state=1, n_estimators=n)
    rf_model.fit(X_train,y_train)
    rf_pred = rf_model.predict(X_valid)

    gbm_model = GradientBoostingRegressor(random_state=1, n_estimators=n)
    gbm_model.fit(X_train,y_train)
    gbm_pred = gbm_model.predict(X_valid)

    pred = (rf_pred + gbm_pred)/2
    return pred

In [71]:
for n in [500]:
    pred = final_pred(n)
    print(212467/(np.sqrt(mean_squared_error(y_valid,pred))))

0.9715801612572973


In [72]:
rf_model = RandomForestRegressor(random_state=1, n_estimators=700)
rf_model.fit(X_train,y_train)
rf_pred = rf_model.predict(X_test)

gbm_model = GradientBoostingRegressor(random_state=1, n_estimators=700)
gbm_model.fit(X_train,y_train)
gbm_pred = gbm_model.predict(X_test)

pred = (rf_pred + gbm_pred)/2

In [74]:
len(pred)

1885

In [75]:
len(idtest)

1885

In [77]:
pred = pd.DataFrame(pred)
pred.to_csv('Pooja_bharadiya_P1_part2.csv', index=False)