In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
from sklearn import preprocessing 
import torch.utils.data as utils
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import torchvision

# Loading Files and droping ID column from train

In [2]:
train_dataset = pd.read_csv('./dataset/kaggle/train.csv',sep=',')

test_dataset = pd.read_csv('./dataset/kaggle/test.csv',sep=',')
test_dataset = test_dataset.drop('Id',axis=1)
label = train_dataset.SalePrice
train_dataset = train_dataset.drop(['SalePrice','Id'],axis=1)

In [3]:
train_dataset.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# Replace Categorical value with NA

In [4]:
def changeNanCategorical(dataset):
    categorical = ['Alley','FireplaceQu','Fence','MiscFeature','PoolQC','GarageQual','GarageFinish','GarageType','GarageCond','BsmtQual','BsmtCond','BsmtFinType1','BsmtFinType2','BsmtExposure']
    for col in categorical:
        dataset[col].fillna('NA', inplace=True)
    return dataset


# Replace Numerical value with the most frequent value.

In [5]:
def changeNanNumerical(dataset,y):
    for i in range(y):
        #print(test_dataset.isna().sum().sort_values()[-18:-1].to_frame().index[i])
        col = dataset.isna().sum().sort_values()[-(y+2):-1].to_frame().index[i]
        dataset[col].fillna(dataset[col].value_counts().to_frame().index[0],inplace=True)
    return dataset

# TestData replacing values

In [6]:
test_dataset = changeNanCategorical(test_dataset)
test_dataset = changeNanNumerical(test_dataset,78)
test_dataset.LotFrontage.fillna(test_dataset.LotFrontage.value_counts().to_frame().index[0],inplace=True)

# TrainData replacing values

In [7]:
train_dataset = changeNanCategorical(train_dataset)
train_dataset = changeNanNumerical(train_dataset,78)
train_dataset.LotFrontage.fillna(train_dataset.LotFrontage.value_counts().to_frame().index[0],inplace=True)

# Get categorical and numerical column

In [8]:
def getCategoricalColumn(dataset):
    column = []
    for col in dataset.columns:
        if isinstance(dataset[col].unique()[0], str):
            column.append(col)
    return column
categoricalColum = getCategoricalColumn(train_dataset)
num_cols = train_dataset._get_numeric_data().columns

# Create Dummy dataframe 

In [9]:
X_train_num = pd.DataFrame()
X_train_Cat = pd.DataFrame()
X_test_num = pd.DataFrame()
X_test_Cat = pd.DataFrame()

# Add Numerical Column

In [10]:
for i in num_cols:
    X_train_num[i] = train_dataset[i]
    X_test_num[i] = test_dataset[i]

# Normalise the Numerical data

In [11]:
X_train_num = (X_train_num-X_train_num.mean())/X_train_num.std()
X_test_num = (X_test_num-X_test_num.mean())/X_test_num.std()

In [12]:
# Add Categorical Column

In [13]:
for i in categoricalColum:
    X_train_Cat[i] = train_dataset[i]
    X_test_Cat[i] = test_dataset[i]

# One Hot Encoding on Test and Train Data

In [14]:
def changeStrColInOneHotEncoding(dataset):
    for col in dataset.columns:
        if isinstance(dataset[col].unique()[0], str):
            # Get one hot encoding of columns B
            one_hot = pd.get_dummies(dataset[col],prefix=str(col+"_"))
            # Drop column B as it is now encoded
            dataset = dataset.drop(col,axis = 1)
            # Join the encoded df
            dataset = dataset.join(one_hot)
    return dataset


X_train_Cat = changeStrColInOneHotEncoding(X_train_Cat)
X_test_Cat = changeStrColInOneHotEncoding(X_test_Cat)

# Concat Categorical and numerical column

In [15]:
X_train = pd.concat([X_train_Cat, X_train_num], axis=1)
X_test = pd.concat([X_test_Cat, X_test_num], axis=1)

# Add column in Test data which are not present from Train

In [16]:
for i in  X_train.columns:
    if i not in X_test.columns:
        ind = X_train.columns.get_loc(i)
        X_test.insert(ind,i,0)

In [17]:
X_train.isna().sum().any()

False

In [18]:
X_train = X_train.values
X_test = X_test.values
label = label.values.astype(np.float)

In [19]:
X_train.shape

(1460, 302)

# Sklearn MLP

In [20]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(hidden_layer_sizes=(100,50,10), activation='relu', solver='adam',early_stopping=True, max_iter=1000,validation_fraction=0.1)
mlp.fit(X_train ,label)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=True, epsilon=1e-08,
             hidden_layer_sizes=(100, 50, 10), learning_rate='constant',
             learning_rate_init=0.001, max_iter=1000, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

# Predict

In [21]:
predict_test = mlp.predict(X_test)

In [24]:
predict_test.max()

651529.3671149921

# Save in CSV

In [23]:
test_df = pd.DataFrame(columns=['Id', 'SalePrice'])

test_dataset1 = pd.read_csv('./dataset/kaggle/test.csv',sep=',')
test_df['Id'] = test_dataset1.Id

test_df['SalePrice'] = predict_test

test_df.to_csv("submission.csv",index=False)