In [1]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('input/train.csv')

### train_test_split

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train, valid = train_test_split(data, test_size=0.3, random_state=100)

In [5]:
train.shape, valid.shape

((7000, 20), (3000, 20))

### Prepare data

In [6]:
def fillna_life_square(df, source_df):
    df['LifeSquare'] = df['LifeSquare'].fillna(source_df['LifeSquare'].mean())
    return df

In [7]:
def prepare_square(df, col):
    df.loc[df[col] < 15, col] = 15
    df.loc[df[col] > 300, col] = 300
    return df

In [8]:
def prepare_rooms(df):
    df.loc[df['Rooms'] > 5, 'Rooms'] = 5
    return df

In [9]:
def prepare_df(df, source_df):
    df = fillna_life_square(df, source_df)
    df = prepare_square(df, 'Square')
    df = prepare_square(df, 'LifeSquare')
    df = prepare_rooms(df)
    df['Square_2'] = df['Square']**2
    return df

In [10]:
train = prepare_df(train, train)

In [11]:
valid = prepare_df(valid, train)

### Model

In [12]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price', 'Square_2'],
      dtype='object')

In [13]:
feats = ['Rooms', 'Square', 'LifeSquare', 'Floor']

In [14]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RF

In [15]:
model = RF(n_estimators=100, max_depth=12)

In [16]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
pred_train = model.predict(train.loc[:, feats])

In [18]:
pred_train.shape

(7000,)

In [19]:
pred_train

array([231135.85822597, 170672.905171  , 237179.870862  , ...,
       241441.31879714, 176888.22405451, 150514.02550495])

In [20]:
pred_valid = model.predict(valid.loc[:, feats])

In [21]:
pred_valid.shape

(3000,)

In [22]:
pred_valid

array([197263.16838776, 220909.79199212, 204267.41043751, ...,
       195028.72491097, 280411.89221908, 138696.09899197])

### Evaluate

In [23]:
from sklearn.metrics import r2_score as r2

In [24]:
r2(train['Price'], pred_train)

0.7548930268923625

In [25]:
r2(valid['Price'], pred_valid)

0.40136594299364137

### Test

In [26]:
test = pd.read_csv('input/test.csv')

In [27]:
test.shape

(5000, 19)

In [28]:
test = prepare_df(df=test, source_df=train)

In [29]:
test.shape

(5000, 20)

In [30]:
test.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Square_2'],
      dtype='object')

In [31]:
test['Price'] = model.predict(test.loc[:, feats])

In [32]:
test.loc[:, ['Id', 'Price']].to_csv('SShirkin_predictions.csv', index=None)