# Ideas:

* We can use first differences for macro feature selection. This should work. We can use random forests as a feature selection mechanism.

* An extra macro variable characterizing market conditions is the number of apartments per time unit (assuming their apartment database is representative of the market). This is basic economics.

* Train a classifier that can detect oddly priced apartments (1 mio or 2 mio)

# Preprocessing Apartment Features

This notebook takes raw sberbank datasets and produces clean training set and test set. This notebook covers the following aspects:
* Data type conversion
* Feature cleaning
* Outlier detection
* Removal of obsolete features
* Merging datasets

In [1]:
import multiprocessing
multiprocessing.cpu_count()

4

Load libraries:

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import label_binarize
from datetime import datetime as dt
import matplotlib.pyplot as plt
from IPython.display import clear_output
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

Load raw data:

In [3]:
df_train = pd.read_csv('~/Desktop/sberbank/train.csv')
df_test = pd.read_csv('~/Desktop/sberbank/test.csv')

## Useful functions

Check if columns match

In [4]:
def check_columns(A, B):
    return dict(left_only = set(A.columns) - set(B.columns), right_only = set(B.columns)-set(A.columns))

Define column types

In [5]:
def make_summary_df(df):
    
    """This function takes a pandas dataframe as an input and outputs a summary data frame. This dataframe can
    then be used to fill appropriate data types. """
    
    summary_dict = dict(var_name=[], n_unique=[], sample_val=[], data_type=[])

    for i, col in enumerate(df.columns):

        summary_dict['var_name'].append(col)
        summary_dict['n_unique'].append(len(np.unique(df[col].dropna() )) )
        summary_dict['sample_val'].append( reduce(lambda x,y: x+y, [np.random.choice(df[col], size=10)]) )
        summary_dict['data_type'].append( str(df.dtypes[i].name) )
        
    df_out = pd.DataFrame(summary_dict)
    return df_out

## Change data types, drop ID variables, assign Xy

Assign X,y:

In [6]:
X_train = df_train.drop(['id', 'price_doc'], axis=1).copy()
y_train = df_train.price_doc.copy()
X_test  = df_test.drop('id', axis=1).copy()

check_columns(X_train, X_test)

{'left_only': set(), 'right_only': set()}

In [7]:
#summary_X_train = make_summary_df(X_train)
#summary_X_train.to_csv('summary_X_train.csv', sep=';')

Assign column types (target assignment edited in the csv file):

In [8]:
summary_X_train = pd.read_csv('summary_X_train.csv')

In [9]:
for i, var in enumerate(summary_X_train.var_name.values):
    X_train[var] = X_train[var].astype(summary_X_train.data_type.values[i])
    X_test[var]  = X_test[var].astype(summary_X_train.data_type.values[i])

Exclude variables marked for exclusion:

In [10]:
summary_X_train.var_name[summary_X_train.exclude==1].values

array(['sub_area', 'ID_metro', 'ID_railroad_station_walk',
       'ID_railroad_station_avto', 'ID_big_road1', 'ID_big_road2'], dtype=object)

In [11]:
X_train = X_train.drop(summary_X_train.var_name[summary_X_train.exclude==1].values, axis=1)
X_test = X_test.drop(summary_X_train.var_name[summary_X_train.exclude==1].values, axis=1)

Assign timestamps

In [12]:
X_train.timestamp = pd.to_datetime(X_train.timestamp)
X_test.timestamp  = pd.to_datetime(X_test.timestamp)

## Feature engineering

The following function cleans and replaces a few features in the data, as well as inputes NAs for values that are not logical

In [13]:
def extract_date_variables(input_data, date_in_index=False):

    if date_in_index:
        
        #input_data['weeknr'] = map(lambda x: str(x), input_data.index.week)
        input_data['year'] = map(lambda x: str(x), input_data.index.year)
        input_data['month'] = map(lambda x: str(x), input_data.index.month)

        #input_data['week_year'] = input_data['weeknr'] + '_' + input_data['year']
        input_data['month_year'] = input_data['month'] + '_' + input_data['year']

        return input_data
    
    else:
        output_data = input_data.to_frame(name='timestamp')
        
        #output_data['weeknr'] = map(lambda x: str(x.week), output_data['timestamp'])
        output_data['year'] = map(lambda x: str(x.year), output_data['timestamp'])
        output_data['month'] = map(lambda x: str(x.month), output_data['timestamp'])

        #output_data['week_year'] = output_data['weeknr'] + '_' + output_data['year']
        output_data['month_year'] = output_data['month'] + '_' + output_data['year']

        return output_data

In [14]:
def clean_bank(raw_data):
    
    raw_data_features = raw_data.copy()
    
    fsq = raw_data_features['full_sq'].copy()
    fsq[fsq <8] = np.nan
    fsq[fsq >400] = np.nan
    raw_data_features['full_sq'] = fsq

    lsq = raw_data_features['life_sq'].copy()
    lsq[lsq <8] = np.nan
    lsq[lsq >400] = np.nan
    raw_data_features['life_sq'] = lsq


    flr = raw_data_features['floor']
    mflr = raw_data_features['max_floor']

    flr[flr==0] = np.nan
    mflr[mflr==0] = np.nan

    raw_data_features['floor']=flr
    raw_data_features['max_floor']=mflr

    building_type = pd.cut(mflr, np.append(np.arange(0,30), 200), labels=False).astype('object')
    raw_data_features['build_type_maxfloor'] = building_type

    # make first floor and last floor as separate var
    raw_data_features['first_floor'] = flr==1
    raw_data_features['last_floor'] = (flr==mflr)

    raw_data.loc[raw_data['material']==3, 'material'] = np.nan
    raw_data_features['material'] = raw_data['material']

    byr = raw_data['build_year'].copy()
    byr[(byr<1000) | (byr>2020)]=np.nan 
    raw_data_features['build_year'] = byr

    room_cat = pd.cut(raw_data['num_room'], np.append(np.arange(0,7), 20), labels=False).astype('object')
    raw_data_features['num_room']=room_cat

    ksq = raw_data['kitch_sq'].copy()
    ksq[(ksq>250) | (ksq<2)] = np.nan
    raw_data_features['kitch_sq'] = ksq

    stat = raw_data['state'].copy()
    stat[stat==33] = 3
    raw_data_features['state'] = stat
    
    date_var = extract_date_variables(raw_data_features.timestamp)
    raw_data_features = pd.concat([raw_data_features, date_var.iloc[:,1:]], axis=1)
    
    raw_data_features = raw_data_features.drop('year', axis=1)
    
    m_counts = raw_data_features.groupby(by='month_year').apply(len).to_frame(name='apt_count')
    raw_data_features = raw_data_features.merge(m_counts, left_on='month_year', right_index=True)
    
    return raw_data_features

In [15]:
X_train_clean = clean_bank(X_train)
X_test_clean = clean_bank(X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
check_columns(X_train_clean, X_test_clean)

{'left_only': set(), 'right_only': set()}

## Check for new factor levels for objects

In [25]:
def get_columns_by_dtype(data, d_type):
    """This function returns an array of column names of data of the corresponding dtype. """
    
    dtype_series = pd.Series(map(lambda x: x.name, data.dtypes.values) )    
    return pd.Series(data.columns).iloc[dtype_series[dtype_series==d_type].index].values

In [38]:
def check_new_factor_levels(train, test):
    """This function takes training set dataframe and testset dataframe and compares the levels of categorical
    variables. If there are unseen factor levels, column name and new level(s) in the testset is returned."""
    train_fac = train[get_columns_by_dtype(train, 'object') ]
    test_fac  = test[get_columns_by_dtype(test, 'object') ]
    
    test_minus_train = []
    for col in train_fac.columns:
        test_minus_train.append( set(test_fac[col].dropna().values) - set(train_fac[col].dropna().values) )
        
    return test_minus_train

In [40]:
a = check_new_factor_levels(X_train_clean, X_test_clean)

In [41]:
a

[set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'2016'},
 set(),
 {'10_2015',
  '11_2015',
  '12_2015',
  '1_2016',
  '2_2016',
  '3_2016',
  '4_2016',
  '5_2016',
  '7_2015',
  '8_2015',
  '9_2015'}]

## Save data

In [19]:
X_train_clean.to_csv('AptFeatures_train.csv')
X_test_clean.to_csv('AptFeatures_test.csv')
y_train.to_csv('target_train.csv')

## Manually corrected the monthly quantity in the trainset to na in first two months !!