In [90]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import lightgbm as lgb
from math import sin, cos, sqrt, atan2, radians

def distance_lat_lon(lat1, lon1, lat2, lon2):

    # approximate radius of earth in km
    R = 6373.0

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c   
    return distance


print("Read train and test data...")
train_df = pd.read_json("../input/train.json")
test_df = pd.read_json("../input/test.json")

img_time = pd.read_csv("../input/listing_image_time.csv")
img_time.columns = ['listing_id', 'time_stamp']
train_df = pd.merge(train_df, img_time,  how='left', on=['listing_id'])
test_df = pd.merge(test_df, img_time,  how='left', on=['listing_id'])

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=1800):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.021
    param['max_depth'] = 6
    #param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=30)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    
    print("Predict test...")
    print(test_X.shape)
    print(model)
    #print(test_X.head())
    pred_test_y = model.predict(test_X)
    #pred_test_y = model.predict(xgtest)
    return pred_test_y, model

def runLGBM(train_X, train_y, test_X, test_y=None, feature_names=None, num_rounds=1800):
    params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_classes': 3,
    'metric': {'multi_logloss'},
    'num_leaves': 55,
    'learning_rate': 0.01,
    'feature_fraction': 0.82,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
    }
    
    num_rounds = num_rounds

    #plst = list(param.items())
    #train_data = lgb.Dataset(X_train, label=y_train)
    #val_data = lgb.Dataset(X_val[features], y_val)
    
    train_data = lgb.Dataset(train_X, label=train_y)
    '''
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    '''
    print("Predict cv...")
    bstcv = lgb.cv(params, train_data, num_rounds, verbose_eval = 100, early_stopping_rounds = 30)
    #print(test_X.shape)
    #print(model)
    #print(test_X.head())
    #pred_test_y = model.predict(test_X)
    #pred_test_y = model.predict(xgtest)
    return bstcv #pred_test_y, model

def add_median_price(key=None, suffix="", trn_df=None, tst_df=None):
    """
    Compute median prices for renthop dataset.
    The function adds 2 columns to the pandas DataFrames : the median prices and a ratio
    between nthe actual price of the rent and the median
    
    :param key: list of columns on which to groupby and compute median prices
    :param suffix: string used to suffix the newly created columns/features
    :param trn_df: training dataset as a pandas DataFrame
    :param tst_df: test dataset as a pandas DataFrame
    :return: updated train and test DataFrames

    :Example
    
    train, test = add_median_price(key=['bedrooms', 'bathrooms'], 
                                   suffix='rooms', 
                                   trn_df=train, 
                                   tst_df=test)

    """
    # Set features to be used
    median_features = key[:]
    median_features.append('price')
    # Concat train and test to find median prices over whole dataset
    median_prices = pd.concat([trn_df[median_features], tst_df[median_features]], axis=0)
    # Group data by key to compute median prices
    medians_by_key = median_prices.groupby(by=key)['price'].median().reset_index()
    # Rename median column with provided suffix
    medians_by_key.rename(columns={'price': 'median_price_' + suffix}, inplace=True)
    # Update data frames, note that merge seems to reset the index
    # that's why I reset first and set again the index
    #trn_df = trn_df.reset_index().merge(medians_by_key, on=key, how='left').set_index('listing_id')
    #tst_df = tst_df.reset_index().merge(medians_by_key, on=key, how='left').set_index('listing_id')
    
    trn_df = pd.merge(trn_df, medians_by_key, how = 'left', on = key)
    tst_df = pd.merge(tst_df, medians_by_key, how = 'left', on = key)
    
    trn_df['price_to_median_ratio_' + suffix] = trn_df['price'] /trn_df['median_price_' + suffix]
    tst_df['price_to_median_ratio_' + suffix] = tst_df['price'] / tst_df['median_price_' + suffix]

    return trn_df, tst_df

Read train and test data...


In [91]:
import gc
gc.collect()

56

In [92]:
# feature engineering
print("Feature engineering...")
test_df["bathrooms"].loc[19671] = 1.5
test_df["bathrooms"].loc[22977] = 2.0
test_df["bathrooms"].loc[63719] = 2.0
train_df["price"] = train_df["price"].clip(upper=13000)

train_df["logprice"] = np.log(train_df["price"])
test_df["logprice"] = np.log(test_df["price"])


train_df['half_bathrooms'] = train_df["bathrooms"] - train_df["bathrooms"].apply(int)#.astype(int) # Half bathrooms? 1.5, 2.5, 3.5...
test_df['half_bathrooms'] = test_df["bathrooms"] - test_df["bathrooms"].apply(int)#.astype(int) # Half bathrooms? 1.5, 2.5, 3.5...

train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

train_df['price_per_room'] = train_df['price']/train_df['room_sum']
test_df['price_per_room'] = test_df['price']/test_df['room_sum']

train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

train_df["created_weekday"] = train_df["created"].dt.weekday
test_df["created_weekday"] = test_df["created"].dt.weekday
train_df["created_week"] = train_df["created"].dt.week
test_df["created_week"] = test_df["created"].dt.week

'''
# median price by group
median_features = ['bedrooms', 'price']
median_prices = pd.concat([train_df[median_features], test_df[median_features]], axis=0)
medians_by_key = median_prices.groupby(by=['bedrooms'])['price'].median().reset_index()
medians_by_key.rename(columns={'price': 'median_price_bed'}, inplace=True)
train_df = pd.merge(train_df, medians_by_key, how = 'left', on = 'bedrooms')
test_df = pd.merge(test_df, medians_by_key, how = 'left', on = 'bedrooms')
train_df['price_to_median_ratio_bed'] = train_df['price'] /train_df['median_price_bed']
test_df['price_to_median_ratio_bed'] = test_df['price'] /test_df['median_price_bed']
'''

train_df, test_df = add_median_price(key=['bedrooms'], suffix="bed", trn_df=train_df, tst_df=test_df)
train_df, test_df = add_median_price(key=['building_id'], suffix="building", trn_df=train_df, tst_df=test_df)
train_df, test_df = add_median_price(key=['manager_id'], suffix="manager", trn_df=train_df, tst_df=test_df)

#train_df, test_df = add_median_price(key=['building_id', 'manager_id'], suffix="bld_mg", trn_df=train_df, tst_df=test_df)
train_df, test_df = add_median_price(key=['bedrooms', 'manager_id'], suffix="bed_mg", trn_df=train_df, tst_df=test_df)
train_df, test_df = add_median_price(key=['bedrooms', 'building_id'], suffix="bed_bld", trn_df=train_df, tst_df=test_df)

def distance_lat_lon_cols(_df, ny_lat, ny_lon, _lat, _lon):
    dist = []
    for lat, lon in zip(_df[_lat].values, _df[_lon].values):
        distance = distance_lat_lon(ny_lat, ny_lon, lat, lon)
        dist.append(distance)
    return dist

# New York City Center Coords
ny_lat = 40.785091
ny_lon = -73.968285

train_df["distance"] = distance_lat_lon_cols(train_df, ny_lat, ny_lon, 'latitude', 'longitude')
test_df["distance"] = distance_lat_lon_cols(test_df, ny_lat, ny_lon, 'latitude', 'longitude')

train_df["total_days"] =(train_df["created_month"] -4.0)*30 + train_df["created_day"] +  train_df["created_hour"] /25.0
test_df["total_days"] =(test_df["created_month"] -4.0)*30 + test_df["created_day"] +  test_df["created_hour"] /25.0        
train_df["diff_rank"]= train_df["total_days"]/train_df["listing_id"]
test_df["diff_rank"]= test_df["total_days"]/test_df["listing_id"]


# Add column with bool values to check if keyword is contained or not
def containColumn(_df, _col, _str):
    string_name = _str.lower().replace(' ', '_')
    _df[_col+'_'+string_name] = _df[_col].apply(lambda x: _str.lower() in x.lower())
    return _df

# Add column with value count
def valueCountColumn(_df, _col):
    _dict = dict([(i, a) for i, a in zip(_df[_col].value_counts().index, _df[_col].value_counts().values)])
    _df[_col+'ValueCount'] = _df[_col].apply(lambda x: _dict[x])
    return _df

# group by country and campaign id
def countGroupBy(_df, str1, str2):
    new_col = str1 + "_" + str2
    _df[new_col] = _df[[str1, str2]].apply(lambda x : '{}-{}'.format(x[0],x[1]), axis=1)
    _df = valueCountColumn(_df, new_col)
    _df.drop(new_col, axis=1, inplace=True)
    print("New value count column is added:", new_col)
    return _df


import string
string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
def feature_engineering(df):
    #df['md'] = df.created_month.astype(str) + '_' + df.created_day.astype(str)
    #df['mdh'] = df.md.astype(str) + '_' + df.created_hour.astype(str)
    
    # room combination type
    df['room_combi'] = df.bedrooms.astype(str) + '_' + df.bathrooms.astype(str)
    
    '''
    # address
    df['address1'] = df['display_address']
    df['address1'] = df['address1'].apply(lambda x: x.lower())

    address_map = {
        'w': 'west',
        'st.': 'street',
        'ave': 'avenue',
        'st': 'street',
        'e': 'east',
        'n': 'north',
        's': 'south'
    }
    

    def address_map_func(s):
        s = s.split(' ')
        out = []
        for x in s:
            if x in address_map:
                out.append(address_map[x])
            else:
                out.append(x)
        return ' '.join(out)

    df['address1'] = df['address1'].apply(lambda x: x.translate(remove_punct_map))
    df['address1'] = df['address1'].apply(lambda x: address_map_func(x))

    new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']

    for col in new_cols:
        df[col] = df['address1'].apply(lambda x: 1 if col in x else 0)

    df['other_address'] = df[new_cols].apply(lambda x: 1 if x.sum() == 0 else 0, axis=1)
    
    df.drop('address1', axis = 1, inplace = True)
    '''
    
    return df


train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

#train_df['room_combi'] = train_df.bedrooms.astype(str) + '_' + train_df.bathrooms.astype(str)
#test_df['room_combi'] = test_df.bedrooms.astype(str) + '_' + test_df.bathrooms.astype(str)

train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))


import math
def cart2rho(x, y):
    rho = np.sqrt(x**2 + y**2)
    return rho

def cart2phi(x, y):
    phi = np.arctan2(y, x)
    return phi

def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)

def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)

def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    for df in [tr_df, te_df]:
        #polar coordinates system
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        #rotations
        for angle in [15,30,45,60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

train_df, test_df = operate_on_coordinates(train_df, test_df)

import re

def cap_share(x):
    return sum(1 for c in x if c.isupper())/float(len(x)+1)

for df in [train_df, test_df]:
    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = df['description'].apply(cap_share)
    
    # how long in lines the desc is?
    df['num_nr_of_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))
   
    # is the description redacted by the website?        
    df['num_redacted'] = 0
    df['num_redacted'].ix[df['description'].str.contains('website_redacted')] = 1

    
    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = 0
    df['num_email'].ix[df['description'].str.contains('@')] = 1
    
    #and... can we call them?
    
    reg = re.compile(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)
    def try_and_find_nr(description):
        if reg.match(description) is None:
            return 0
        return 1

    df['num_phone_nr'] = df['description'].apply(try_and_find_nr)

'''    
#sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

def description_sentiment(sentences):
    analyzer = SentimentIntensityAnalyzer()
    result = []
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        result.append(vs)
    return pd.DataFrame(result).mean()

train_df['description_tokens'] = train_df['description'].apply(sent_tokenize)
train_df = pd.concat([train_df, train_df['description_tokens'].apply(description_sentiment)],axis=1)

test_df['description_tokens'] = test_df['description'].apply(sent_tokenize)
test_df = pd.concat([train_df, test_df['description_tokens'].apply(description_sentiment)],axis=1)
'''

#missing values
train_df = train_df.fillna(-999).replace(np.inf, -999)
test_df = test_df.fillna(-999).replace(np.inf, -999)

features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","price_per_room", "logprice", "density", 
                 "half_bathrooms", "num_photos", "num_features", "num_description_words","listing_id", "created_year", 
                 "created_month", "created_day", "created_hour", "created_week", "created_weekday", "time_stamp",
                 "distance", "total_days", "diff_rank", 
                 "price_to_median_ratio_bed", "median_price_bed",
                 "price_to_median_ratio_building", "median_price_building",
                 "price_to_median_ratio_manager", "median_price_manager",
                 #"price_to_median_ratio_bld_mg", "median_price_bld_mg",
                 "price_to_median_ratio_bed_mg", "median_price_bed_mg",
                 "price_to_median_ratio_bed_bld", "median_price_bed_bld",
                 "num_rho", "num_phi", "num_rot15_X", "num_rot15_Y", "num_rot30_X", "num_rot30_Y", "num_rot45_X",
                 "num_rot45_Y", "num_rot60_X", "num_rot60_Y", 'num_cap_share', 'num_nr_of_lines',
                 'num_redacted', 'num_email', 'num_phone_nr',
                 #'street', 'avenue', 'east', 'west', 'north', 'south', 'other_address'
                 #"description_no_fee", "description_low_fee", "description_brand_new", 
                 #"description_location", "description_luxury"
                ]

print(train_df[features_to_use].head())

index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
            
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
            
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c

a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]

for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

print("Convert categorical to numeric...")
categorical = ["street_address", "display_address", "manager_id", "building_id", "room_combi"#, "description_tokens"
              #,"w_building_id", "w_manager_id", "w_display_address", "w_street_address", 
              #,'building_id_price_roundValueCount'
              ]

print(categorical)
for f in categorical:
        if train_df[f].dtype=='object':
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

print("tfidf for features")
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

tfidf_train_df = pd.DataFrame(tr_sparse.toarray())
tfidf_train_df.columns = ['tf' + str(i) for i in range(200)]
tfidf_test_df = pd.DataFrame(te_sparse.toarray())
tfidf_test_df.columns = ['tf' + str(i) for i in range(200)]

train_X = pd.concat([train_df[features_to_use], tfidf_train_df], axis = 1)
test_X = pd.concat([test_df[features_to_use], tfidf_test_df], axis = 1)

#train_df['description'] =  train_df['description'].apply(lambda x: str(x) if len(x)>2 else "nulldesc") 
#test_df['description'] = test_df['description'].apply(lambda x: str(x) if len(x)>2 else "nulldesc") 

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

print("Feature engineering is complete!")

Feature engineering...
   bathrooms  bedrooms  latitude  longitude  price  price_t  price_per_room  \
0        1.5         3   40.7145   -73.9425   3000   1000.0      666.666667   
1        1.0         2   40.7947   -73.9667   5465   2732.5     1821.666667   
2        1.0         1   40.7388   -74.0018   2850   2850.0     1425.000000   
3        1.0         1   40.7539   -73.9677   3275   3275.0     1637.500000   
4        1.0         4   40.8241   -73.9493   3350    837.5      670.000000   

   logprice  density  half_bathrooms      ...       num_rot30_Y  num_rot45_X  \
0  8.006368        5             0.5      ...        -84.393333   -23.495744   
1  8.606119       62             0.0      ...        -84.454391   -23.456146   
2  7.955074       92             0.0      ...        -84.456839   -23.520493   
3  8.094073      144             0.0      ...        -84.434857   -23.485703   
4  8.116716        5             0.0      ...        -84.454022   -23.423054   

   num_rot45_Y  num_r

In [37]:
import nltk
#nltk.download()
nltk.download('punkt')
nltk.download('book')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hyunor\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [41]:
train_df[["price_to_median_ratio_bed", "median_price_bed"]].head()

Unnamed: 0,price_to_median_ratio_bed,median_price_bed
0,0.666667,4500.0
1,1.631343,3350.0
2,0.982759,2900.0
3,1.12931,2900.0
4,0.567797,5900.0


In [95]:
median_features = ['bedrooms', 'price']
median_prices = pd.concat([train_df[median_features], test_df[median_features]], axis=0)

In [98]:
median_prices.groupby(by=['bedrooms'])['price'].median().reset_index()

Unnamed: 0,bedrooms,price
0,0,2400.0
1,1,2900.0
2,2,3350.0
3,3,4500.0
4,4,5900.0
5,5,8109.0
6,6,8000.0
7,7,12000.0
8,8,8247.5


In [39]:
print(train_df.latitude.median())
print(train_df.longitude.median())

40.7518
-73.9779


In [46]:
lat_median = train_df.latitude.median()
lon_median = train_df.longitude.median()

# New York City Center Coords
ny_lat = 40.785091
ny_lon = -73.968285

dist = []
for lat, lon in zip(train_df.latitude.values, train_df.longitude.values):
    distance = distance_lat_lon(ny_lat, ny_lon, lat, lon)
    dist.append(distance)

In [45]:
len(dist)

49352

In [150]:
len(train_df.price.round(-2).value_counts())

125

In [143]:
print(train_X.shape)
print(test_X.shape)

rain_df[train_df.interest_level == 'high']['description'].head()

(49352, 233)
(74659, 233)


In [137]:
#train_df['description'] = train_df["description"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
#test_df['description'] = test_df["description"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
'''
tfidf = CountVectorizer(stop_words='english', max_features=100)
tr_sparse = tfidf.fit_transform(train_df["description"])
te_sparse = tfidf.transform(test_df["description"])

tfidf_train_df = pd.DataFrame(tr_sparse.toarray())
tfidf_train_df.columns = ['tf_des_' + str(i) for i in range(100)]
tfidf_test_df = pd.DataFrame(te_sparse.toarray())
tfidf_test_df.columns = ['tf_des_' + str(i) for i in range(100)]

train_X = pd.concat([train_X, tfidf_train_df], axis = 1)
test_X = pd.concat([test_X, tfidf_test_df], axis = 1)
'''

In [93]:
#preds, model = runXGB(train_X, train_y, test_X, num_rounds=1800)
cv_results = runLGBM(train_X, train_y, test_X, num_rounds=5000)

Predict cv...
[100]	cv_agg's multi_logloss:0.725577+0.00174373
[200]	cv_agg's multi_logloss:0.613911+0.00261356
[300]	cv_agg's multi_logloss:0.570307+0.00297516
[400]	cv_agg's multi_logloss:0.5491+0.00340758
[500]	cv_agg's multi_logloss:0.53755+0.00358496
[600]	cv_agg's multi_logloss:0.530231+0.00376081
[700]	cv_agg's multi_logloss:0.525305+0.00387723
[800]	cv_agg's multi_logloss:0.522023+0.00403883
[900]	cv_agg's multi_logloss:0.519468+0.00421874
[1000]	cv_agg's multi_logloss:0.517669+0.00428064
[1100]	cv_agg's multi_logloss:0.516376+0.00435523
[1200]	cv_agg's multi_logloss:0.515315+0.00448393
[1300]	cv_agg's multi_logloss:0.514588+0.00457409
[1400]	cv_agg's multi_logloss:0.514086+0.00461831
[1500]	cv_agg's multi_logloss:0.513741+0.0046771
[1600]	cv_agg's multi_logloss:0.513461+0.0048196
[1700]	cv_agg's multi_logloss:0.51327+0.00488071


In [94]:
print(pd.DataFrame(cv_results).tail(1))

best_n_estimators = len(cv_results['multi_logloss-mean'])
best_cv_score = cv_results['multi_logloss-mean'][-1]

# 0.51672
# 0.516932    
# 0.514822  rotations...
# 0.513953  additional desc, email, etc
# 0.513162  more median and street features

      multi_logloss-mean  multi_logloss-stdv
1690            0.513259            0.004866


In [77]:
all_features = train_X.columns.values

new_features = [
                #'street', 'avenue', 'east', 'west', 'north', 'south', 'other_address',
                # "price_to_median_ratio_bed", "median_price_bed",
                # "price_to_median_ratio_building", "median_price_building",
                # "price_to_median_ratio_manager", "median_price_manager"
    
                 "price_to_median_ratio_bld_mg", "median_price_bld_mg",
                 "price_to_median_ratio_bed_mg", "median_price_bed_mg",
                 "price_to_median_ratio_bed_bld", "median_price_bed_bld"
               ]

features = list(set(all_features) - set(new_features))

In [78]:
from sklearn.model_selection import train_test_split

params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'multiclass',
'num_classes': 3,
'metric': {'multi_logloss'},
'num_leaves': 55,
'learning_rate': 0.01,
'feature_fraction': 0.82,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}

X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.25, random_state=0)


In [79]:
isBenchmark = True

for col in new_features:
    if isBenchmark:
        train_data = lgb.Dataset(X_train[features], label=y_train)
        val_data = lgb.Dataset(X_val[features], y_val)
        bst = lgb.train(params, train_data, 10000, valid_sets=val_data, 
                        verbose_eval = 10000, early_stopping_rounds=30)
        pred = bst.predict(X_val[features])
        score = log_loss(y_val, pred)  
        benchmark = score
        isBenchmark = False
    
    features.append(col)
    train_data = lgb.Dataset(X_train[features], label=y_train)
    val_data = lgb.Dataset(X_val[features], y_val)
    bst = lgb.train(params, train_data, 10000, valid_sets=val_data, 
                verbose_eval = 10000, early_stopping_rounds=30)
    pred = bst.predict(X_val[features])
    score = log_loss(y_val, pred) 
            
    print(col, benchmark, score, benchmark - score)
    
    features = list(set(features) - set([col]))

#print(log_loss(y_val, pred))

# t size 0.25, 0.507583
# 0.505001 new features

Train until valid scores didn't improve in 30 rounds.
Early stopping, best iteration is:
[1468]	valid_0's multi_logloss:0.506314
Train until valid scores didn't improve in 30 rounds.
Early stopping, best iteration is:
[1540]	valid_0's multi_logloss:0.506315
('price_to_median_ratio_bld_mg', 0.50642001323308394, 0.50646521319648663, -4.5199963402686727e-05)
Train until valid scores didn't improve in 30 rounds.
Early stopping, best iteration is:
[1366]	valid_0's multi_logloss:0.506628
('median_price_bld_mg', 0.50642001323308394, 0.50671680607703051, -0.00029679284394656946)
Train until valid scores didn't improve in 30 rounds.
Early stopping, best iteration is:
[1583]	valid_0's multi_logloss:0.506104
('price_to_median_ratio_bed_mg', 0.50642001323308394, 0.50614914895238061, 0.00027086428070333124)
Train until valid scores didn't improve in 30 rounds.
Early stopping, best iteration is:
[1472]	valid_0's multi_logloss:0.506356
('median_price_bed_mg', 0.50642001323308394, 0.50634366001677578,

In [71]:
t_features

['created_day',
 'tf41',
 'logprice',
 'tf74',
 'tf55',
 'tf54',
 'tf57',
 'tf56',
 'tf51',
 'tf50',
 'tf53',
 'density',
 'tf59',
 'tf58',
 'created_hour',
 'num_phi',
 u'latitude',
 'price_per_room',
 u'building_id',
 'tf62',
 'tf76',
 u'bedrooms',
 'manager_level_high',
 'tf199',
 'num_description_words',
 'tf118',
 'tf191',
 'tf190',
 'tf193',
 'tf192',
 'tf195',
 'tf194',
 'tf197',
 'tf196',
 'tf20',
 'tf21',
 'tf22',
 'tf23',
 'tf24',
 'tf25',
 'tf26',
 'tf27',
 'num_cap_share',
 'tf29',
 'tf72',
 'created_year',
 'manager_level_low',
 'tf166',
 'tf167',
 'tf160',
 'tf161',
 'tf162',
 'tf163',
 'tf70',
 'tf168',
 'half_bathrooms',
 u'street_address',
 'tf188',
 'tf189',
 'tf4',
 'tf182',
 'tf183',
 'tf180',
 'tf181',
 'tf186',
 'tf187',
 'num_rot15_X',
 'num_rot15_Y',
 'tf33',
 'tf32',
 'tf31',
 'tf30',
 'num_rot30_Y',
 'price_t',
 'tf35',
 'tf34',
 'tf39',
 'tf38',
 'room_combi',
 'tf65',
 'created_weekday',
 'tf110',
 'tf113',
 'tf112',
 'tf115',
 'tf114',
 'tf117',
 'tf116',
 

In [95]:
train_data = lgb.Dataset(train_X, label=train_y)

bst = lgb.train(params, train_data, best_n_estimators)
pred = bst.predict(test_X)
out_df = pd.DataFrame(pred)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("lgb_04_25_v2_cv0.5135.csv", index=False)
print(out_df.head())

# cv:0.531324 lb:0.53275
# cv:0.518967 lb:0.51759
# cv:0.517992 room_combi

       high    medium       low  listing_id
0  0.088130  0.729487  0.182383     7142618
1  0.015809  0.026031  0.958160     7210040
2  0.027295  0.412797  0.559909     7103890
3  0.175998  0.695555  0.128447     7143442
4  0.034727  0.364566  0.600706     6860601


In [None]:
pd.read_csv()

In [15]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

etc = ExtraTreesClassifier(n_estimators = 1000, max_features= 50, criterion= 'gini', min_samples_split= 4,
                                    max_depth= 20, min_samples_leaf= 2, n_jobs = -1, verbose = 1, random_state = 5)

In [6]:
pd.DataFrame(train_X).isnull().values.any()

True

In [8]:
train_X = train_X.fillna(-1).replace(np.inf, -1)
test_X = test_X.fillna(-1).replace(np.inf, -1)

In [16]:
etc.fit(train_X, train_y)
et_preds = etc.predict_proba(test_X)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.1min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    3.4s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    4.2s finished


In [17]:
out_df = pd.DataFrame(et_preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("cv_et_n1000.csv", index=False)
print(out_df.head())

       high    medium       low  listing_id
0  0.095636  0.482819  0.421545     7142618
1  0.007983  0.041947  0.950070     7210040
2  0.018121  0.157359  0.824520     7103890
3  0.238815  0.480919  0.280266     7143442
4  0.051466  0.295523  0.653011     6860601


In [27]:
from sklearn import cross_validation

etc = ExtraTreesClassifier(n_estimators = 100, max_features= 50, criterion= 'gini', min_samples_split= 4,
                                    max_depth= 20, min_samples_leaf= 2, n_jobs = -1, verbose = 1, random_state = 5)
score = cross_validation.cross_val_score(etc, train_X, train_y, scoring='log_loss', cv=5)
score

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.2s finished
  sample_weight=sample_weight)
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.1s finished
  sample_weight=sample_weight)
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.3s finished
  sample_weight=sample_weight)
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:

array([-0.58372008, -0.55499723, -0.55287353, -0.55738689, -0.5788555 ])

In [26]:
score.mean() * -1

0.57336414428019888

In [30]:
rfc = RandomForestClassifier(n_estimators=200, min_samples_leaf = 2, min_samples_split = 2,
                             max_features=50, max_depth=30, verbose = 1)
score = cross_validation.cross_val_score(rfc, train_X, train_y, scoring='log_loss', cv=2)
score

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   36.4s finished
  sample_weight=sample_weight)
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   37.9s finished
  sample_weight=sample_weight)
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    2.1s finished


array([-0.56904557, -0.57537624])

In [18]:
out_df = pd.DataFrame(preds * 0.8 + et_preds * 0.2)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("cv_lgb_et.csv", index=False)
print(out_df.head())

       high    medium       low  listing_id
0  0.143894  0.657117  0.198988     7142618
1  0.011512  0.022761  0.965727     7210040
2  0.008286  0.166035  0.825680     7103890
3  0.191847  0.617181  0.190973     7143442
4  0.070781  0.320497  0.608721     6860601
