In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textblob import TextBlob
#input data
train_df=pd.read_json('../input/train.json')
test_df=pd.read_json('../input/test.json')

In [2]:
from sklearn.cluster import Birch
def cluster_latlon(n_clusters, data):  
    #split the data between "around NYC" and "other locations" basically our first two clusters 
    data_c=data[(data.longitude>-74.05)&(data.longitude<-73.75)&(data.latitude>40.4)&(data.latitude<40.9)]
    data_e=data[~(data.longitude>-74.05)&(data.longitude<-73.75)&(data.latitude>40.4)&(data.latitude<40.9)]
    #put it in matrix form
    coords=data_c.as_matrix(columns=['latitude', "longitude"])
    
    brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01,compute_labels=True)

    brc.fit(coords)
    clusters=brc.predict(coords)
    data_c["cluster_"+str(n_clusters)]=clusters
    data_e["cluster_"+str(n_clusters)]=-1 #assign cluster label -1 for the non NYC listings 
    data=pd.concat([data_c,data_e])
    #plt.scatter(data_c["longitude"], data_c["latitude"], c=data_c["cluster_"+str(n_clusters)], s=10, linewidth=0.1)
    #plt.title(str(n_clusters)+" Neighbourhoods from clustering")
    #plt.show()
    return data 

traingpsclusters=cluster_latlon(200, train_df[['listing_id','latitude','longitude']])
traingpsclusters=traingpsclusters.drop(['latitude','longitude'],axis=1)

testgpsclusters=cluster_latlon(200, test_df[['listing_id','latitude','longitude']])
testgpsclusters=testgpsclusters.drop(['latitude','longitude'],axis=1)

train_df=pd.merge(train_df,traingpsclusters,on='listing_id',how='left')
test_df=pd.merge(test_df,testgpsclusters,on='listing_id',how='left')

clusters_price_map=dict(train_df.groupby(by="cluster_200")["price"].median())
train_df["price_comparison"]=train_df['price']-train_df["cluster_200"].map(clusters_price_map)

clusters_price_map=dict(test_df.groupby(by="cluster_200")["price"].median())
test_df["price_comparison"]=test_df['price']-test_df["cluster_200"].map(clusters_price_map)

def create_binary_features(df):
    bows = {
        "dogs": ("dogs", "dog"),
        "cats": ("cats",),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",),
        "parquet": ("parquet", "hardwood"),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included")
    }

    def indicator(bow):
        return lambda s: int(any([x in s for x in bow]))

    features = df["features"].apply(lambda f: " ".join(f).lower())   # convert features to string
    featurelist=[]
    for key in bows:
        df["feature_" + key] = features.apply(indicator(bows[key]))
        featurelist.append("feature_" + key)
    return df,featurelist

train_df,featurelist=create_binary_features(train_df)
test_df,featurelist=create_binary_features(test_df)

  % (len(centroids), self.n_clusters))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  % (len(centroids), self.n_clusters))


In [3]:
#basic features
train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 
train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))


train_df['created'] = pd.to_datetime(train_df['created'])
train_df['date'] = train_df['created'].dt.date
train_df["year"] = train_df["created"].dt.year
train_df['month'] = train_df['created'].dt.month
train_df['day'] = train_df['created'].dt.day
train_df['hour'] = train_df['created'].dt.hour
train_df['weekday'] = train_df['created'].dt.weekday
train_df['week'] = train_df['created'].dt.week
train_df['quarter'] = train_df['created'].dt.quarter
train_df['weekend'] = ((train_df['weekday'] == 5) & (train_df['weekday'] == 6))
train_df['wd'] = ((train_df['weekday'] != 5) & (train_df['weekday'] != 6))

test_df['created'] = pd.to_datetime(test_df['created'])
test_df['date'] = test_df['created'].dt.date
test_df["year"] = test_df["created"].dt.year
test_df['month'] = test_df['created'].dt.month
test_df['day'] = test_df['created'].dt.day
test_df['hour'] = test_df['created'].dt.hour
test_df['weekday'] = test_df['created'].dt.weekday
test_df['week'] = test_df['created'].dt.week
test_df['quarter'] = test_df['created'].dt.quarter
test_df['weekend'] = ((test_df['weekday'] == 5) & (test_df['weekday'] == 6))
test_df['wd'] = ((test_df['weekday'] != 5) & (test_df['weekday'] != 6))

train_df = train_df.join(
                   train_df['description'].apply(
                       lambda x: TextBlob(x).sentiment.polarity).rename('sentiment'))

test_df = test_df.join(
                   test_df['description'].apply(
                       lambda x: TextBlob(x).sentiment.polarity).rename('sentiment'))

train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = test_df['pos'].value_counts()
dvals = vals.to_dict()
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

features_to_use=["price_comparison","density","sentiment","wd","weekend","quarter","week","weekday","hour","day","month","year","bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","num_photos", "num_features", "num_description_words","listing_id"]
features_to_use=features_to_use+featurelist

In [34]:
import lightgbm as lgbm
def runlgbm(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=500000,e_stoping_r=50): 
    t4_params = {
        'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
        'num_leaves': 6, 'learning_rate': 0.03, 'max_depth': 6,
        'max_bin': 255, 'subsample_for_bin': 50000,
        'subsample': 0.6, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha':1, 'reg_lambda':0,
        'min_split_gain': 0.5, 'min_child_weight': 1, 'min_child_samples': 10, 'scale_pos_weight': 1}

    # they can be used directly to build a LGBMClassifier (which is wrapped in a sklearn fashion)
    model = lgbm.sklearn.LGBMClassifier(n_estimators=num_rounds, seed=0, **t4_params)
    
    if test_y is not None:
        model.fit(train_X,train_y,eval_set=[(train_X,train_y),(test_X, test_y)],verbose=100,early_stopping_rounds=e_stoping_r)
    else:
        model.fit(train_X,train_y)
    pred_test_y = model.predict_proba(test_X)
    return pred_test_y, model


def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.03
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [5]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c



a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

In [6]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [7]:
import math
def cart2rho(x, y):
    rho = np.sqrt(x**2 + y**2)
    return rho


def cart2phi(x, y):
    phi = np.arctan2(y, x)
    return phi


def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)


def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)


def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    for df in [tr_df, te_df]:
        #polar coordinates system
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        #rotations
        for angle in [15,30,45,60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

In [8]:
allcols=train_df.columns.tolist()

In [9]:
train_df, test_df = operate_on_coordinates(train_df, test_df)

In [10]:
import re

def cap_share(x):
    return sum(1 for c in x if c.isupper())/float(len(x)+1)

for df in [train_df, test_df]:
    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = df['description'].apply(cap_share)
    
    # how long in lines the desc is?
    df['num_nr_of_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))
   
    # is the description redacted by the website?        
    df['num_redacted'] = 0
    df['num_redacted'].ix[df['description'].str.contains('website_redacted')] = 1

    
    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = 0
    df['num_email'].ix[df['description'].str.contains('@')] = 1
    
    #and... can we call them?
    
    reg = re.compile(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)
    def try_and_find_nr(description):
        if reg.match(description) is None:
            return 0
        return 1

    df['num_phone_nr'] = df['description'].apply(try_and_find_nr)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [17]:
gpdand_desnewfeaure=[col for col in train_df.columns if col not in allcols]

In [24]:
features_to_use.extend(gpdand_desnewfeaure)

In [26]:
# 3481 unique managers
temp = train_df.groupby('manager_id').count().iloc[:,-1]
temp2 = test_df.groupby('manager_id').count().iloc[:,-1]
train_managers = pd.concat([temp,temp2],axis=1,join='outer')
train_managers.columns=['train_count','test_count']
#print(train_managers.sort_values(by = 'train_count',ascending = False).head())
# considering only those manager_ids which are in train
man_list = train_managers['train_count'].sort_values(ascending = False).head(3481).index
ixes = train_df.manager_id.isin(man_list)
train10 = train_df[ixes][['manager_id','interest_level']]
# create dummies of interest levels
interest_dummies = pd.get_dummies(train10.interest_level)
train10 = pd.concat([train10,interest_dummies[['low','medium','high']]], axis = 1).drop('interest_level', axis = 1)
#print(train10.head())
gby = pd.concat([train10.groupby('manager_id').mean(),train10.groupby('manager_id').count()], axis = 1).iloc[:,:-2]
gby.columns = ['low','medium','high','count']
gby.sort_values(by = 'count', ascending = False).head(10)
gby['manager_skill'] = gby['medium']*1 + gby['high']*2 
gby['manager_id']=gby.index
#print(gby.head(5))
#print(gby.shape)

In [27]:
train_df = pd.merge(train_df,gby[['manager_id','manager_skill']],on='manager_id',how='left')
train_df['manager_skill']=train_df['manager_skill'].fillna(0)
test_df = pd.merge(test_df,gby[['manager_id','manager_skill']],on='manager_id',how='left')
test_df['manager_skill']=test_df['manager_skill'].fillna(0)

features_to_use.append('manager_skill')

In [28]:
train_df["features"]=train_df["features"].fillna("empty")
test_df["features"]=test_df["features"].fillna("empty")

In [29]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

In [30]:
train_df['wd']=train_df['wd'].astype(int)
train_df['weekend']=train_df['weekend'].astype(int)

test_df['wd']=test_df['wd'].astype(int)
test_df['weekend']=test_df['weekend'].astype(int)

In [31]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape)

((49352, 258), (74659, 258))


In [None]:
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runlgbm(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)

In [35]:
#preds, model = runlgbm(train_X, train_y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.20, random_state=42)

In [36]:
preds, model = runlgbm(X_train, y_train,X_test,y_test,num_rounds=500000)

Train until valid scores didn't improve in 50 rounds.
[100]	valid_0's multi_logloss: 0.589483	valid_1's multi_logloss: 0.599566
[200]	valid_0's multi_logloss: 0.542201	valid_1's multi_logloss: 0.555292
[300]	valid_0's multi_logloss: 0.522523	valid_1's multi_logloss: 0.538121
[400]	valid_0's multi_logloss: 0.509164	valid_1's multi_logloss: 0.527624
[500]	valid_0's multi_logloss: 0.498251	valid_1's multi_logloss: 0.519453
[600]	valid_0's multi_logloss: 0.489303	valid_1's multi_logloss: 0.512841
[700]	valid_0's multi_logloss: 0.481457	valid_1's multi_logloss: 0.507615
[800]	valid_0's multi_logloss: 0.474562	valid_1's multi_logloss: 0.503475
[900]	valid_0's multi_logloss: 0.468445	valid_1's multi_logloss: 0.500001
[1000]	valid_0's multi_logloss: 0.462827	valid_1's multi_logloss: 0.496819
[1100]	valid_0's multi_logloss: 0.457656	valid_1's multi_logloss: 0.493888
[1200]	valid_0's multi_logloss: 0.452985	valid_1's multi_logloss: 0.491654
[1300]	valid_0's multi_logloss: 0.448495	valid_1's mult

In [37]:
preds, model = runlgbm(train_X, train_y,test_X,num_rounds=4095)

In [39]:
#preds=model.predict_proba(test_X)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]

In [40]:
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_starter441.csv", index=False)

In [68]:
out_df = pd.DataFrame(preds)
out_df.columns = ["score"]

In [69]:
target_num_map_inv = {0:'high', 1:'medium', 2:'low'}
out_df['score']= out_df['score'].apply(lambda x: target_num_map_inv[x])
dummy=pd.get_dummies(out_df['score'])

In [70]:
out_df=pd.concat([out_df,dummy],axis=1)

In [71]:
out_df["listing_id"] = test_df.listing_id.values

In [72]:
out_df[["listing_id","high","medium","low"]].to_csv("submission_dummy.csv",index=False)