In [10]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics, ensemble, linear_model, svm
from numpy import log, ones, array, zeros, mean, std, repeat
import numpy as np
import scipy.sparse as sp
import re
import csv
from textblob import TextBlob
from time import time


pd.options.display.max_columns=999
#input data
train_df=pd.read_json('../input/train.json')
test_df=pd.read_json('../input/test.json')

In [11]:
from sklearn.cluster import Birch
def cluster_latlon(n_clusters, data):  
    #split the data between "around NYC" and "other locations" basically our first two clusters 
    data_c=data[(data.longitude>-74.05)&(data.longitude<-73.75)&(data.latitude>40.4)&(data.latitude<40.9)]
    data_e=data[~(data.longitude>-74.05)&(data.longitude<-73.75)&(data.latitude>40.4)&(data.latitude<40.9)]
    #put it in matrix form
    coords=data_c.as_matrix(columns=['latitude', "longitude"])
    
    brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01,compute_labels=True)

    brc.fit(coords)
    clusters=brc.predict(coords)
    data_c["cluster_"+str(n_clusters)]=clusters
    data_e["cluster_"+str(n_clusters)]=-1 #assign cluster label -1 for the non NYC listings 
    data=pd.concat([data_c,data_e])
    #plt.scatter(data_c["longitude"], data_c["latitude"], c=data_c["cluster_"+str(n_clusters)], s=10, linewidth=0.1)
    #plt.title(str(n_clusters)+" Neighbourhoods from clustering")
    #plt.show()
    return data 

traingpsclusters=cluster_latlon(200, train_df[['listing_id','latitude','longitude']])
traingpsclusters=traingpsclusters.drop(['latitude','longitude'],axis=1)

testgpsclusters=cluster_latlon(200, test_df[['listing_id','latitude','longitude']])
testgpsclusters=testgpsclusters.drop(['latitude','longitude'],axis=1)

train_df=pd.merge(train_df,traingpsclusters,on='listing_id',how='left')
test_df=pd.merge(test_df,testgpsclusters,on='listing_id',how='left')

clusters_price_map=dict(train_df.groupby(by="cluster_200")["price"].median())
train_df["price_comparison"]=train_df['price']-train_df["cluster_200"].map(clusters_price_map)

clusters_price_map=dict(test_df.groupby(by="cluster_200")["price"].median())
test_df["price_comparison"]=test_df['price']-test_df["cluster_200"].map(clusters_price_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
def create_binary_features(df):
    bows = {
        "dogs": ("dogs", "dog"),
        "cats": ("cats",),
        "nofee": ("no fee", "no-fee", "no  fee", "nofee", "no_fee"),
        "lowfee": ("reduced_fee", "low_fee", "reduced fee", "low fee"),
        "furnished": ("furnished",),
        "parquet": ("parquet", "hardwood"),
        "concierge": ("concierge", "doorman", "housekeep", "in_super"),
        "prewar": ("prewar", "pre_war", "pre war", "pre-war"),
        "laundry": ("laundry", "lndry"),
        "health": ("health", "gym", "fitness", "training"),
        "transport": ("train", "subway", "transport"),
        "parking": ("parking",),
        "utilities": ("utilities", "heat water", "water included")
    }

    def indicator(bow):
        return lambda s: int(any([x in s for x in bow]))

    features = df["features"].apply(lambda f: " ".join(f).lower())   # convert features to string
    featurelist=[]
    for key in bows:
        df["feature_" + key] = features.apply(indicator(bows[key]))
        featurelist.append("feature_" + key)
    return df,featurelist

In [13]:
train_df,featurelist=create_binary_features(train_df)
test_df,featurelist=create_binary_features(test_df)

In [14]:
#basic features
train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 
train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

train_df['created'] = pd.to_datetime(train_df['created'])
train_df['date'] = train_df['created'].dt.date
train_df["year"] = train_df["created"].dt.year
train_df['month'] = train_df['created'].dt.month
train_df['day'] = train_df['created'].dt.day
train_df['hour'] = train_df['created'].dt.hour
train_df['weekday'] = train_df['created'].dt.weekday
train_df['week'] = train_df['created'].dt.week
train_df['quarter'] = train_df['created'].dt.quarter
train_df['weekend'] = ((train_df['weekday'] == 5) & (train_df['weekday'] == 6))
train_df['wd'] = ((train_df['weekday'] != 5) & (train_df['weekday'] != 6))

test_df['created'] = pd.to_datetime(test_df['created'])
test_df['date'] = test_df['created'].dt.date
test_df["year"] = test_df["created"].dt.year
test_df['month'] = test_df['created'].dt.month
test_df['day'] = test_df['created'].dt.day
test_df['hour'] = test_df['created'].dt.hour
test_df['weekday'] = test_df['created'].dt.weekday
test_df['week'] = test_df['created'].dt.week
test_df['quarter'] = test_df['created'].dt.quarter
test_df['weekend'] = ((test_df['weekday'] == 5) & (test_df['weekday'] == 6))
test_df['wd'] = ((test_df['weekday'] != 5) & (test_df['weekday'] != 6))

train_df = train_df.join(
                   train_df['description'].apply(
                       lambda x: TextBlob(x).sentiment.polarity).rename('sentiment'))

test_df = test_df.join(
                   test_df['description'].apply(
                       lambda x: TextBlob(x).sentiment.polarity).rename('sentiment'))

train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = test_df['pos'].value_counts()
dvals = vals.to_dict()
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

features_to_use=["price_comparison","density","sentiment","wd","weekend","quarter","week","weekday","hour","day","month","year","bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","num_photos", "num_features", "num_description_words","listing_id"]

In [15]:
features_to_use=features_to_use+featurelist

In [17]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.03
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [18]:
index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c



a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

In [19]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [20]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

0                                                     
1    Doorman Elevator Fitness_Center Cats_Allowed D...
2    Laundry_In_Building Dishwasher Hardwood_Floors...
3                               Hardwood_Floors No_Fee
4                                              Pre-War
Name: features, dtype: object


In [21]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape)

TypeError: no supported conversion for types: (dtype('O'), dtype('int64'))

In [15]:
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.07869	test-mlogloss:1.07925
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:1.05987	test-mlogloss:1.06086
[2]	train-mlogloss:1.04192	test-mlogloss:1.04337
[3]	train-mlogloss:1.02531	test-mlogloss:1.02716
[4]	train-mlogloss:1.00982	test-mlogloss:1.01204
[5]	train-mlogloss:0.993505	test-mlogloss:0.996167
[6]	train-mlogloss:0.978509	test-mlogloss:0.98162
[7]	train-mlogloss:0.964422	test-mlogloss:0.967811
[8]	train-mlogloss:0.951805	test-mlogloss:0.955489
[9]	train-mlogloss:0.938163	test-mlogloss:0.942257
[10]	train-mlogloss:0.924776	test-mlogloss:0.929271
[11]	train-mlogloss:0.913698	test-mlogloss:0.918534
[12]	train-mlogloss:0.901448	test-mlogloss:0.906776
[13]	train-mlogloss:0.890257	test-mlogloss:0.896035
[14]	train-mlogloss:0.879363	test-mlogloss:0.885371
[15]	train-mlogloss:0.869108	test-mlogloss:0.875485
[16]	train-mlogloss:0.85851	test-mlogl

[155]	train-mlogloss:0.536033	test-mlogloss:0.577076
[156]	train-mlogloss:0.535506	test-mlogloss:0.576689
[157]	train-mlogloss:0.534912	test-mlogloss:0.576315
[158]	train-mlogloss:0.534454	test-mlogloss:0.576013
[159]	train-mlogloss:0.533895	test-mlogloss:0.575682
[160]	train-mlogloss:0.533339	test-mlogloss:0.575315
[161]	train-mlogloss:0.532891	test-mlogloss:0.575006
[162]	train-mlogloss:0.532379	test-mlogloss:0.574734
[163]	train-mlogloss:0.531718	test-mlogloss:0.574387
[164]	train-mlogloss:0.531221	test-mlogloss:0.574129
[165]	train-mlogloss:0.530697	test-mlogloss:0.573855
[166]	train-mlogloss:0.530168	test-mlogloss:0.573544
[167]	train-mlogloss:0.529748	test-mlogloss:0.57331
[168]	train-mlogloss:0.529369	test-mlogloss:0.573134
[169]	train-mlogloss:0.529004	test-mlogloss:0.572876
[170]	train-mlogloss:0.528507	test-mlogloss:0.572627
[171]	train-mlogloss:0.527919	test-mlogloss:0.572302
[172]	train-mlogloss:0.527567	test-mlogloss:0.572085
[173]	train-mlogloss:0.527321	test-mlogloss:0.5

[310]	train-mlogloss:0.481977	test-mlogloss:0.552842
[311]	train-mlogloss:0.481688	test-mlogloss:0.552764
[312]	train-mlogloss:0.481501	test-mlogloss:0.55268
[313]	train-mlogloss:0.481249	test-mlogloss:0.552596
[314]	train-mlogloss:0.480963	test-mlogloss:0.552495
[315]	train-mlogloss:0.480802	test-mlogloss:0.55245
[316]	train-mlogloss:0.480692	test-mlogloss:0.552381
[317]	train-mlogloss:0.480426	test-mlogloss:0.552271
[318]	train-mlogloss:0.480271	test-mlogloss:0.552222
[319]	train-mlogloss:0.480024	test-mlogloss:0.552115
[320]	train-mlogloss:0.479777	test-mlogloss:0.552016
[321]	train-mlogloss:0.479421	test-mlogloss:0.551869
[322]	train-mlogloss:0.479352	test-mlogloss:0.551853
[323]	train-mlogloss:0.479033	test-mlogloss:0.551751
[324]	train-mlogloss:0.478824	test-mlogloss:0.551663
[325]	train-mlogloss:0.478557	test-mlogloss:0.551535
[326]	train-mlogloss:0.478452	test-mlogloss:0.551476
[327]	train-mlogloss:0.478137	test-mlogloss:0.551397
[328]	train-mlogloss:0.477832	test-mlogloss:0.55

[466]	train-mlogloss:0.448468	test-mlogloss:0.544347
[467]	train-mlogloss:0.448275	test-mlogloss:0.544321
[468]	train-mlogloss:0.448018	test-mlogloss:0.544249
[469]	train-mlogloss:0.447857	test-mlogloss:0.54423
[470]	train-mlogloss:0.447689	test-mlogloss:0.544195
[471]	train-mlogloss:0.447439	test-mlogloss:0.544145
[472]	train-mlogloss:0.447286	test-mlogloss:0.544093
[473]	train-mlogloss:0.447152	test-mlogloss:0.544057
[474]	train-mlogloss:0.446955	test-mlogloss:0.544046
[475]	train-mlogloss:0.446733	test-mlogloss:0.544025
[476]	train-mlogloss:0.446497	test-mlogloss:0.543938
[477]	train-mlogloss:0.446393	test-mlogloss:0.543918
[478]	train-mlogloss:0.446189	test-mlogloss:0.543876
[479]	train-mlogloss:0.445981	test-mlogloss:0.543817
[480]	train-mlogloss:0.445729	test-mlogloss:0.543721
[481]	train-mlogloss:0.445531	test-mlogloss:0.54369
[482]	train-mlogloss:0.4453	test-mlogloss:0.543617
[483]	train-mlogloss:0.445116	test-mlogloss:0.543523
[484]	train-mlogloss:0.444889	test-mlogloss:0.5435

[622]	train-mlogloss:0.42097	test-mlogloss:0.539447
[623]	train-mlogloss:0.420705	test-mlogloss:0.539392
[624]	train-mlogloss:0.420546	test-mlogloss:0.53939
[625]	train-mlogloss:0.420431	test-mlogloss:0.53938
[626]	train-mlogloss:0.420267	test-mlogloss:0.539337
[627]	train-mlogloss:0.420188	test-mlogloss:0.539337
[628]	train-mlogloss:0.419977	test-mlogloss:0.539272
[629]	train-mlogloss:0.419823	test-mlogloss:0.539265
[630]	train-mlogloss:0.4196	test-mlogloss:0.539219
[631]	train-mlogloss:0.419436	test-mlogloss:0.539172
[632]	train-mlogloss:0.419308	test-mlogloss:0.539146
[633]	train-mlogloss:0.419273	test-mlogloss:0.539143
[634]	train-mlogloss:0.419154	test-mlogloss:0.539121
[635]	train-mlogloss:0.418992	test-mlogloss:0.539114
[636]	train-mlogloss:0.4188	test-mlogloss:0.539104
[637]	train-mlogloss:0.418666	test-mlogloss:0.539079
[638]	train-mlogloss:0.418435	test-mlogloss:0.53901
[639]	train-mlogloss:0.418251	test-mlogloss:0.538976
[640]	train-mlogloss:0.418133	test-mlogloss:0.538936
[

[778]	train-mlogloss:0.39788	test-mlogloss:0.536954
[779]	train-mlogloss:0.397728	test-mlogloss:0.536914
[780]	train-mlogloss:0.397607	test-mlogloss:0.5369
[781]	train-mlogloss:0.397525	test-mlogloss:0.536871
[782]	train-mlogloss:0.397393	test-mlogloss:0.536862
[783]	train-mlogloss:0.397241	test-mlogloss:0.536823
[784]	train-mlogloss:0.397058	test-mlogloss:0.536823
[785]	train-mlogloss:0.396968	test-mlogloss:0.536826
[786]	train-mlogloss:0.396752	test-mlogloss:0.536787
[787]	train-mlogloss:0.396663	test-mlogloss:0.536729
[788]	train-mlogloss:0.396544	test-mlogloss:0.536726
[789]	train-mlogloss:0.396369	test-mlogloss:0.536716
[790]	train-mlogloss:0.396266	test-mlogloss:0.536694
[791]	train-mlogloss:0.396172	test-mlogloss:0.536671
[792]	train-mlogloss:0.396032	test-mlogloss:0.536674
[793]	train-mlogloss:0.395992	test-mlogloss:0.536661
[794]	train-mlogloss:0.395874	test-mlogloss:0.536669
[795]	train-mlogloss:0.395766	test-mlogloss:0.536692
[796]	train-mlogloss:0.395563	test-mlogloss:0.536

[934]	train-mlogloss:0.376799	test-mlogloss:0.535731
[935]	train-mlogloss:0.376649	test-mlogloss:0.535704
[936]	train-mlogloss:0.376588	test-mlogloss:0.535695
[937]	train-mlogloss:0.376462	test-mlogloss:0.535706
[938]	train-mlogloss:0.376354	test-mlogloss:0.535715
[939]	train-mlogloss:0.376291	test-mlogloss:0.535715
[940]	train-mlogloss:0.376173	test-mlogloss:0.535689
[941]	train-mlogloss:0.376132	test-mlogloss:0.535687
[942]	train-mlogloss:0.376045	test-mlogloss:0.535698
[943]	train-mlogloss:0.375907	test-mlogloss:0.535704
[944]	train-mlogloss:0.375769	test-mlogloss:0.535669
[945]	train-mlogloss:0.37567	test-mlogloss:0.535678
[946]	train-mlogloss:0.375617	test-mlogloss:0.535661
[947]	train-mlogloss:0.37548	test-mlogloss:0.535644
[948]	train-mlogloss:0.375408	test-mlogloss:0.535663
[949]	train-mlogloss:0.375243	test-mlogloss:0.535652
[950]	train-mlogloss:0.375149	test-mlogloss:0.535621
[951]	train-mlogloss:0.374999	test-mlogloss:0.535607
[952]	train-mlogloss:0.374805	test-mlogloss:0.53

In [16]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_starter2.csv", index=False)