In [None]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer

# Read the train and test JSON files into DataFrames.
train_df, test_df = [
    pd.read_json(json_path)
    for json_path in ("../input/train.json", "../input/test.json")
]


In [None]:

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=2000):
    """Train a 3-class ``multi:softprob`` XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and integer class labels, passed to ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y
        Optional labels for ``test_X``. When given, ``test_X`` is added to the
        watchlist and training stops once its mlogloss has not improved for
        20 rounds.
    feature_names
        Accepted for call compatibility; not used by this function.
    seed_val
        Value stored under the ``seed`` booster parameter.
    num_rounds
        Maximum number of boosting rounds.

    Returns
    -------
    tuple
        ``(pred_test_y, model)``: the output of ``model.predict`` on ``test_X``
        and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.02,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    # With early stopping the booster still holds the rounds trained after the
    # best one. Older xgboost releases expose ``best_ntree_limit`` and only use
    # the best iteration when it is passed explicitly; releases without that
    # attribute fall through to the plain call.
    if test_y is not None and hasattr(model, 'best_ntree_limit'):
        pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    else:
        pred_test_y = model.predict(xgtest)
    return pred_test_y, model



In [None]:
# Division-safe copies of the room counts: a count of 0 becomes 0.01 so the
# ratio features that divide by these columns never divide by zero.
# Series.mask builds the new column in one step; the previous chained form
# (df[col][mask] = value) raises SettingWithCopyWarning and is not guaranteed
# to write through to the DataFrame.
for frame in (train_df, test_df):
    for raw_col, calc_col in (("bedrooms", "bedrooms_calc"), ("bathrooms", "bathrooms_calc")):
        frame[calc_col] = frame[raw_col].mask(frame[raw_col] == 0, .01)

In [None]:
# Hand-corrected bathroom counts for three test listings (row label -> value).
# Assign through test_df.loc[row, column]; the chained form
# test_df["bathrooms"].loc[row] = value is not guaranteed to reach the frame.
bathroom_fixes = {19671: 1.5, 22977: 2.0, 63719: 2.0}
for row_label, fixed_count in bathroom_fixes.items():
    if row_label not in test_df.index:
        # .loc assignment on a missing label would append a new row.
        continue
    test_df.loc[row_label, "bathrooms"] = fixed_count
    # bathrooms_calc was derived from bathrooms before this correction, so it
    # must be patched too or the ratio features below use the stale value.
    if "bathrooms_calc" in test_df.columns:
        test_df.loc[row_label, "bathrooms_calc"] = fixed_count

# Cap training prices at 13000.
# NOTE(review): test prices are not capped, so price-derived features can
# differ in range between train and test - confirm this is intended.
train_df["price"] = train_df["price"].clip(upper=13000)

# Price / room-count derived features, built identically for both frames.
for frame in (train_df, test_df):
    frame["logprice"] = np.log(frame["price"])

    # Price per bedroom (bedrooms_calc has 0 replaced by 0.01).
    frame["price_t"] = frame["price"] / frame["bedrooms_calc"]

    frame["roomToBathRatio"] = frame["bedrooms"] / frame["bathrooms_calc"]

    frame["room_sum"] = frame["bedrooms_calc"] + frame["bathrooms_calc"]

    frame['price_per_room'] = frame['price'] / frame['room_sum']

    #frame["num_photos"] = frame["photos"].apply(len)

    # Number of entries in each listing's feature list.
    frame["num_features"] = frame["features"].map(len)

    #frame["num_description_words"] = frame["description"].apply(lambda x: len(x.split(" ")))

# Parse the creation timestamp and split it into calendar components.
for frame in (train_df, test_df):
    frame["created"] = pd.to_datetime(frame["created"])
    created_parts = frame["created"].dt
    frame["created_year"] = created_parts.year
    frame["created_month"] = created_parts.month
    frame["created_day"] = created_parts.day
    frame["created_hour"] = created_parts.hour

# Coarse location key: longitude and latitude rounded to 3 decimals.
for frame in (train_df, test_df):
    frame["pos"] = frame.longitude.round(3).astype(str) + '_' + frame.latitude.round(3).astype(str)

# density = number of TRAINING listings sharing the same location key.
# Keys never seen in training fall back to the smallest training count.
vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
# Computed once here; it was previously re-evaluated for every row inside the lambda.
rarest_count = vals.min()
for frame in (train_df, test_df):
    frame["density"] = frame['pos'].map(dvals).fillna(rarest_count).astype('int64')


In [None]:
#features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","price_per_room", "logprice", "density",
#"num_photos", "num_features", "num_description_words","listing_id", "created_year", "created_month", "created_day", "created_hour"]

# Base model columns; later cells append further column names to this list.
features_to_use = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
    "price_t",
    "price_per_room",
    "logprice",
    "density",
    "num_features",
    "listing_id",
    "roomToBathRatio",
    "created_year",
    "created_month",
    "created_day",
    "created_hour",
]



In [None]:
# Not running this

def _manager_level_counts(manager_ids, interest_levels):
    """Return {manager_id: [n_low, n_medium, n_high]} for the given rows.

    Managers only get an entry once a row with a recognised level is seen.
    """
    slot_of = {'low': 0, 'medium': 1, 'high': 2}
    counts = {}
    for manager, level in zip(manager_ids, interest_levels):
        slot = slot_of.get(level)
        if slot is not None:
            counts.setdefault(manager, [0, 0, 0])[slot] += 1
    return counts


def _level_fractions(level_counts):
    """Turn [n_low, n_medium, n_high] into fractions of their total."""
    total = float(sum(level_counts))
    return [n / total for n in level_counts]


# Read the two columns once as arrays; indexing the frame with .iloc for
# every row inside the loops below was the slow part of this cell.
manager_ids = train_df['manager_id'].values
interest_levels = train_df['interest_level'].values
n_rows = train_df.shape[0]
n_folds = 5

# NOTE(review): the shuffle is unseeded, so fold membership changes per run.
index = list(range(n_rows))
random.shuffle(index)

# Training rows: each row's fractions come from the other four folds only,
# so a listing's own label never contributes to its feature. Rows whose
# manager does not appear in the other folds stay NaN.
oof_fractions = np.full((n_rows, 3), np.nan)
for i in range(n_folds):
    test_index = index[int((i * n_rows) / n_folds):int(((i + 1) * n_rows) / n_folds)]
    held_out = set(test_index)
    train_index = [j for j in index if j not in held_out]

    fold_counts = _manager_level_counts(manager_ids[train_index], interest_levels[train_index])
    for j in test_index:
        seen = fold_counts.get(manager_ids[j])
        if seen is not None:
            oof_fractions[j] = _level_fractions(seen)

train_df['manager_level_low'] = oof_fractions[:, 0]
train_df['manager_level_medium'] = oof_fractions[:, 1]
train_df['manager_level_high'] = oof_fractions[:, 2]

# Test rows: fractions from the whole training set; unseen managers get NaN.
building_level = _manager_level_counts(manager_ids, interest_levels)
missing = [np.nan, np.nan, np.nan]
test_fractions = np.array(
    [_level_fractions(building_level[m]) if m in building_level else missing
     for m in test_df['manager_id'].values],
    dtype=float,
).reshape(-1, 3)
test_df['manager_level_low'] = test_fractions[:, 0]
test_df['manager_level_medium'] = test_fractions[:, 1]
test_df['manager_level_high'] = test_fractions[:, 2]

#features_to_use.append('manager_level_low') 
#features_to_use.append('manager_level_medium') 
#features_to_use.append('manager_level_high')



In [None]:
## NK additional Features-
# Not running

from sklearn.cluster import KMeans

# Standardise coordinates with the TRAINING mean/std for both frames.
xl = train_df.longitude
yl = train_df.latitude
lon_mean, lon_std = np.mean(xl), np.std(xl)
lat_mean, lat_std = np.mean(yl), np.std(yl)


def _standardised_coords(frame):
    """Return an (n_rows, 2) array of standardised (longitude, latitude)."""
    return np.column_stack((
        np.array((frame.longitude - lon_mean) / lon_std),
        np.array((frame.latitude - lat_mean) / lat_std),
    ))


# NOTE(review): no random_state is set, so cluster ids vary between runs.
kmeans = KMeans(n_clusters=20)
train_coords = _standardised_coords(train_df)
kmeans.fit(train_coords)
train_df["KMeans_Clusters"] = kmeans.predict(train_coords)

# Only predict on the test coordinates. Refitting here (the earlier
# fit_predict call) produced a second, unrelated clustering, so test
# cluster ids did not correspond to the training cluster ids.
test_df["KMeans_Clusters"] = kmeans.predict(_standardised_coords(test_df))

# Merge clusters with fewer than 1000 training rows into one catch-all id.
# Selecting by the groupby index uses the real cluster labels rather than
# assuming the labels are exactly 0..len(cl_count)-1.
cl_count = train_df.groupby("KMeans_Clusters").size()
toBeRemovedCls = cl_count[cl_count < 1000].index.tolist()
merged_cluster_id = len(cl_count) + 1
if toBeRemovedCls:
    train_df['KMeans_Clusters'] = train_df['KMeans_Clusters'].replace(toBeRemovedCls, merged_cluster_id)
    test_df['KMeans_Clusters'] = test_df['KMeans_Clusters'].replace(toBeRemovedCls, merged_cluster_id)

# NOTE(review): the column is cast to object and then used directly as a
# feature; confirm the downstream matrix building accepts an object column.
train_df['KMeans_Clusters'] = train_df['KMeans_Clusters'].astype(object)
test_df['KMeans_Clusters'] = test_df['KMeans_Clusters'].astype(object)
# Guarded so re-running the cell does not register the feature twice.
if 'KMeans_Clusters' not in features_to_use:
    features_to_use.append('KMeans_Clusters')
#####

In [None]:
#categorical = ["display_address", "manager_id", "building_id", "KMeans_Clusters"]
# Keep a copy of building_id under a second column name.
for frame in (train_df, test_df):
    frame["building_id1"] = frame["building_id"]

#categorical = ["display_address", "manager_id", "building_id"]
categorical = ["display_address"]

# Integer-encode each object-typed categorical column. The encoder is fitted on
# train and test values together so every test value has a code. Each encoded
# column is then registered as a model feature.
for column in categorical:
    if train_df[column].dtype != 'object':
        continue
    encoder = LabelEncoder()
    train_values = list(train_df[column].values)
    test_values = list(test_df[column].values)
    encoder.fit(train_values + test_values)
    train_df[column] = encoder.transform(train_values)
    test_df[column] = encoder.transform(test_values)
    features_to_use.append(column)


In [None]:
## NK additional Features

#a_train=max(train_df['created'])-train_df['created']
#a_test =max(test_df['created'])-test_df['created']

#train_df['daysFromCreated']=a_train/np.timedelta64(1, 'D')
#test_df['daysFromCreated']=a_test/np.timedelta64(1, 'D')
#features_to_use.append('daysFromCreated')
###

In [None]:
def _join_feature_tokens(feature_list):
    """Join a listing's features into one string, one underscore-joined token per feature."""
    return " ".join(entry.replace(" ", "_") for entry in feature_list)


# Replaces the list-valued column with its string form (not re-runnable as-is).
for frame in (train_df, test_df):
    frame['features'] = frame["features"].apply(_join_feature_tokens)

# Despite the variable name this is a plain count vectoriser, limited to the
# 200 most frequent tokens; the vocabulary is learned from the training frame.
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])


In [None]:
# 0/1 availability flags. Each column is built in one vectorised step; the
# previous pattern (set the column to 1, then df[col][mask] = 0) is chained
# assignment, which raises SettingWithCopyWarning and is not guaranteed to
# write through to the DataFrame.
for frame in (train_df, test_df):
    # building_id '0' is treated as "no building id".
    frame['is_building_id_Av'] = (frame['building_id'] != '0').astype('int64')
    frame['is_description_Av'] = (frame['description'] != '').astype('int64')
    # Relies on 'features' already being the joined string form, where an
    # empty feature list became ''.
    frame['is_features_Av'] = (frame['features'] != '').astype('int64')


#features_to_use.append('is_building_id_Av')
# Guarded so re-running the cell does not register the feature twice.
if 'is_description_Av' not in features_to_use:
    features_to_use.append('is_description_Av')
#features_to_use.append('is_features_Av')

In [None]:
## NK for cross validation
# Random hold-out split: each training row is kept for fitting when its
# uniform draw is <= 0.80; the remaining rows form the validation set.
in_fit_split = np.random.uniform(0, 1, len(train_df)) <= .80
train_df['is_train'] = in_fit_split
train_train_df = train_df[train_df['is_train']]
test_train_df = train_df[~train_df['is_train']]

# Re-fit the vectoriser on the fitting rows only, then apply it to the hold-out rows.
train_df_tr_sparse = tfidf.fit_transform(train_train_df["features"])
train_df_te_sparse = tfidf.transform(test_train_df["features"])

##NK

In [None]:
## NK
# Dense feature columns side by side with the token counts, as CSR matrices.
cv_fit_blocks = [train_train_df[features_to_use], train_df_tr_sparse]
cv_holdout_blocks = [test_train_df[features_to_use], train_df_te_sparse]
train_X_cv = sparse.hstack(cv_fit_blocks).tocsr()
test_X_cv = sparse.hstack(cv_holdout_blocks).tocsr()

##

In [None]:
## NK
# Class index for each interest level.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_train_y = np.array(train_train_df['interest_level'].map(lambda level: target_num_map[level]))
test_train_y = np.array(test_train_df['interest_level'].map(lambda level: target_num_map[level]))
###

In [None]:
## NK
# Fit on the CV training split; passing test_y makes runXGB watch the hold-out split.
preds, model = runXGB(
    train_X=train_X_cv,
    train_y=train_train_y,
    test_X=test_X_cv,
    test_y=test_train_y,
    num_rounds=2000,
)
##

In [None]:
##NK GBM
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import metrics   #Additional sklearn functions
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# provides cross_val_score with the same call shape. It is bound to the old
# name so the cells that call cross_validation.cross_val_score keep working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

##

In [None]:
###NK GBM
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5, target='Disbursed'):
    """Fit ``alg`` on ``dtrain`` and print a short training / CV report.

    Parameters
    ----------
    alg
        Estimator providing ``fit``, ``predict``, ``predict_proba`` and, when
        ``printFeatureImportance`` is true, ``feature_importances_``.
    dtrain
        DataFrame holding the predictor columns and the target column.
    predictors
        Column names used as model inputs.
    performCV
        When true, also run ``cv_folds``-fold cross-validation scored by ROC AUC.
    printFeatureImportance
        When true, draw a bar chart of the fitted feature importances.
    cv_folds
        Number of cross-validation folds.
    target
        Name of the target column (defaults to the previously hard-coded
        ``'Disbursed'``).

    Notes
    -----
    The AUC lines use column 1 of ``predict_proba`` and ``roc_auc`` scoring.
    NOTE(review): that reads as a binary-target report - confirm before using
    it with a multi-class target.
    """
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target])

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    #Perform cross-validation:
    if performCV:
        cv_score = cross_validation.cross_val_score(alg, dtrain[predictors], dtrain[target], cv=cv_folds, scoring='roc_auc')

    #Print model report (print() with one argument works on Python 2 and 3):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    #Print Feature Importance:
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        # Label the axes object returned by .plot(); the notebook never
        # imports matplotlib.pyplot, so the former plt.ylabel call had no plt.
        ax = feat_imp.plot(kind='bar', title='Feature Importances')
        ax.set_ylabel('Feature Importance Score')
        

##############

In [None]:
###########NK  GBM
# Gradient-boosting baseline configuration.
gbm0 = GradientBoostingClassifier(
    learning_rate=0.02,
    max_depth=8,
    n_estimators=450,
    subsample=0.7,
)
###########

In [None]:
###########NK  GBM
# Fit the GBM on the CV training split.
gbm0.fit(X=train_X_cv, y=train_train_y)
#############

In [None]:
###########NK  GBM
# 5-fold CV of the GBM on the CV training split.
# 'mlogloss' is an xgboost metric name, not a scikit-learn scorer; the
# scikit-learn scorer for multi-class log loss is 'neg_log_loss', so the
# returned scores are negated log-loss values (closer to 0 is better).
cv_score = cross_validation.cross_val_score(gbm0, train_X_cv, train_train_y, cv=5, scoring='neg_log_loss')
#############

In [None]:
# Class probabilities from the GBM for the hold-out split.
pred_gbm_probs = gbm0.predict_proba(X=test_X_cv)

In [None]:
# Hold-out log loss of the GBM.
log_loss(y_true=test_train_y, y_pred=pred_gbm_probs)

In [None]:
##NK Random Forest

from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline fitted on the CV training split.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
)
clf.fit(X=train_X_cv, y=train_train_y)

####

##NK Random Forest

#preds_rf=clf.predict(test_X_cv)
# Class probabilities for the hold-out split.
pred_rf_probs = clf.predict_proba(X=test_X_cv)
####

# Hold-out log loss of the random forest.
log_loss(y_true=test_train_y, y_pred=pred_rf_probs)

In [None]:
# Hold-out log loss of the equal-weight average of the RF and XGBoost probabilities.
blended_probs = (pred_rf_probs + preds) / 2
log_loss(y_true=test_train_y, y_pred=blended_probs)

In [None]:
# Hold-out log loss of the XGBoost predictions alone.
log_loss(y_true=test_train_y, y_pred=preds)

In [None]:
# Full-data design matrices: dense feature columns next to the token counts.
full_train_blocks = [train_df[features_to_use], tr_sparse]
full_test_blocks = [test_df[features_to_use], te_sparse]
train_X = sparse.hstack(full_train_blocks).tocsr()
test_X = sparse.hstack(full_test_blocks).tocsr()

# Class index for each interest level.
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].map(lambda level: target_num_map[level]))

# No test_y is passed, so runXGB trains for the full num_rounds.
preds, model = runXGB(train_X=train_X, train_y=train_y, test_X=test_X, num_rounds=2000)

# Probability columns are named in class-index order (0, 1, 2) of target_num_map.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("sub51.csv", index=False)