In [None]:
import numpy as np 
import pandas as pd 

# load data: read the train and test JSON files from ../input into DataFrames

train_json_path = '../input/train.json'
test_json_path = '../input/test.json'
data_train = pd.read_json(train_json_path)
data_test = pd.read_json(test_json_path)

In [None]:
# cluster the training lat longs to find 10 centroids

from sklearn.cluster import KMeans

# Two-column frame on a fresh RangeIndex. The column order (latitude first,
# longitude second) fixes the layout of every centroid row as [lat, long].
latlong_X = pd.DataFrame({
    'latitude': data_train.latitude.values,
    'longitude': data_train.longitude.values,
})

# n_init is pinned explicitly: the library default for the number of
# restarts is not the same in every scikit-learn release, so leaving it
# implicit makes the centroids depend on the installed version.
latlong_kmeans = KMeans(n_clusters=10, n_init=10, random_state=1234).fit(latlong_X)
kmeans_centroids = latlong_kmeans.cluster_centers_

print(kmeans_centroids)

In [None]:
# sparse matrix of features

from sklearn.feature_extraction.text import CountVectorizer


def _features_to_document(feature_list):
    """Collapse one listing's feature list into a single string.

    Spaces inside each feature are replaced by underscores, then the
    features are joined with single spaces, so every feature becomes one
    space-free chunk of the resulting string.
    """
    return " ".join(feature.replace(" ", "_") for feature in feature_list)


features = data_train['features'].apply(_features_to_document)
features_sub = data_test['features'].apply(_features_to_document)

# The vocabulary (capped at 200 terms) is learned from the training text only
# and then re-used unchanged to encode the test text.
feat_cv = CountVectorizer(stop_words='english', max_features=200)
features_sparse = feat_cv.fit_transform(features)
features_sparse_sub = feat_cv.transform(features_sub)

In [None]:
# data prep function
def prep_input(X, kmeans_centroids):
    """Derive the numeric feature frame used for modelling.

    Parameters
    ----------
    X : pandas.DataFrame
        Listings frame. Must contain the columns read below: 'latitude',
        'longitude', 'price', 'bedrooms', 'bathrooms', 'features', plus the
        columns that are dropped at the end ('building_id', 'description',
        'created', 'display_address', 'manager_id', 'photos',
        'street_address').
    kmeans_centroids : sequence of (latitude, longitude) pairs
        One distance column 'centroid<i>' is produced per pair, in order.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns added and the dropped columns
        removed. The frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain the
    # derived columns as a side effect of calling this function.
    X = X.copy()

    X['longlat'] = X['longitude'] * X['latitude']

    # polynomial terms of the basic numeric columns
    X['price2'] = X['price'] ** 2
    X['price3'] = X['price'] ** 3
    X['bedrooms2'] = X['bedrooms'] ** 2
    X['bedrooms3'] = X['bedrooms'] ** 3
    X['bathrooms2'] = X['bathrooms'] ** 2
    X['bathrooms3'] = X['bathrooms'] ** 3

    # bedroom / bathroom interaction terms
    X['bedrooms+bathrooms'] = X['bedrooms'] + X['bathrooms']
    X['bedrooms*bathrooms'] = X['bedrooms'] * X['bathrooms']
    X['bedrooms2+bathrooms2'] = X['bedrooms']**2 + X['bathrooms']**2
    X['bedrooms2*bathrooms2'] = X['bedrooms']**2 * X['bathrooms']**2
    X['bedrooms3+bathrooms3'] = X['bedrooms']**3 + X['bathrooms']**3
    X['bedrooms3*bathrooms3'] = X['bedrooms']**3 * X['bathrooms']**3

    # price ratios; the denominator is floored at 1 to avoid dividing by zero
    X['priceperbed'] = X['price'] / np.maximum(X['bedrooms'], 1)
    X['priceperbath'] = X['price'] / np.maximum(X['bathrooms'], 1)
    X['priceperroom'] = X['price'] / np.maximum(X['bedrooms'] + X['bathrooms'], 1)

    # Euclidean distance (in raw latitude/longitude units) to each centroid
    for idx, (centroid_lat, centroid_long) in enumerate(kmeans_centroids):
        X['centroid%d' % idx] = np.sqrt(
            (centroid_lat - X['latitude'])**2 + (centroid_long - X['longitude'])**2
        )

    X['num_features'] = X['features'].apply(len)

    # `columns=` keyword: passing the axis positionally is rejected by
    # recent pandas releases.
    X = X.drop(columns=['building_id', 'description', 'created', 'display_address',
                        'features', 'manager_id', 'photos', 'street_address'])

    return X

# Training matrix is built from the training frame minus the target column;
# the target is kept separately and the test frame goes through the same prep.
# `columns=` keyword: passing the axis positionally is rejected by recent pandas.
X = prep_input(data_train.drop(columns='interest_level'), kmeans_centroids)
y = data_train['interest_level']
X_sub = prep_input(data_test, kmeans_centroids)

# preview the first rows instead of rendering the entire frame
X.head()

In [None]:
# join sparse features into features

from scipy import sparse

# remember the dense column names before X is replaced by a sparse matrix
columns = X.columns.values

# dense engineered columns first, vectorizer counts appended to the right
train_blocks = [X, features_sparse]
sub_blocks = [X_sub, features_sparse_sub]
X = sparse.hstack(train_blocks).tocsr()
X_sub = sparse.hstack(sub_blocks).tocsr()

In [None]:
# split into train and test for local validation

from sklearn.model_selection import train_test_split

# 20% of the rows are held out; the fixed seed keeps the split repeatable
holdout_split = train_test_split(X, y, test_size=0.2, random_state=1234)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# fit model

from sklearn.ensemble import RandomForestClassifier

# random_state is fixed (same seed as the clustering and the split) so the
# forest, the validation score and the submission are repeatable across runs.
rf = RandomForestClassifier(n_estimators=1000, verbose=1, n_jobs=-1, random_state=1234)
rf = rf.fit(X_train, y_train)



In [None]:
# print feature importances

# Newer scikit-learn exposes the vocabulary through get_feature_names_out();
# fall back to the older accessor when running against an older release.
if hasattr(feat_cv, 'get_feature_names_out'):
    vocab_names = feat_cv.get_feature_names_out()
else:
    vocab_names = feat_cv.get_feature_names()

# Name order mirrors the column order of the stacked matrix: dense columns
# first, then the vectorizer terms.
feature_names = list(columns) + list(vocab_names)

# (importance, name) pairs, most important first
sorted(zip(rf.feature_importances_, feature_names), reverse=True)


In [None]:
# local validation using logloss

from sklearn.metrics import log_loss

y_pred = rf.predict_proba(X_test)

# The probability columns follow rf.classes_. Passing that order explicitly
# keeps the columns aligned with the labels even if a class happens to be
# absent from the held-out targets.
print('log_loss', log_loss(y_test, y_pred, labels=rf.classes_))

In [None]:
# re-fit model on all training data

rf = rf.fit(X, y)
y_sub = rf.predict_proba(X_sub)

# one probability column per class, labelled with the fitted class names
df = pd.DataFrame(y_sub, columns=rf.classes_)


In [None]:
# attach the test listing ids to the predictions and write them to disk
submission_ids = data_test.listing_id.values
df["listing_id"] = submission_ids

df.to_csv("sub1.csv", index=False)