In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set_style("whitegrid")

In [None]:
# Read the training and test listings from their JSON files.
train_data, test_data = (
    pd.read_json(json_path)
    for json_path in ('../input/train.json', '../input/test.json')
)
# Show a single row to get a feel for the available columns.
train_data.head(n=1)

In [None]:
# Count the missing values in every column of the training frame.
train_data.isnull().sum(axis=0)

In [None]:
# Explore the training data - distribution of the target, interest_level.
# The column is passed by keyword (x=..., data=...): recent seaborn releases accept
# only `data` positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x="interest_level", data=train_data, order=['low', 'medium', 'high']);
plt.xlabel('Interest Level');
plt.ylabel('Number of occurrences');

In [None]:
# Number of listings per bathroom count.
# Keyword form (x=..., data=...) because recent seaborn releases accept only `data`
# as a positional argument.
sns.countplot(x="bathrooms", data=train_data);
plt.xlabel('Bathrooms');
plt.ylabel('Number of occurrences');

In [None]:
# Number of listings per bedroom count.
# Keyword form (x=..., data=...) because recent seaborn releases accept only `data`
# as a positional argument.
sns.countplot(x="bedrooms", data=train_data);
plt.xlabel('Bedrooms');
plt.ylabel('Number of occurrences');

In [None]:
# Price against bedroom count; jitter spreads the points horizontally within each bedroom group.
sns.stripplot(
    data=train_data,
    x="bedrooms",
    y="price",
    jitter=True,
);

In [None]:
# Map of the listings coloured by interest level.
# Keep only rows strictly inside the 0.5%-99.5% quantile range of both coordinates,
# so a handful of far-away points does not squash the plot.
geo_mask = (
    (train_data.longitude > train_data.longitude.quantile(0.005))
    & (train_data.longitude < train_data.longitude.quantile(0.995))
    & (train_data.latitude > train_data.latitude.quantile(0.005))
    & (train_data.latitude < train_data.latitude.quantile(0.995))
)
# `height` is the current name of the former `size` argument of lmplot.
sns.lmplot(x="longitude", y="latitude", fit_reg=False, hue='interest_level',
           hue_order=['low', 'medium', 'high'], height=9, scatter_kws={'alpha':0.4,'s':30},
           data=train_data[geo_mask]);
plt.xlabel('Longitude');
plt.ylabel('Latitude');

In [None]:
# Tag every row with its origin, then stack both frames so that feature
# engineering can be applied to train and test in one pass.
train_data["Source"], test_data["Source"] = 'train', 'test'
data = pd.concat(objs=[train_data, test_data])

In [None]:
# Simple size features: number of photos, number of listed features,
# and number of words in the description.
data["num_photos"]=data["photos"].apply(len)
data["num_features"]=data["features"].apply(len)
# split() without an argument splits on runs of whitespace and drops empty tokens.
# split(" ") counted an empty description as one word and every repeated or
# trailing space as an extra word.
data["num_description_words"] = data["description"].apply(lambda x: len(x.split()))

In [None]:
# Parse the creation timestamp once, then derive the calendar parts from it.
created_ts = pd.to_datetime(data["created"])
data["created"] = created_ts
data["created_month"] = created_ts.dt.month
data["created_day"] = created_ts.dt.day
data["created_hour"] = created_ts.dt.hour

In [None]:
# Columns fed to the baseline model, grouped by where they come from.
features_to_use = (
    ["bathrooms", "bedrooms", "price"]                            # raw listing attributes
    + ["num_photos", "num_features", "num_description_words"]     # count features
    + ["created_month", "created_day", "created_hour"]            # creation-time parts
)

In [None]:
# Integer encoding of the target classes.
target_num_map = {"high": 0, "medium": 1, "low": 2}

# Separate the combined frame back into its train and test parts.
source_tag = data["Source"]
train = data[source_tag == "train"]
test = data[source_tag == "test"]

# Encoded target for the training rows.
y = np.array(train["interest_level"].apply(lambda level: target_num_map[level]))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train[features_to_use],
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier as RFC, VotingClassifier

# Baseline model: soft-voting ensemble of a random forest and gradient boosting,
# with the forest weighted twice as heavily (8 vs 4).
clf1 = RFC(n_estimators=1000, random_state=42)
# Seed the booster as well, so the whole ensemble is reproducible across runs
# (the forest was already seeded).
clf2 = GradientBoostingClassifier(random_state=42)
classifier = VotingClassifier(
    estimators=[('rfc', clf1), ('gbc', clf2)],
    voting='soft',
    weights=[8, 4],
)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import log_loss
# Class probabilities on the held-out rows, scored with multi-class log-loss.
y_val_pred = classifier.predict_proba(X_val)
log_loss(y_true=y_val, y_pred=y_val_pred)

In [None]:
from sklearn.cluster import Birch
def cluster_latlon(n_clusters, data ):
    """Cluster the NYC-area listings by coordinates and plot the result.

    Rows inside the bounding box -74.05 < longitude < -73.75 and
    40.4 < latitude < 40.9 are clustered with Birch on (latitude, longitude).
    Every other row is kept and given the cluster label -1.

    Parameters
    ----------
    n_clusters : int
        Number of clusters requested from Birch; also used in the name of
        the new column, "cluster_<n_clusters>".
    data : pandas.DataFrame
        Frame with "longitude" and "latitude" columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all rows of `data` (NYC rows first, then the
        others) plus the "cluster_<n_clusters>" column.

    Side effects
    ------------
    Draws and shows a scatter plot of the clustered NYC rows.
    """
    # Split the data between "around NYC" and "other locations".
    # The mask is built once and negated as a whole: a leading `~` on a chain of
    # `&` comparisons negates only the first comparison, which silently dropped
    # every row east, north or south of the box.
    in_nyc = (
        (data.longitude > -74.05) & (data.longitude < -73.75)
        & (data.latitude > 40.4) & (data.latitude < 40.9)
    )
    # Copies, so the column assignments below never write into slices of `data`.
    data_c = data[in_nyc].copy()
    data_e = data[~in_nyc].copy()

    cluster_col = "cluster_" + str(n_clusters)

    # Coordinates as a plain 2-D array (DataFrame.as_matrix no longer exists in
    # current pandas; .values works in old and new versions alike).
    coords = data_c[['latitude', "longitude"]].values

    # `copy` is left at its default (True); passing it explicitly is deprecated
    # in recent scikit-learn releases.
    brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01, compute_labels=True)
    brc.fit(coords)

    data_c[cluster_col] = brc.predict(coords)
    data_e[cluster_col] = -1  # label -1 for the non-NYC listings
    data = pd.concat([data_c, data_e])

    plt.scatter(data_c["longitude"], data_c["latitude"], c=data_c[cluster_col], s=10, linewidth=0.1)
    plt.title(str(n_clusters)+" Neighbourhoods from clustering")
    plt.show()
    return data

In [None]:
# Try 12 neighbourhoods; show only the first rows of the result instead of
# dumping the whole frame into the cell output.
cluster_latlon(12, data).head()


In [None]:
from sklearn.metrics import log_loss


def compute_logloss(n_cluster, data, clf=None):
    """Validation log-loss of a model that also sees the neighbourhood cluster.

    Clusters `data` with ``cluster_latlon(n_cluster, data)``, keeps the rows
    whose "Source" is "train", splits them 67/33 with a fixed seed, fits `clf`
    on the larger part and scores its predicted probabilities on the rest.

    Parameters
    ----------
    n_cluster : int
        Number of clusters passed to ``cluster_latlon``.
    data : pandas.DataFrame
        Combined frame holding the engineered feature columns, "Source" and
        "interest_level".
    clf : estimator with ``fit`` / ``predict_proba``, optional
        Model to fit. When omitted, an unfitted clone of the notebook-level
        ``classifier`` is used, so the score is comparable with the baseline
        and the already fitted baseline model is left untouched.

    Returns
    -------
    The value returned by ``log_loss`` for the validation split.
    """
    if clf is None:
        # Imported here so the function does not depend on another cell's imports.
        from sklearn.base import clone
        clf = clone(classifier)

    data_cluster=cluster_latlon(n_cluster,data)
    train=data_cluster[data_cluster["Source"]=="train"]

    # Same integer encoding of the target as used for the baseline model.
    target_num_map={"high":0, "medium":1, "low":2}
    y=np.array(train["interest_level"].apply(lambda x: target_num_map[x]))

    # Baseline features plus the cluster label produced above.
    features = ["bathrooms", "bedrooms", "price",
                "num_photos", "num_features", "num_description_words",
                "created_month", "created_day", "created_hour", "cluster_"+str(n_cluster)
               ]

    X_train, X_val,y_train, y_val =train_test_split( train[features], y, test_size=0.33, random_state=42)
    clf.fit(X_train, y_train)

    y_val_pred = clf.predict_proba(X_val)
    return log_loss(y_val, y_val_pred)

In [None]:
# Validation log-loss when the listings are grouped into 3 neighbourhoods.
compute_logloss(n_cluster=3, data=data)

In [None]:
# Sweep the number of neighbourhood clusters and record the validation log-loss of each run.
log_loss_cls={}
for n in range(4,15):
    log_loss_cls[n]=compute_logloss(n, data)

# Sort by cluster count so the line is drawn from left to right.
n_c = sorted(log_loss_cls.items())
# Deliberately not unpacked into `x, y`: `y` is the notebook's global target array,
# and rebinding it would break a later re-run of the train/validation split cell.
cluster_counts, cluster_losses = zip(*n_c)
plt.plot(cluster_counts, cluster_losses)
plt.xlabel("Number of clusters")
plt.ylabel("Validation log-loss")
plt.title("log_loss for different numbers of clusters")
plt.show()

In [None]:
# Display the {number of clusters: validation log-loss} mapping filled by the sweep.
log_loss_cls