In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which files are available in the input directory (shells out to `ls`).
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier
#from sklearn.svm import SVC
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# read data

In [None]:
# Load the training listings. Passing the path lets pandas open and close
# the file itself; the previous bare open() handle was never closed.
df = pd.read_json("../input/train.json")

# Numeric encoding of the target: low -> 0, medium -> 0.5, high -> 1.
# Any other value falls back to 0, exactly as the original default did.
df['response'] = (
    df['interest_level']
    .map({'low': 0., 'medium': 0.5, 'high': 1.})
    .fillna(0.)
)

# Neighbourhood-interest feature; the grid encoding step adds the per-cell
# mean response to it. It starts at zero rather than at `response`: seeding
# it with the listing's own label puts the target straight into a model
# feature, and that value cannot be reproduced for unlabeled data.
df['mm'] = 0.

In [None]:
# Report the (rows, columns) size of the frame that was just loaded.
frame_shape = df.shape
print(frame_shape)

In [None]:
#df.head()

In [None]:
res = 10    # number of grid edges per axis -> (res - 1) x (res - 1) cells
min_n = 30  # a cell is encoded only when it holds more than `min_n` listings

# Define grids: evenly spaced edges over the full coordinate range.
nx = np.linspace(df.longitude.min(), df.longitude.max(), res)
ny = np.linspace(df.latitude.min(), df.latitude.max(), res)

# Encode
# NOTE: upper cell edges are exclusive, so a listing sitting exactly on the
# maximum longitude or latitude falls into no cell.
cell_stats = []  # one Series of per-cell statistics for every encoded cell
for i in range(res - 1):
    in_lon = (df.longitude >= nx[i]) & (df.longitude < nx[i+1])
    for j in range(res - 1):
        # Identify listings within the square
        ix = in_lon & (df.latitude >= ny[j]) & (df.latitude < ny[j+1])
        n = int(ix.sum())
        # Compute mean interest if the number of listings is greater than 'min_n'
        if n > min_n:
            # numeric_only=True: the frame also carries list and string
            # columns, and averaging those raises TypeError on recent pandas.
            y = df.loc[ix, :].mean(numeric_only=True)  # per-cell means, incl. 'response'
            y['n'] = n  # volume
            cell_stats.append(y)
            # Add the cell's mean response to the feature of its listings.
            df.loc[ix, 'mm'] += y['response']

# Assemble the summary once instead of growing it with concat inside the loop
# (rows = statistics, columns = encoded cells).
Y = pd.concat(cell_stats, axis=1) if cell_stats else pd.DataFrame()

In [None]:
# Show location coordinates before outlier removal
fig, ax = plt.subplots(1, 2, figsize=(9,6))
print('Length before removing ouliers', len(df))
ax[0].plot(df.longitude, df.latitude, '.')
ax[0].set_title('Before outlier removal')
ax[0].set_xlabel('Longitude')
ax[0].set_ylabel('Latitude')

# Outlier removal: for each coordinate, blank out values lying more than
# three standard deviations from the median, and repeat until none remain.
# The median and the standard deviation are recomputed on every pass, so each
# pass works on the values that survived the previous one.
for i in ['latitude', 'longitude']:
    while True:
        x = df[i].median()
        ix = (df[i] - x).abs() > 3*df[i].std()

        if not ix.any(): # no more outliers -> stop
            break

        df.loc[ix, i] = np.nan # exclude outliers

# Keep only listings that still have both coordinates. The explicit copy()
# makes `df` an independent frame, so the columns added to it in later cells
# do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['latitude', 'longitude']).copy()
print('Length after removing ouliers', len(df))

# Show location coordinates after outlier removal
ax[1].plot(df.longitude, df.latitude, 'r.')
ax[1].set_title('After outlier removal')
ax[1].set_xlabel('Longitude')
ax[1].set_ylabel('Latitude');

# naive feature engineering

In [None]:
    # Work on a standardised copy of the coordinates (zero mean, unit standard
    # deviation per axis); the listing frame's own coordinates are untouched.
    temp = df[['longitude', 'latitude']].copy()
    for axis_name in ('longitude', 'latitude'):
        centred = temp[axis_name] - temp[axis_name].mean()
        temp[axis_name] = centred / temp[axis_name].std()
    # Fit 20 clusters with a fixed seed and store each listing's cluster id.
    r = KMeans(20, random_state=1).fit(temp[['longitude', 'latitude']])
    df['labels'] = r.labels_

In [None]:
# Size-based features: number of photos and number of listed features.
for source_col, count_col in (("photos", "num_photos"), ("features", "num_features")):
    df[count_col] = df[source_col].map(len)

# Description length, counted as the number of single-space separated pieces.
df["num_description_words"] = df["description"].map(lambda text: len(text.split(" ")))

# Parse the creation timestamp once, then expose its calendar parts.
created_ts = pd.to_datetime(df["created"])
df["created"] = created_ts
for part in ("year", "month", "day"):
    df["created_" + part] = getattr(created_ts.dt, part)

Plot the k-means clusters

    # Plot results
    # A dedicated figure is created here: the former `ax[ix]` indexed the axes
    # array with the boolean mask left over from the outlier loop.
    ncomp = 20
    fig, ax = plt.subplots(figsize=(9, 9))
    cols = sns.color_palette("Set2", n_colors=ncomp, desat=.5)
    cl = [cols[i] for i in df['labels']]  # one colour per listing, by cluster id
    area = 12
    ax.scatter(df.longitude, df.latitude, s=area, c=cl, alpha=0.5)
    ax.set_title('Number of components: ' + str(ncomp))
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Show aggregated volume and interest at each neighborhood
    x = df.groupby('labels')[['longitude', 'latitude', 'response']].mean()
    # value_counts() is indexed by cluster id, so it aligns with the group means.
    x['n'] = df['labels'].value_counts()
    x = x.sort_values('response')
    cols = sns.color_palette("RdBu_r", ncomp)[::-1]
    for i in range(len(x)):
        props = dict(boxstyle='round', facecolor=cols[i], alpha=0.8)
        # Plain string formatting: the former '|S8' byte arrays rendered as
        # "b'0.12'" under Python 3.
        caption = '{:.2f}\n{:d}'.format(x['response'].values[i], int(x['n'].values[i]))
        ax.text(x.longitude.values[i], x.latitude.values[i], caption,
                fontsize=9, verticalalignment='center', horizontalalignment='center', bbox=props);

In [None]:
# Columns fed to the model. To change the feature set, edit `num_feats`;
# the fuller candidate list is kept here for reference:
#num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
#             "num_photos", "num_features", "num_description_words",
#             "created_year", "created_month", "created_day"]
num_feats = ["bathrooms", "bedrooms", "num_photos", "price", "mm"]

# Feature matrix and target labels, then a quick look at the first rows.
X = df.loc[:, num_feats]
y = df.loc[:, "interest_level"]
X.head()

# train model

In [None]:
# Hold out 30% of the listings for validation. The fixed seed makes the split,
# and therefore the validation score, repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Random forest on the selected features. The seed makes the fit and the
# reported score repeatable; n_jobs=-1 builds the trees on all available cores.
# Alternatives tried earlier: SVC(probability=True), MLPClassifier(hidden_layer_sizes=(10,)).
clf = RandomForestClassifier(n_estimators=1000, random_state=1, n_jobs=-1)
clf.fit(X_train, y_train)

# predict_proba columns follow clf.classes_; handing the same order to
# log_loss keeps labels and columns aligned even if a class happens to be
# missing from the validation split.
y_val_pred = clf.predict_proba(X_val)
log_loss(y_val, y_val_pred, labels=clf.classes_)

# make prediction

# Per-cell mean interest, recomputed from the training frame while `df` still
# holds it (it is rebound to the test frame just below). The test data has no
# interest labels, so its `mm` feature has to come from these training-side
# means; without it `df[num_feats]` fails with a KeyError on 'mm'.
# The guard keeps the cell re-runnable: on a second run `df` is already the
# test frame and the means from the first run are reused.
if 'response' in df.columns:
    cell_means = {}
    for i in range(res - 1):
        in_lon = (df.longitude >= nx[i]) & (df.longitude < nx[i+1])
        for j in range(res - 1):
            in_cell = in_lon & (df.latitude >= ny[j]) & (df.latitude < ny[j+1])
            if in_cell.sum() > min_n:
                cell_means[(i, j)] = df.loc[in_cell, 'response'].mean()

# Passing the path lets pandas open and close the file itself.
df = pd.read_json("../input/test.json")
print(df.shape)

# Test listings inherit the mean of the training cell they fall in; listings
# outside every encoded cell keep 0.
# NOTE(review): this matches the training-side `mm` only when that column is
# the plain cell mean, i.e. not seeded with each listing's own `response`.
df['mm'] = 0.
for (i, j), cell_mean in cell_means.items():
    in_cell = ((df.longitude >= nx[i]) & (df.longitude < nx[i+1])
               & (df.latitude >= ny[j]) & (df.latitude < ny[j+1]))
    df.loc[in_cell, 'mm'] = cell_mean

# NOTE(review): the clusters are re-fit on the test coordinates, so these ids
# are not comparable with the training `labels`; harmless while `labels` is
# not part of `num_feats`.
r = KMeans(20, random_state=1)
# Normalize (longitude, latitude) before K-means
temp = df[['longitude', 'latitude']].copy()
temp['longitude'] = (temp['longitude']-temp['longitude'].mean())/temp['longitude'].std()
temp['latitude'] = (temp['latitude']-temp['latitude'].mean())/temp['latitude'].std()
# Fit k-means and get labels
r.fit(temp[['longitude', 'latitude']])
df['labels'] = r.labels_




# Size-based features: number of photos and number of listed features.
for source_col, count_col in (("photos", "num_photos"), ("features", "num_features")):
    df[count_col] = df[source_col].map(len)

# Description length, counted as the number of single-space separated pieces.
df["num_description_words"] = df["description"].map(lambda text: len(text.split(" ")))

# Parse the creation timestamp once, then expose its calendar parts.
created_ts = pd.to_datetime(df["created"])
df["created"] = created_ts
for part in ("year", "month", "day"):
    df["created_" + part] = getattr(created_ts.dt, part)

# Feature matrix, using the same columns the model was trained on.
X = df.loc[:, num_feats]

# Class probabilities for every listing in X.
y = clf.predict_proba(X)

# Position of each class name within clf.classes_, used to pick the matching
# probability column.
labels2idx = dict(zip(clf.classes_, range(len(clf.classes_))))

# Submission table: listing id plus one probability column per class,
# written in the order high, medium, low.
sub = pd.DataFrame({"listing_id": df["listing_id"]})
for label in ("high", "medium", "low"):
    sub[label] = y[:, labels2idx[label]]
sub.to_csv("submission_rf.csv", index=False)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which files are available in the input directory (shells out to `ls`).
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier
#from sklearn.svm import SVC
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# read data

In [None]:
# Load the training listings. Passing the path lets pandas open and close
# the file itself; the previous bare open() handle was never closed.
df = pd.read_json("../input/train.json")

# Numeric encoding of the target: low -> 0, medium -> 0.5, high -> 1.
# Any other value falls back to 0, exactly as the original default did.
df['response'] = (
    df['interest_level']
    .map({'low': 0., 'medium': 0.5, 'high': 1.})
    .fillna(0.)
)

# Neighbourhood-interest feature; the grid encoding step adds the per-cell
# mean response to it. It starts at zero rather than at `response`: seeding
# it with the listing's own label puts the target straight into a model
# feature, and that value cannot be reproduced for unlabeled data.
df['mm'] = 0.

In [None]:
# Report the (rows, columns) size of the frame that was just loaded.
frame_shape = df.shape
print(frame_shape)

In [None]:
#df.head()

In [None]:
res = 10    # number of grid edges per axis -> (res - 1) x (res - 1) cells
min_n = 30  # a cell is encoded only when it holds more than `min_n` listings

# Define grids: evenly spaced edges over the full coordinate range.
nx = np.linspace(df.longitude.min(), df.longitude.max(), res)
ny = np.linspace(df.latitude.min(), df.latitude.max(), res)

# Encode
# NOTE: upper cell edges are exclusive, so a listing sitting exactly on the
# maximum longitude or latitude falls into no cell.
cell_stats = []  # one Series of per-cell statistics for every encoded cell
for i in range(res - 1):
    in_lon = (df.longitude >= nx[i]) & (df.longitude < nx[i+1])
    for j in range(res - 1):
        # Identify listings within the square
        ix = in_lon & (df.latitude >= ny[j]) & (df.latitude < ny[j+1])
        n = int(ix.sum())
        # Compute mean interest if the number of listings is greater than 'min_n'
        if n > min_n:
            # numeric_only=True: the frame also carries list and string
            # columns, and averaging those raises TypeError on recent pandas.
            y = df.loc[ix, :].mean(numeric_only=True)  # per-cell means, incl. 'response'
            y['n'] = n  # volume
            cell_stats.append(y)
            # Add the cell's mean response to the feature of its listings.
            df.loc[ix, 'mm'] += y['response']

# Assemble the summary once instead of growing it with concat inside the loop
# (rows = statistics, columns = encoded cells).
Y = pd.concat(cell_stats, axis=1) if cell_stats else pd.DataFrame()

# Show location coordinates before outlier removal
fig, ax = plt.subplots(1, 2, figsize=(9,6))
print('Length before removing ouliers', len(df))
ax[0].plot(df.longitude, df.latitude, '.')
ax[0].set_title('Before outlier removal')
ax[0].set_xlabel('Longitude')
ax[0].set_ylabel('Latitude')

# Outlier removal: for each coordinate, blank out values lying more than
# three standard deviations from the median, and repeat until none remain.
# The median and the standard deviation are recomputed on every pass, so each
# pass works on the values that survived the previous one.
for i in ['latitude', 'longitude']:
    while True:
        x = df[i].median()
        ix = (df[i] - x).abs() > 3*df[i].std()

        if not ix.any(): # no more outliers -> stop
            break

        df.loc[ix, i] = np.nan # exclude outliers

# Keep only listings that still have both coordinates. The explicit copy()
# makes `df` an independent frame, so the columns added to it in later cells
# do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['latitude', 'longitude']).copy()
print('Length after removing ouliers', len(df))

# Show location coordinates after outlier removal
ax[1].plot(df.longitude, df.latitude, 'r.')
ax[1].set_title('After outlier removal')
ax[1].set_xlabel('Longitude')
ax[1].set_ylabel('Latitude');

# naive feature engineering

    # Work on a standardised copy of the coordinates (zero mean, unit standard
    # deviation per axis); the listing frame's own coordinates are untouched.
    temp = df[['longitude', 'latitude']].copy()
    for axis_name in ('longitude', 'latitude'):
        centred = temp[axis_name] - temp[axis_name].mean()
        temp[axis_name] = centred / temp[axis_name].std()
    # Fit 20 clusters with a fixed seed and store each listing's cluster id.
    r = KMeans(20, random_state=1).fit(temp[['longitude', 'latitude']])
    df['labels'] = r.labels_

In [None]:
# Size-based features: number of photos and number of listed features.
for source_col, count_col in (("photos", "num_photos"), ("features", "num_features")):
    df[count_col] = df[source_col].map(len)

# Description length, counted as the number of single-space separated pieces.
df["num_description_words"] = df["description"].map(lambda text: len(text.split(" ")))

# Parse the creation timestamp once, then expose its calendar parts.
created_ts = pd.to_datetime(df["created"])
df["created"] = created_ts
for part in ("year", "month", "day"):
    df["created_" + part] = getattr(created_ts.dt, part)

Plot the k-means clusters

    # Plot results
    # A dedicated figure is created here: the former `ax[ix]` indexed the axes
    # array with the boolean mask left over from the outlier loop.
    ncomp = 20
    fig, ax = plt.subplots(figsize=(9, 9))
    cols = sns.color_palette("Set2", n_colors=ncomp, desat=.5)
    cl = [cols[i] for i in df['labels']]  # one colour per listing, by cluster id
    area = 12
    ax.scatter(df.longitude, df.latitude, s=area, c=cl, alpha=0.5)
    ax.set_title('Number of components: ' + str(ncomp))
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Show aggregated volume and interest at each neighborhood
    x = df.groupby('labels')[['longitude', 'latitude', 'response']].mean()
    # value_counts() is indexed by cluster id, so it aligns with the group means.
    x['n'] = df['labels'].value_counts()
    x = x.sort_values('response')
    cols = sns.color_palette("RdBu_r", ncomp)[::-1]
    for i in range(len(x)):
        props = dict(boxstyle='round', facecolor=cols[i], alpha=0.8)
        # Plain string formatting: the former '|S8' byte arrays rendered as
        # "b'0.12'" under Python 3.
        caption = '{:.2f}\n{:d}'.format(x['response'].values[i], int(x['n'].values[i]))
        ax.text(x.longitude.values[i], x.latitude.values[i], caption,
                fontsize=9, verticalalignment='center', horizontalalignment='center', bbox=props);

In [None]:
# Columns fed to the model. To change the feature set, edit `num_feats`;
# the fuller candidate list is kept here for reference:
#num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
#             "num_photos", "num_features", "num_description_words",
#             "created_year", "created_month", "created_day"]
num_feats = ["bathrooms", "bedrooms", "num_photos", "price", "mm"]

# Feature matrix and target labels, then a quick look at the first rows.
X = df.loc[:, num_feats]
y = df.loc[:, "interest_level"]
X.head()

# train model

In [None]:
# Hold out 30% of the listings for validation. The fixed seed makes the split,
# and therefore the validation score, repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# Random forest on the selected features. The seed makes the fit and the
# reported score repeatable; n_jobs=-1 builds the trees on all available cores.
# Alternatives tried earlier: SVC(probability=True), MLPClassifier(hidden_layer_sizes=(10,)).
clf = RandomForestClassifier(n_estimators=1000, random_state=1, n_jobs=-1)
clf.fit(X_train, y_train)

# predict_proba columns follow clf.classes_; handing the same order to
# log_loss keeps labels and columns aligned even if a class happens to be
# missing from the validation split.
y_val_pred = clf.predict_proba(X_val)
log_loss(y_val, y_val_pred, labels=clf.classes_)

# make prediction

# Per-cell mean interest, recomputed from the training frame while `df` still
# holds it (it is rebound to the test frame just below). The test data has no
# interest labels, so its `mm` feature has to come from these training-side
# means; without it `df[num_feats]` fails with a KeyError on 'mm'.
# The guard keeps the cell re-runnable: on a second run `df` is already the
# test frame and the means from the first run are reused.
if 'response' in df.columns:
    cell_means = {}
    for i in range(res - 1):
        in_lon = (df.longitude >= nx[i]) & (df.longitude < nx[i+1])
        for j in range(res - 1):
            in_cell = in_lon & (df.latitude >= ny[j]) & (df.latitude < ny[j+1])
            if in_cell.sum() > min_n:
                cell_means[(i, j)] = df.loc[in_cell, 'response'].mean()

# Passing the path lets pandas open and close the file itself.
df = pd.read_json("../input/test.json")
print(df.shape)

# Test listings inherit the mean of the training cell they fall in; listings
# outside every encoded cell keep 0.
# NOTE(review): this matches the training-side `mm` only when that column is
# the plain cell mean, i.e. not seeded with each listing's own `response`.
df['mm'] = 0.
for (i, j), cell_mean in cell_means.items():
    in_cell = ((df.longitude >= nx[i]) & (df.longitude < nx[i+1])
               & (df.latitude >= ny[j]) & (df.latitude < ny[j+1]))
    df.loc[in_cell, 'mm'] = cell_mean

# NOTE(review): the clusters are re-fit on the test coordinates, so these ids
# are not comparable with the training `labels`; harmless while `labels` is
# not part of `num_feats`.
r = KMeans(20, random_state=1)
# Normalize (longitude, latitude) before K-means
temp = df[['longitude', 'latitude']].copy()
temp['longitude'] = (temp['longitude']-temp['longitude'].mean())/temp['longitude'].std()
temp['latitude'] = (temp['latitude']-temp['latitude'].mean())/temp['latitude'].std()
# Fit k-means and get labels
r.fit(temp[['longitude', 'latitude']])
df['labels'] = r.labels_




# Size-based features: number of photos and number of listed features.
for source_col, count_col in (("photos", "num_photos"), ("features", "num_features")):
    df[count_col] = df[source_col].map(len)

# Description length, counted as the number of single-space separated pieces.
df["num_description_words"] = df["description"].map(lambda text: len(text.split(" ")))

# Parse the creation timestamp once, then expose its calendar parts.
created_ts = pd.to_datetime(df["created"])
df["created"] = created_ts
for part in ("year", "month", "day"):
    df["created_" + part] = getattr(created_ts.dt, part)

# Feature matrix, using the same columns the model was trained on.
X = df.loc[:, num_feats]

# Class probabilities for every listing in X.
y = clf.predict_proba(X)

# Position of each class name within clf.classes_, used to pick the matching
# probability column.
labels2idx = dict(zip(clf.classes_, range(len(clf.classes_))))

# Submission table: listing id plus one probability column per class,
# written in the order high, medium, low.
sub = pd.DataFrame({"listing_id": df["listing_id"]})
for label in ("high", "medium", "low"):
    sub[label] = y[:, labels2idx[label]]
sub.to_csv("submission_rf.csv", index=False)