In [None]:
import numpy as np 
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from pdpbox import pdp
from plotnine import *

In [None]:
import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, so the available data files are visible in the output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test listings directly from the zipped JSON files,
# asking pandas to parse the 'created' column as datetimes.
train_data = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip', convert_dates=['created'])
test_data = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/test.json.zip', convert_dates=['created'])

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Column dtypes and non-null counts for the training frame.
train_data.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test_data.info()

In [None]:
# Preview the first few training rows.
train_data.head()

In [None]:
# Derive simple numeric features from the list, text and date columns.
train_data["num_photos"] = train_data["photos"].apply(len)
train_data["num_features"] = train_data["features"].apply(len)
# split() with no argument splits on any run of whitespace and returns [] for
# an empty string. The previous split(" ") reported 1 word for an empty
# description, counted every extra space as a word, and did not treat
# tabs/newlines as separators.
train_data["num_description_words"] = train_data["description"].apply(lambda x: len(x.split()))
# 'created' is already parsed when loaded with convert_dates; to_datetime is a
# no-op in that case and keeps this cell safe if the column arrives as text.
train_data["created"] = pd.to_datetime(train_data["created"])
train_data["created_year"] = train_data["created"].dt.year
train_data["created_month"] = train_data["created"].dt.month
train_data["created_day"] = train_data["created"].dt.day

In [None]:
# Numeric model inputs: raw listing fields plus the derived count and
# date-part columns.
num_feats = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day"]
X = train_data[num_feats]
# Target column to predict.
y = train_data["interest_level"]
X.head()

In [None]:
# Hold out 33% of the rows for validation. The seed is fixed (matching the
# random_state used for the forests) so the split, and every score computed
# from it, is reproducible after a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Baseline random forest on the numeric features; the cell's output is the
# validation log loss.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=15, bootstrap=True, random_state=42)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
# predict_proba columns are ordered as clf.classes_. Passing that order
# explicitly keeps labels and columns aligned even if some class happens to
# be absent from y_val (log_loss would otherwise infer labels from y_val).
log_loss(y_val, y_val_pred, labels=clf.classes_)

In [None]:
# Classifier score (mean accuracy) on the training rows, then on the
# held-out validation rows.
print(clf.score(X_train,y_train))
print(clf.score(X_val,y_val))

In [None]:
def get_sample(df, n, random_state=None):
    """Return a copy of up to ``n`` randomly chosen rows of ``df``.

    The selected rows keep their original relative order. If ``n`` is at
    least ``len(df)``, every row is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n : int
        Maximum number of rows to return.
    random_state : int or None, optional
        Seed for a private random generator, making the sample repeatable.
        When ``None`` (the default) the global ``np.random`` state is used,
        exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the sampled rows.
    """
    if random_state is None:
        shuffled = np.random.permutation(len(df))
    else:
        shuffled = np.random.RandomState(random_state).permutation(len(df))
    # Sort the chosen positions so rows come back in their original order.
    picked = np.sort(shuffled[:n])
    return df.iloc[picked].copy()

In [None]:
# Sample training rows that have at least one bedroom. 60000 is only an upper
# bound: get_sample returns every matching row when fewer are available.
x_all = get_sample(X_train[X_train.bedrooms > 0], 60000)

In [None]:
# LOWESS-smoothed trend of price against number of bedrooms on the sampled
# rows (se=True asks stat_smooth to include its error band).
ggplot(x_all, aes('bedrooms', 'price'))+stat_smooth(se=True, method='lowess' )

# **Partial Dependence Plots (PDP)**

In [None]:
def plot_pdp(feat, clusters=None, feat_name=None):
    """Draw a pdpbox partial dependence plot for one feature.

    Relies on the notebook globals ``clf`` (the model) and ``x_all`` (the
    data sample, whose columns are passed as the model's feature list).

    Parameters
    ----------
    feat : str
        Column of ``x_all`` to analyse.
    clusters : int or None, optional
        When given, clustering is switched on in ``pdp_plot`` and this value
        is passed as ``n_cluster_centers``.
    feat_name : str or None, optional
        Label passed to ``pdp_plot``; falls back to ``feat`` when falsy.

    Returns
    -------
    Whatever ``pdp.pdp_plot`` returns.
    """
    label = feat_name if feat_name else feat
    isolated = pdp.pdp_isolate(clf, x_all, x_all.columns, feat)
    use_clusters = clusters is not None
    return pdp.pdp_plot(
        isolated,
        label,
        plot_lines=True,
        cluster=use_clusters,
        n_cluster_centers=clusters,
    )

In [None]:
# Partial dependence of the baseline model on the number of bedrooms.
plot_pdp('bedrooms')

# **Extrapolation**

In [None]:
# Work on an independent copy of the feature frame and add an 'is_valid'
# indicator column, initialised to 1 for every row.
df_ext = X.copy()
df_ext['is_valid'] = 1

In [None]:
# Set 'is_valid' to 0 for the first 5021 rows (by position); the rest stay 1.
# Writing through .iloc on the frame itself replaces the chained assignment
# `df_ext.is_valid[:5021] = 0`, which raises SettingWithCopyWarning and does
# not modify the frame at all when pandas copy-on-write is active.
# NOTE(review): 5021 is a hard-coded cut-off whose origin is not visible
# here — confirm and consider naming it as a constant.
df_ext.iloc[:5021, df_ext.columns.get_loc('is_valid')] = 0

In [None]:
# Preview the frame with the new 'is_valid' column.
df_ext.head()

In [None]:
# Shapes of the full feature frame and of the first train/validation split.
print(X.shape,X_train.shape, X_val.shape)

In [None]:
# Second 67/33 split, this time of df_ext (the features plus 'is_valid')
# against the same target. The seed is fixed so the split and the scores
# derived from it are reproducible after a kernel restart.
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(df_ext, y, test_size=0.33, random_state=42)

In [None]:
# Shapes of the full feature frame and of the df_ext-based split.
print(X.shape,X_train_1.shape, X_val_1.shape)

In [None]:
# Column dtypes and non-null counts of df_ext.
df_ext.info()

In [None]:
# Inputs and target for a model that predicts the 'is_valid' flag from the
# numeric features. The explicit .copy() makes X_df_ext an independent frame:
# it is modified in place later in the notebook, which on a bare column
# selection can trigger SettingWithCopyWarning.
X_df_ext = df_ext[num_feats].copy()
y_df_ext = df_ext["is_valid"]
X_df_ext.head()

In [None]:
# Forest that tries to tell the is_valid=0 rows from the is_valid=1 rows; the
# cell's output is its out-of-bag accuracy. The seed matches the other
# forests in this notebook so the OOB score and the feature importances read
# from this model are reproducible.
m = RandomForestClassifier(n_estimators=30, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True, random_state=42)
m.fit(X_df_ext, y_df_ext);
m.oob_score_

In [None]:
def rf_feat_importance(m, df):
    """Tabulate a fitted model's feature importances, largest first.

    Parameters
    ----------
    m : object
        Fitted estimator exposing ``feature_importances_``.
    df : pandas.DataFrame
        Frame whose columns correspond, in order, to the importances.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'columns'`` and ``'importance'``, sorted by
        importance in descending order.
    """
    table = pd.DataFrame(dict(columns=df.columns, importance=m.feature_importances_))
    return table.sort_values(by='importance', ascending=False)

In [None]:
# Rank the features by how much they help the is_valid model, then show the
# four most important ones.
fi = rf_feat_importance(m, X_df_ext)
fi[:4]

In [None]:
# Column overview of df_ext before any column is removed.
df_ext.info()

In [None]:
# Remove 'created_month' from df_ext. Rebinding the name instead of dropping
# in place, together with errors='ignore', keeps the cell idempotent:
# re-running it no longer raises KeyError once the column is gone.
df_ext = df_ext.drop(columns=['created_month'], errors='ignore')


In [None]:
# Remove 'created_month' from X_df_ext. Rebinding the name avoids an in-place
# change to a frame obtained by column selection (a SettingWithCopyWarning
# source), and errors='ignore' makes the cell safe to re-run.
X_df_ext = X_df_ext.drop(columns=['created_month'], errors='ignore')

In [None]:
# Remove 'created_month' from both halves of the df_ext split. Rebinding the
# names avoids in-place changes to the frames returned by train_test_split,
# and errors='ignore' makes the cell safe to re-run.
X_train_1 = X_train_1.drop(columns=['created_month'], errors='ignore')
X_val_1 = X_val_1.drop(columns=['created_month'], errors='ignore')

In [None]:
# Same forest configuration as the baseline, trained on the df_ext split
# after 'created_month' was removed; the cell's output is the validation
# log loss.
t = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=15, bootstrap=True, random_state=42)
t.fit(X_train_1, y_train_1)
y_val_pred = t.predict_proba(X_val_1)
# predict_proba columns are ordered as t.classes_. Passing that order
# explicitly keeps labels and columns aligned even if some class happens to
# be absent from y_val_1.
log_loss(y_val_1, y_val_pred, labels=t.classes_)

In [None]:
# Classifier score (mean accuracy) of the second forest on its training
# rows, then on its validation rows.
print(t.score(X_train_1,y_train_1))
print(t.score(X_val_1,y_val_1))