In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from plotnine import *

In [None]:
# Read the zipped JSON files for the train and test splits into the
# `train` and `test` frames used by every later cell.
train = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip')
test = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/test.json.zip')

In [None]:
# First ten rows of the training frame.
train.head(10)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Column dtypes, non-null counts and memory usage for train.
train.info()

In [None]:
# Same summary for test.
test.info()

In [None]:
# Summary statistics of train (pandas describes the numeric columns by default).
train.describe()

In [None]:
# Distinct values of the column that is later used as the prediction target.
train['interest_level'].unique()

In [None]:
# Per-column dtypes of train.
train.dtypes

In [None]:
# Derive count-based and calendar features on the training frame.
# NOTE: this adds/overwrites columns on `train` in place.

# Length of the per-row `photos` / `features` collections.
for count_col, source_col in (('n_photos', 'photos'), ('n_features', 'features')):
    train[count_col] = train[source_col].apply(len)

# Number of pieces obtained by splitting the description on a single space
# (so an empty description yields 1 and repeated spaces add empty pieces).
train["n_description_words"] = train["description"].apply(
    lambda text: len(text.split(" "))
)

# Parse the timestamp column once, then extract its calendar parts.
train["created"] = pd.to_datetime(train["created"])
created_dt = train["created"].dt
for part in ("year", "month", "day"):
    train[part] = getattr(created_dt, part)

In [None]:
# Number of distinct years found in the `created` timestamps of train.
train['year'].nunique()

In [None]:
# Distribution of the bedroom counts.
# `sns.distplot` has been deprecated since seaborn 0.11; use its replacement
# `histplot` (density-normalised, with a KDE overlay, mirroring distplot's
# default look) when it is available and fall back to `distplot` on older
# seaborn installs so the cell runs on either.
if hasattr(sns, 'histplot'):
    sns.histplot(x=train['bedrooms'], kde=True, stat='density')
else:
    sns.distplot(train['bedrooms'])

In [None]:
# Keep only the rows of train with bedrooms > 0.
x_all = train[train.bedrooms > 0]

In [None]:
# Lowess-smoothed trend of price against bedrooms on the filtered frame;
# se=True asks stat_smooth to draw the uncertainty band as well.
ggplot(x_all, aes('bedrooms', 'price'))+stat_smooth(se=True, method='lowess' )

In [None]:
# Row counts per bedroom value (left) and bathroom value (right), split by
# interest level, on a shared y axis.
# The plotted vector is passed as `x=` instead of positionally: seaborn
# deprecated positional data vectors in 0.11, and from 0.12 the first
# positional parameter is `data`, so the positional form no longer plots the
# intended variable. The keyword form works on old and new versions alike.
fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 5))
sns.countplot(x=train['bedrooms'], hue=train['interest_level'], ax=axes[0])
sns.countplot(x=train['bathrooms'], hue=train['interest_level'], ax=axes[1]);

In [None]:
# Day-of-month of `created` by interest level: box plot (left) and violin
# plot (right) on a shared y axis.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the two-positional-vector form raises a TypeError there. Keywords keep
# the same x (interest level) / y (day) assignment on every seaborn version.
fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 7))
sns.boxplot(x=train['interest_level'], y=train['day'], ax=axes[0])
sns.violinplot(x=train['interest_level'], y=train['day'], ax=axes[1]);

In [None]:
# Month of `created` by interest level: box plot (left) and violin plot
# (right) on a shared y axis.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the two-positional-vector form raises a TypeError there. Keywords keep
# the same x (interest level) / y (month) assignment on every seaborn version.
fig, axes = plt.subplots(1, 2, sharey=True, figsize=(10, 7))
sns.boxplot(x=train['interest_level'], y=train['month'], ax=axes[0])
sns.violinplot(x=train['interest_level'], y=train['month'], ax=axes[1]);

In [None]:
# Columns used as the numeric feature set: for the correlation matrix, the
# per-feature box plots, and as model inputs for both train and test.
numerical = ['bathrooms', 'bedrooms', 'month', 'day','n_photos', 'n_features','n_description_words', 'latitude','longitude','price']

In [None]:
# Pairwise correlation matrix of those columns (pandas' default method).
corr = train[numerical].corr()

In [None]:
# Annotated heatmap of the correlation matrix on a 15x10 figure, with the
# column names as tick labels on both axes.
corr_fig, corr_ax = plt.subplots(figsize=(15, 10))
column_labels = corr.columns
sns.heatmap(
    corr,
    annot=True,
    xticklabels=column_labels,
    yticklabels=column_labels,
    ax=corr_ax,
)

In [None]:
# One box plot per numeric feature, grouped by interest level.
# The grid is sized from the number of features (4 per row) rather than a
# hard-coded 3x4, and any leftover slots are hidden so the figure does not
# show empty frames when the feature count is not a multiple of 4.
n_cols = 4
n_rows = -(-len(numerical) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 7), squeeze=False)
flat_axes = axes.ravel()  # row-major, i.e. same order as axes[idx // 4, idx % 4]
for ax, feat in zip(flat_axes, numerical):
    sns.boxplot(x='interest_level', y=feat, data=train, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel(feat)
for unused_ax in flat_axes[len(numerical):]:
    unused_ax.set_visible(False)
fig.tight_layout();

In [None]:
# Model inputs are the numeric feature columns; the target is the interest level.
X, y = train[numerical], train['interest_level']
# Preview the first ten feature rows.
X.head(10)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split into 70% training rows and the remaining rows for validation;
# random_state is fixed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.7, random_state = 0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss,accuracy_score, confusion_matrix, classification_report

In [None]:
# Fit a 100-tree random forest on the training split (all cores, fixed seed)
# and score its class-probability predictions on the validation split.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
rf.fit(X_train, y_train)
y_val1 = rf.predict_proba(X_val)  # one column per class, ordered as rf.classes_
# Pass the class order explicitly so the probability columns are matched to
# the right labels even if some class happens to be absent from y_val
# (without `labels=`, log_loss infers the label set from y_val alone).
log_loss(y_val, y_val1, labels=rf.classes_)

In [None]:
# Class-probability predictions for the validation split.
print(y_val1)

In [None]:
# Hard class predictions for the validation split.
y_val2 = rf.predict(X_val)

In [None]:
print(y_val2)

In [None]:
# Accuracy of the hard predictions against the validation labels.
print(accuracy_score(y_val,y_val2))

In [None]:
# Confusion matrix of the hard predictions on the validation split.
# The class order is pinned with `labels=` and reused for the tick labels so
# every row/column is identifiable (the bare matrix only shows 0/1/2), and
# `fmt='d'` prints the integer counts in full instead of seaborn's default
# '.2g' format, which abbreviates larger counts.
class_labels = rf.classes_
conf = confusion_matrix(y_val, y_val2, labels=class_labels)
ax = sns.heatmap(conf, annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='predict', ylabel='True');

In [None]:
# Per-class precision / recall / F1 summary for the validation predictions.
print(classification_report(y_val, y_val2))

In [None]:
# First rows of the test frame, before the engineered columns are added.
test.head()

In [None]:
# Build the same engineered columns on the test frame as were built on train.
# NOTE: this adds/overwrites columns on `test` in place.

# Length of the per-row `photos` / `features` collections.
for count_col, source_col in (('n_photos', 'photos'), ('n_features', 'features')):
    test[count_col] = test[source_col].apply(len)

# Number of pieces obtained by splitting the description on a single space.
test["n_description_words"] = test["description"].apply(
    lambda text: len(text.split(" "))
)

# Parse the timestamp column once, then extract its calendar parts.
test["created"] = pd.to_datetime(test["created"])
test_created_dt = test["created"].dt
for part in ("year", "month", "day"):
    test[part] = getattr(test_created_dt, part)

In [None]:
# Class probabilities for the test frame, using the same feature columns the
# forest was trained on.
result = rf.predict_proba(test[numerical])

In [None]:
# Display the probability array.
result

In [None]:
# Attach one probability column per interest level to the test frame.
# predict_proba orders its columns like rf.classes_, so each column is looked
# up by class name instead of by a hard-coded position; a missing class
# raises a KeyError rather than silently writing the wrong column.
proba_by_class = dict(zip(rf.classes_, result.T))
for level in ('high', 'medium', 'low'):
    test[level] = proba_by_class[level]
test.head()

In [None]:
# Write the whole test frame (original columns, engineered features and the
# three probability columns) to rental.csv, without the index.
# NOTE(review): if this file is meant as a competition submission it presumably
# should hold only the listing id and the high/medium/low columns — confirm.
test.to_csv('rental.csv',index=False)