In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
import string
import re
import collections
from sklearn import preprocessing

%matplotlib inline

In [None]:
# Shell escape: installs keras with pip.
# NOTE(review): keras is not imported in any cell visible here — confirm this install is still needed.
!pip install keras

In [None]:
# READ DATA
# Load the train and test listings from the zipped JSON files; both paths are
# relative to the notebook's working directory.
train_df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip')
test_df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/test.json.zip')

# TRAIN DATA FEATURE ENGINEERING

In [None]:
# Encode the TARGET as an integer: 'low' -> 0, 'medium' -> 1, anything else -> 2.
train_df['interest_level'] = train_df['interest_level'].apply(
    lambda level: {'low': 0, 'medium': 1}.get(level, 2))

# REMOVE HTML RESIDUE FROM DESCRIPTION
# Only the literal tags are stripped. The previous blanket replace of "br" also
# deleted those two letters from ordinary words (e.g. "brooklyn" -> "ooklyn").
train_df['description'] = train_df['description'].apply(
    lambda text: re.sub(r'<br\s*/?>', '', text, flags=re.IGNORECASE).replace("<p><a", ""))

# basic features
train_df['rooms'] = train_df['bedrooms'] + train_df['bathrooms']

# count of photos
train_df["num_photos"] = train_df["photos"].apply(len)

# count of "features"
train_df["num_features"] = train_df["features"].apply(len)

# count of space-separated tokens in the description
# (consecutive spaces produce empty tokens, which are counted as well)
train_df["num_description_words"] = train_df["description"].apply(
    lambda text: len(text.split(" ")))

# description contains email
regex = r'[\w\.-]+@[\w\.-]+'
train_df['has_email'] = train_df['description'].apply(
    lambda text: 1 if re.search(regex, text) else 0)

# description contains phone: after stripping punctuation, any all-digit
# token of exactly 10 characters counts as a phone number
train_df['has_phone'] = train_df['description'].apply(
    lambda text: int(any(
        token.isdigit() and len(token) == 10
        for token in re.sub('['+string.punctuation+']', '', text).split()
    )))

# CONVERT ALL FEATURE TAGS TO LOWER CASE (and strip surrounding whitespace)
train_df['features'] = train_df['features'].apply(
    lambda tags: [tag.lower().strip() for tag in tags])

# TEST DATA FEATURE ENGINEERING

In [None]:
# REMOVE HTML RESIDUE FROM DESCRIPTION
# Only the literal tags are stripped. The previous blanket replace of "br" also
# deleted those two letters from ordinary words (e.g. "brooklyn" -> "ooklyn").
test_df['description'] = test_df['description'].apply(
    lambda text: re.sub(r'<br\s*/?>', '', text, flags=re.IGNORECASE).replace("<p><a", ""))

# basic features
test_df['rooms'] = test_df['bedrooms'] + test_df['bathrooms']

# count of photos
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features"
test_df["num_features"] = test_df["features"].apply(len)

# count of space-separated tokens in the description
# (consecutive spaces produce empty tokens, which are counted as well)
test_df["num_description_words"] = test_df["description"].apply(
    lambda text: len(text.split(" ")))

# description contains email
regex = r'[\w\.-]+@[\w\.-]+'
test_df['has_email'] = test_df['description'].apply(
    lambda text: 1 if re.search(regex, text) else 0)

# description contains phone: after stripping punctuation, any all-digit
# token of exactly 10 characters counts as a phone number
test_df['has_phone'] = test_df['description'].apply(
    lambda text: int(any(
        token.isdigit() and len(token) == 10
        for token in re.sub('['+string.punctuation+']', '', text).split()
    )))

# CONVERT ALL FEATURE TAGS TO LOWER CASE (and strip surrounding whitespace)
test_df['features'] = test_df['features'].apply(
    lambda tags: [tag.lower().strip() for tag in tags])

# MOST FREQUENT FEATURES EXTRACTION

In [None]:
# One list of feature tags per listing, for each dataset.
feature_value_train = train_df['features'].tolist()
feature_value_test = test_df['features'].tolist()

# Flatten the per-listing lists into a single long list of tags per dataset.
feature_lst_train = list(itertools.chain.from_iterable(feature_value_train))
feature_lst_test = list(itertools.chain.from_iterable(feature_value_test))

# Distinct tags; they come out of a set, so the list order is arbitrary.
uniq_feature_train = list(set(feature_lst_train))
uniq_feature_test = list(set(feature_lst_test))


# see the frequency of each feature
def most_common(lst):
    """Tabulate how often each distinct item occurs in ``lst``.

    Args:
        lst: Iterable of hashable items (here: feature strings).

    Returns:
        pandas.DataFrame with columns ``feature_value`` and ``frequency``,
        sorted by ``frequency`` in descending order. Rows keep the index they
        had before sorting (first-occurrence order). Items with equal counts
        stay in first-occurrence order.
    """
    counts = collections.Counter(lst)
    # Materialise the dict views as lists: they are set-like objects, and this
    # avoids depending on how the DataFrame constructor treats non-sequences.
    table = pd.DataFrame({
        'feature_value': list(counts.keys()),
        'frequency': list(counts.values()),
    })
    # mergesort is stable, which makes the order of ties deterministic
    return table.sort_values(by='frequency', ascending=False, kind='mergesort')

# Frequency tables of the feature tags (most frequent first), one per dataset.
df_features_train = most_common(feature_lst_train)
df_features_test = most_common(feature_lst_test)


def newColumn(name, df, series):
    """Add a 0/1 indicator column called ``name`` to ``df``.

    Row ``i`` of the new column is 1 when ``name in series[i]`` (positional
    match, e.g. list membership for a list of tags) and 0 otherwise.

    Args:
        name: Value to look for; also used as the new column's label.
        df: DataFrame to extend. It is modified in place.
        series: Iterable of containers, one per row of ``df``, in row order.

    Returns:
        The same ``df`` object, with the extra int64 column.

    Raises:
        ValueError: if ``series`` does not have exactly one entry per row.
    """
    # Build the whole column in one pass instead of assigning cell by cell.
    flags = [1 if name in tags else 0 for tags in series]
    if len(flags) != len(df.index):
        raise ValueError(
            "series has {} entries but df has {} rows".format(len(flags), len(df.index)))
    df[name] = pd.Series(flags, index=df.index, name=name, dtype='int64')
    return df

# Hand-picked amenities (chosen from the frequency tables); each one becomes
# a 0/1 indicator column in both frames.
facilities = [
    'elevator',
    'cats allowed',
    'hardwood floors',
    'dogs allowed',
    'doorman',
    'dishwasher',
    'no fee',
    'laundry in building',
    'fitness center',
]
for name in facilities:
    # train first, then test — same order as a pair of separate assignments
    train_df, test_df = (
        newColumn(name, frame, frame['features']) for frame in (train_df, test_df)
    )

# LABEL ENCODING FOR CATEGORICAL VARIABLES

In [None]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
    # Columns that are not object-typed are left untouched.
    if train_df[f].dtype != 'object':
        continue
    train_values = list(train_df[f].values)
    test_values = list(test_df[f].values)
    # Fit on train and test together so every value in either set gets a code.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_values + test_values)
    train_df[f] = lbl.transform(train_values)
    test_df[f] = lbl.transform(test_values)

### LOG10 TRANSFORM OF THE PRICE COLUMN

In [None]:
# Replace price with its base-10 logarithm in both frames.
# NOTE(review): the column is overwritten in place, so re-running this cell
# applies the logarithm a second time.
train_df['price'] = np.log10(train_df['price'])
test_df['price'] = np.log10(test_df['price'])

# DROP UNNECESSARY COLUMNS

In [None]:
# Drop the same four raw columns from the TRAINING and TEST datasets, in place.
unused_columns = ['created', 'description', 'features', 'photos']
for frame in (train_df, test_df):
    frame.drop(columns=unused_columns, inplace=True)

# REGRESSION FOR PRICE

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import optuna
import math

# Target is the price column; every other remaining column is a predictor.
X = train_df.drop(columns='price')
y = train_df['price']

# Hold out 30% of the rows, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5)

In [None]:
class Optimizer:
    """Run an Optuna study that minimises hold-out MAE for a model family.

    The model is built by the module-level ``create_model(trial)`` function,
    looked up when ``objective`` is called, so the most recently executed
    definition is the one that gets tuned. The data comes from the
    module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    """

    def __init__(self, metric, trials=30):
        """Store the study settings.

        Args:
            metric: Label for the metric. It is stored but not consulted;
                ``objective`` always scores with mean absolute error.
            trials: Number of Optuna trials to run.
        """
        self.metric = metric
        self.trials = trials

    def objective(self, trial):
        """Fit one candidate on the training split and return its hold-out MAE."""
        model = create_model(trial)
        # Fit on the training split only. Fitting on the full X, y would let
        # the model see the X_test rows it is scored on right after.
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        return mean_absolute_error(y_test, preds)

    def optimize(self):
        """Create a minimising study, run ``self.trials`` trials, return it."""
        study = optuna.create_study(direction="minimize")
        study.optimize(self.objective, n_trials=self.trials)
        return study

- XGB REGRESSOR OPTUNA PREDICTION

In [None]:
def create_model(trial):
    """Build an ``xgb.XGBRegressor`` from hyper-parameters sampled by ``trial``.

    Args:
        trial: Optuna trial (any object exposing ``suggest_int``,
            ``suggest_float`` and ``suggest_categorical``).

    Returns:
        An unfitted ``xgb.XGBRegressor`` configured with the sampled values.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart', 'gblinear']),
        # log-scaled search; suggest_float(log=True) replaces the deprecated suggest_loguniform
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 19),
        # XGBoost documents both sampling ratios as valid on (0, 1], so the
        # search starts above zero instead of at 0.0.
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
    }
    return xgb.XGBRegressor(**params)

# Tune XGBoost with Optuna, then rebuild a regressor from the best parameters.
optimizer = Optimizer('mae')
xgb_opt_study = optimizer.optimize()
xgb_opt_params = xgb_opt_study.best_params
xgb_opt = xgb.XGBRegressor(**xgb_opt_params)   # Model
# Fit on the training split only: fitting on the full X, y would mean the
# hold-out predictions below are made on rows the model was trained on.
xgb_opt.fit(X_train, y_train)
preds = xgb_opt.predict(X_test)

print("Number of finished trials: ", len(xgb_opt_study.trials))
print("Best trial:")
xgb_trial = xgb_opt_study.best_trial

print("  Value: {}".format(xgb_trial.value))
print("  Params: ")
for key, value in xgb_trial.params.items():
    print("    {}: {}".format(key, value))

- RANDOM FOREST OPTUNA PREDICTION

In [None]:
def create_model(trial):
    """Build a ``RandomForestRegressor`` from hyper-parameters sampled by ``trial``.

    Args:
        trial: Optuna trial (any object exposing ``suggest_int`` and
            ``suggest_categorical``).

    Returns:
        An unfitted ``RandomForestRegressor`` configured with the sampled values.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 19),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        # 1.0 means "use every feature", which is what "auto" meant for a
        # forest regressor; current scikit-learn releases reject "auto".
        'max_features': trial.suggest_categorical("max_features", [1.0, "sqrt", "log2"]),
    }
    return RandomForestRegressor(**params)

# Tune the random forest with Optuna, then rebuild a regressor from the best parameters.
optimizer = Optimizer('mae')
rf_opt_study = optimizer.optimize()
rf_opt_params = rf_opt_study.best_params
rf_opt = RandomForestRegressor(**rf_opt_params)
# Fit on the training split only: fitting on the full X, y would mean the
# hold-out predictions below are made on rows the model was trained on.
rf_opt.fit(X_train, y_train)
preds = rf_opt.predict(X_test)

print("Number of finished trials: ", len(rf_opt_study.trials))
print("Best trial:")
rf_trial = rf_opt_study.best_trial

print("  Value: {}".format(rf_trial.value))
print("  Params: ")
for key, value in rf_trial.params.items():
    print("    {}: {}".format(key, value))

- LINEAR REGRESSION OPTUNA PREDICTION

In [None]:
def create_model(trial):
    """Build a ``LinearRegression`` from options sampled by ``trial``.

    Args:
        trial: Optuna trial (any object exposing ``suggest_int`` and
            ``suggest_categorical``).

    Returns:
        An unfitted ``LinearRegression`` configured with the sampled values.
    """
    params = {
        # Real booleans: the strings "True"/"False" are both truthy, so the
        # "False" choice never actually switched the option off.
        'copy_X': trial.suggest_categorical("copy_X", [True, False]),
        'fit_intercept': trial.suggest_categorical("fit_intercept", [True, False]),
        'n_jobs': trial.suggest_int('n_jobs', -1, 3),
    }
    return LinearRegression(**params)

# Tune the linear regression with Optuna, then rebuild a model from the best parameters.
optimizer = Optimizer('mae')
lr_opt_study = optimizer.optimize()
lr_opt_params = lr_opt_study.best_params
lr_opt = LinearRegression(**lr_opt_params)
# Fit on the training split only: fitting on the full X, y would mean the
# hold-out predictions below are made on rows the model was trained on.
lr_opt.fit(X_train, y_train)
preds = lr_opt.predict(X_test)

print("Number of finished trials: ", len(lr_opt_study.trials))
print("Best trial:")
lr_trial = lr_opt_study.best_trial

print("  Value: {}".format(lr_trial.value))
print("  Params: ")
for key, value in lr_trial.params.items():
    print("    {}: {}".format(key, value))

Summary statistics of the price column:

In [None]:
# Summary statistics of price. The column holds log10 values at this point
# (transformed in an earlier cell, inverted in a later one).
train_df['price'].describe()

* Pipeline for storing models

In [None]:
# Candidate regressors: library defaults alongside the Optuna-tuned variants.
xgb_default = xgb.XGBRegressor()
rf_default = RandomForestRegressor()
lr_default = LinearRegression()
svm_default = SVR()

models = [
    xgb_default, xgb_opt,
    rf_default, rf_opt,
    lr_default, lr_opt,
    svm_default,
]

model_names = [
    'XGB Regression (default)', 'XGB Regression (opt)',
    'Random Forest (default)', 'Random Forest (opt)',
    'Linear Regression (default)', 'Linear Regression (opt)',
    'Support Vector Machine (default)',
]

## Wrap each model in a (label, Pipeline) pair: standard scaling, then the model
pipeline_models = [
    ("Scaled_" + name, Pipeline([("Scaler", StandardScaler()), (name, model)]))
    for name, model in zip(model_names, models)
]

* Evaluate scores

* Invert the log10 transform of price before prediction

In [None]:
# Undo the earlier np.log10 transform so both price columns hold raw prices again.
# NOTE(review): X and y were built from train_df before this cell and are not
# rebuilt here, so the cross-validation further down presumably still uses the
# log10 prices — confirm this is intended.
train_df['price'] = 10 ** train_df['price']
test_df['price'] = 10 ** test_df['price']

In [None]:
# Display the price column after the inverse transform.
train_df['price']

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate

## Cross-validate every pipeline and collect the mean scores in one table
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Gather one record per model and build the frame once at the end. Growing an
# empty frame row by row with .loc leaves the score columns as object dtype.
cv_records = []
for name, model in pipeline_models:
    scores = cross_validate(model, X, y, cv=kfold, n_jobs=-1,
                            scoring=('neg_root_mean_squared_error', 'neg_mean_absolute_error'))
    cv_records.append({
        "model": name,
        # scorers return negated errors; abs() turns them back into positive errors
        "cv_MAE": round(abs(scores['test_neg_mean_absolute_error']).mean(), 3),
        "cv_RMSE": round(abs(scores['test_neg_root_mean_squared_error']).mean(), 3),
    })

evaluate = pd.DataFrame(cv_records, columns=["model", "cv_MAE", "cv_RMSE"])

In [None]:
# Show the cross-validation score table.
evaluate

- MAE score visualization

In [None]:
## Visualization: one bar per model, annotated with its cross-validated MAE
fig, ax = plt.subplots(figsize=(16,9))

# x and y are passed by keyword (seaborn >= 0.12 no longer accepts them
# positionally) and the bars are drawn on the Axes created above.
# The scores are cast to float so they are treated as numeric.
bar = sns.barplot(x=evaluate["model"], y=evaluate["cv_MAE"].astype(float), ax=ax)
for rec in bar.patches:
    height = rec.get_height()
    # write the value just above the top of each bar
    ax.text(rec.get_x() + rec.get_width()/2, height*1.02, height, ha="center")
ax.set_title("Cross Validate Score (MAE)")
# the trailing semicolon suppresses the list-of-Text repr in the cell output
ax.set_xticklabels(evaluate["model"].to_list(), rotation=50);

- RMSE score visualization

In [None]:
## Visualization: one bar per model, annotated with its cross-validated RMSE
fig, ax = plt.subplots(figsize=(16,9))

# x and y are passed by keyword (seaborn >= 0.12 no longer accepts them
# positionally) and the bars are drawn on the Axes created above.
# The scores are cast to float so they are treated as numeric.
bar = sns.barplot(x=evaluate["model"], y=evaluate["cv_RMSE"].astype(float), ax=ax)
for rec in bar.patches:
    height = rec.get_height()
    # write the value just above the top of each bar
    ax.text(rec.get_x() + rec.get_width()/2, height*1.02, height, ha="center")
ax.set_title("Cross Validate Score (RMSE)")
# the trailing semicolon suppresses the list-of-Text repr in the cell output
ax.set_xticklabels(evaluate["model"].to_list(), rotation=50);