In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import seaborn as sns

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from pycm import *

In [2]:
def print_uniqueCounts(df):
    for name in df.columns:
        print(f'{name} : {df[name].nunique()}')

def to_categorical(df, col_name):
    df[col_name] = df[col_name].astype('category')
    
def to_str(df, col_name):
    df[col_name] = df[col_name].astype('str')
    
def fprint(comment, val):
    print(f'{comment}:{val}')
    
def transform_data(tfidf, df, col):
    features = tfidf.transform(df[col])
    return pd.DataFrame(features.todense(), columns = tfidf.get_feature_names())

def featurize_str(df:pd.DataFrame, col:str, inplace:bool=False):
    stemmer = SnowballStemmer("english")
    stemmed_col = f'name_{col}'
    df[stemmed_col] = df[col].map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
    tvec = TfidfVectorizer(min_df=.0025, max_df=.1, stop_words='english', ngram_range=(1,2))
    tvec.fit(df[stemmed_col].dropna())
    features = transform_data(tvec, df, stemmed_col)
    df.drop(columns=stemmed_col, inplace=True)
    if inplace:
        index_name = df.index.name
        features[index_name] = df.index.values
        features.set_index(index_name, inplace=True)
        df.drop(columns=col, inplace=True)
        features = pd.merge(df, features, left_index=True, right_index=True)
    return features

def num_encode(df):
    cat_cols = df.select_dtypes(['category']).columns
    df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)
    
def to_numeric(df):
    cat_cols = df.select_dtypes(['int8', 'int16', 'int64']).columns
    df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('float64'))
    
Categories = ['Unknown', 'Free', 'Cheap', 'Average', 'Expensive', 'Luxury']

def price_category(price):
    cat = 0
    if price == 0.0:
        cat = 1
    elif price > 0.0 and price < 69.000000:
        cat = 2
    elif price > 69.000000 and price < 106.000000:
        cat = 3
    elif price > 106.000000 and price < 175.000000:
        cat = 4
    elif price > 175.000000:
        cat = 5
    return cat

In [3]:
#Load data
df = pd.read_csv('./data/AB_NYC_2019.csv')
df.set_index('id', inplace=True)

In [4]:
#Data prep
df['last_review'] = pd.to_datetime(df['last_review'], format='%Y-%m-%d')
df['last_review'].fillna(df['last_review'].mode()[0].date())
df['last_review_year'] = df['last_review'].dt.year
df['last_review_month'] = df['last_review'].dt.month
df['last_review_day'] = df['last_review'].dt.day

to_str(df, 'name')
to_str(df, 'host_name')

#Set label
df['price_category'] = df['price'].apply(lambda x: price_category(x))

to_categorical(df, 'neighbourhood_group')
to_categorical(df, 'minimum_nights')
to_categorical(df, 'room_type')
to_categorical(df, 'neighbourhood')
to_categorical(df, 'calculated_host_listings_count')
to_categorical(df, 'last_review_year')
to_categorical(df, 'last_review_month')
to_categorical(df, 'last_review_day')

#Fill-up null values
df['reviews_per_month'].fillna(df['reviews_per_month'].mean(), inplace=True)

In [5]:
#Drop unneeded columns
df.drop(columns=['last_review', 'host_id', 'price'], inplace=True)

In [6]:
#numerical encoding of categorical columns
num_encode(df)
to_numeric(df)

In [7]:
#Text Feature engineering
df = featurize_str(df, 'name', inplace=True)
df = featurize_str(df, 'host_name', inplace=True)
#df.drop(columns=['name', 'host_name'], inplace=True)

In [8]:
#Train / test split
from sklearn.model_selection import train_test_split
y = df.pop('price_category')
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3)

In [9]:
#Resampling
smote = SMOTE('minority')
X_sm, y_sm = smote.fit_sample(X_train, y_train)
X_train, y_train = X_sm, y_sm

In [10]:
# Train model
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclassova',
    'num_leaves': 100,
    'num_class': 6,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

# save model to file
gbm.save_model('model.txt')

[1]	valid_0's multi_logloss: 1.62234
Training until validation scores don't improve for 5 rounds
[2]	valid_0's multi_logloss: 1.56665
[3]	valid_0's multi_logloss: 1.51795
[4]	valid_0's multi_logloss: 1.48125
[5]	valid_0's multi_logloss: 1.44766
[6]	valid_0's multi_logloss: 1.41483
[7]	valid_0's multi_logloss: 1.3822
[8]	valid_0's multi_logloss: 1.35445
[9]	valid_0's multi_logloss: 1.32887
[10]	valid_0's multi_logloss: 1.30775
[11]	valid_0's multi_logloss: 1.28692
[12]	valid_0's multi_logloss: 1.26624
[13]	valid_0's multi_logloss: 1.24713
[14]	valid_0's multi_logloss: 1.22935
[15]	valid_0's multi_logloss: 1.21459
[16]	valid_0's multi_logloss: 1.19994
[17]	valid_0's multi_logloss: 1.18604
[18]	valid_0's multi_logloss: 1.17289
[19]	valid_0's multi_logloss: 1.16248
[20]	valid_0's multi_logloss: 1.15229
[21]	valid_0's multi_logloss: 1.14117
[22]	valid_0's multi_logloss: 1.13141
[23]	valid_0's multi_logloss: 1.12163
[24]	valid_0's multi_logloss: 1.11296
[25]	valid_0's multi_logloss: 1.10476


<lightgbm.basic.Booster at 0x1ce610a80b8>

In [15]:
# predict
pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred = []

for x in pred:
    y_pred.append(np.argmax(x))
    
# Print the precision and recall, among other metrics
print(metrics.classification_report(y_test, y_pred, target_names=Categories))

              precision    recall  f1-score   support

     Unknown       0.80      0.01      0.02       351
        Free       0.00      0.00      0.00         3
       Cheap       0.69      0.82      0.75      3597
     Average       0.51      0.47      0.49      3620
   Expensive       0.51      0.46      0.48      3445
      Luxury       0.68      0.73      0.71      3653

    accuracy                           0.61     14669
   macro avg       0.53      0.42      0.41     14669
weighted avg       0.60      0.61      0.59     14669

