In [1]:
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, auc
from sklearn.metrics import roc_auc_score, classification_report, log_loss
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn import model_selection, preprocessing, ensemble
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
import patsy
from scipy import sparse
from sklearn import neighbors, metrics, svm



In [4]:
# Column names kept as model inputs.
# NOTE(review): no cell visible here reads this list — confirm where it is consumed.
features_to_use = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
    "num_photos",
    "num_features",
    "description_len",
    "created_year",
    "created_month",
    "created_day",
    "listing_id",
    "created_hour",
]

In [2]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None,
           seed_val=0, num_rounds=1000):
    """Train a 3-class XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels; wrapped in an ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, ``test_X`` is also used as an
        evaluation set and training uses ``early_stopping_rounds=20``.
    feature_names : optional
        Accepted for interface compatibility; not used by this function.
    seed_val : int, default 0
        Value stored under the ``'seed'`` training parameter.
    num_rounds : int, default 1000
        Number of boosting rounds requested from ``xgb.train``.

    Returns
    -------
    (pred_test_y, model)
        The output of ``model.predict`` on ``test_X`` and the trained booster.
    """
    # seed_val / num_rounds are explicit arguments so the function no longer
    # depends on notebook globals that may not exist on a fresh kernel.
    param = {
        'objective': 'multi:softprob',
        'eta': 0.05,
        'max_depth': 5,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'alpha': 0.7,
        'seed': seed_val,
    }

    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        # Labelled hold-out set: monitor it and stop early when it stalls.
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(
            param,
            xgtrain,
            num_boost_round=num_rounds,
            evals=watchlist,
            early_stopping_rounds=20,
        )
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(param, xgtrain, num_boost_round=num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [None]:
# Hold out 30% of the rows; random_state=3 makes the split repeatable.
# NOTE(review): `train_X` and `y` are not defined in any cell visible above this
# one — confirm they exist before this cell runs on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    y,
    test_size=0.3,
    random_state=3,
)

In [None]:
# Integer class id for each interest level.
target_num_map = {
    'high': 0,
    'medium': 1,
    'low': 2,
}
# Look every label up in the map; an unknown label raises KeyError.
train_y = np.array(
    df['interest_level'].apply(lambda level: target_num_map[level])
)
print(train_X.shape, test_X.shape)

In [None]:
# Score runXGB on a held-out fold using multi-class log loss.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=3)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    # Development rows train the model; validation rows score it.
    dev_X = train_X[dev_index, :]
    val_X = train_X[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    break  # stop after the first of the five folds

In [None]:
# Fit on all training rows, predict the test rows, and write the submission file.
# NOTE(review): this call requires runXGB to accept a `num_rounds` keyword.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)

# Column order follows the class ids 0, 1, 2 assigned in target_num_map.
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = df_t.listing_id.values
out_df.to_csv("renthop_1.csv", index=False)

In [None]:
# out_df.to_csv("renthop_1.csv", index=False)

In [None]:
# Print the development / validation row positions produced for each of the
# five shuffled folds.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=3)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    # Function-call form of print, matching the other cells; the bare
    # `print a, b` statement is a SyntaxError on Python 3.
    print(dev_index, val_index)

In [None]:
# Wrap the full training set (with labels) and the test set (no labels).
xg_train = xgb.DMatrix(train_X, label=train_y)
xg_test = xgb.DMatrix(test_X, label=None)

# Booster settings for the first run, which uses the 'multi:softmax' objective.
param = {
    'objective': 'multi:softmax',
    'eta': 0.1,
    'max_depth': 6,
    'silent': 1,
    'num_class': 3,
    'eval_metric': "mlogloss",
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 3,
}

# The watchlist is built here but is not passed to xgb.train in this cell.
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 500
bst = xgb.train(param, xg_train, num_round)
pred = bst.predict(xg_test)


# Retrain with the 'multi:softprob' objective, keeping every other setting.
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round)
# Note: this convention has been changed since xgboost-unity.
# The prediction may come back as a 1D array, so reshape it to (ndata, nclass);
# ndata is taken from the length of the earlier softmax prediction `pred`.
yprob = np.reshape(bst.predict(xg_test), (pred.shape[0], 3))
# Most probable class per row.
ylabel = yprob.argmax(axis=1)


# Estimate multi-class log loss on a held-out fold of the training data.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=3)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    dev_X, val_X = train_X[dev_index, :], train_X[val_index, :]
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    # Fit on the development rows only and predict the validation rows, so the
    # labels and the probabilities passed to log_loss describe the same samples.
    # (Scoring `y_train` against `yprob` compared labels from the 70% split
    # with predictions made for `test_X`.)
    fold_param = dict(param, objective='multi:softprob')
    fold_bst = xgb.train(fold_param, xgb.DMatrix(dev_X, label=dev_y), num_round)
    # Reshape to (n_rows, 3) in case the prediction comes back flattened.
    val_prob = fold_bst.predict(xgb.DMatrix(val_X)).reshape(val_y.shape[0], 3)

    # Pass the label set explicitly so a fold that lacks one class still scores.
    cv_scores.append(log_loss(val_y, val_prob, labels=[0, 1, 2]))
    print(cv_scores)
    break  # only the first fold is evaluated; the remaining four are skipped