In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline
# Always make it pretty.
plt.style.use('ggplot')

In [2]:
# Load the modelling table from cleaned_data.csv; it must contain a 'price'
# column, which the next cell splits off as the target.
df = pd.read_csv('cleaned_data.csv')

In [3]:
# Separate the target column ('price') from the remaining columns, which are
# used as model features.
y = df['price']
X = df.drop(labels=['price'], axis='columns')

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (test_size=0.2) for evaluation; random_state=0
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
def get_performance_metrics(y_train, y_train_pred, y_test, y_test_pred, threshold=0.5):
    """Print a train-vs-test table of classification metrics.

    AUC is computed on the raw scores; accuracy, precision, recall and
    f1-score are computed on the scores after thresholding them with
    ``score > threshold``.

    Parameters
    ----------
    y_train, y_test
        True labels for the training and test splits.
    y_train_pred, y_test_pred
        Predicted scores for the same splits. They must support an
        element-wise ``>`` comparison with ``threshold``
        (presumably NumPy arrays or pandas Series -- confirm with callers).
    threshold : float, default 0.5
        Scores strictly greater than this value count as positive.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    metric_labels = ['AUC', 'Accuracy', 'Precision', 'Recall', 'f1-score']
    thresholded_metrics = (accuracy_score, precision_score, recall_score, f1_score)

    def _score_split(y_true, y_score):
        # AUC first (raw scores), then every label-based metric, in the same
        # order as metric_labels.
        values = [roc_auc_score(y_true, y_score)]
        for metric in thresholded_metrics:
            values.append(metric(y_true, y_score > threshold))
        return values

    summary = pd.DataFrame(
        {'train': _score_split(y_train, y_train_pred),
         'test': _score_split(y_test, y_test_pred)},
        index=pd.Index(metric_labels, name='metrics'),
        columns=['train', 'test'],
    )
    print(summary)

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_train, y_train_pred, y_test, y_test_pred):
    """Plot the train and test ROC curves on a single figure.

    Each curve's legend entry carries its AUC, and a dashed diagonal is drawn
    from (0, 0) to (1, 1) as a visual reference.

    Parameters
    ----------
    y_train, y_test
        True labels for the training and test splits.
    y_train_pred, y_test_pred
        Predicted scores for the same splits.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    line_width = 2
    curve_specs = (
        ('ROC Train (AUC = %0.4f)', 'green', y_train, y_train_pred),
        ('ROC Test (AUC = %0.4f)', 'darkorange', y_test, y_test_pred),
    )

    # Compute every curve before opening the figure so that a failing metric
    # call does not leave an empty figure behind.
    curves = []
    for label_template, colour, y_true, y_score in curve_specs:
        area = roc_auc_score(y_true, y_score)
        false_pos, true_pos, _ = roc_curve(y_true, y_score)
        curves.append((false_pos, true_pos, colour, label_template, area))

    plt.figure()
    for false_pos, true_pos, colour, label_template, area in curves:
        plt.plot(false_pos, true_pos, color=colour,
                 lw=line_width, label=label_template % area)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [7]:
def train_test_model(clf, X_train, y_train, X_test):
    """Fit ``clf`` on the training data and predict on both feature sets.

    Parameters
    ----------
    clf
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so any previous fit is replaced.
    X_train, y_train
        Features and targets used for fitting.
    X_test
        Features to predict on after fitting.

    Returns
    -------
    tuple
        ``(y_train_pred, y_test_pred)``: the results of ``clf.predict`` on
        ``X_train`` and on ``X_test``, in that order.
    """
    clf.fit(X_train, y_train)

    # Predict on the training features first, then on the test features.
    train_predictions, test_predictions = (
        clf.predict(features) for features in (X_train, X_test)
    )
    return train_predictions, test_predictions

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

# Fixed hyper-parameters for a first gradient-boosting fit.
parameters = {'n_estimators': 300,
              # None considers every feature at each split. For this regressor
              # that is what the legacy 'auto' value meant, and 'auto' is no
              # longer accepted by current scikit-learn releases.
              'max_features': None,
              'learning_rate':0.1,
              'max_depth': 8,
              'min_samples_split': 5,
              'min_samples_leaf': 25,
              'random_state': 0,
              }

clf = GradientBoostingRegressor(**parameters)

# train_test_model fits clf on the training split and returns the predictions
# for the training and test features.
y_train_pred,y_test_pred = train_test_model(clf, X_train, y_train, X_test)

In [15]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are tuned below (a regressor, not a
# classifier).
clf = GradientBoostingRegressor()

# Candidate values to search: 3 * 4 * 4 * 5 = 240 combinations.
param_grid = {'n_estimators': [200,300,500],
              # None considers every feature at each split. For this regressor
              # that is what the legacy 'auto' value meant, and 'auto' is no
              # longer accepted by current scikit-learn releases.
              'max_features': [None],
              'learning_rate':[0.1],
              'max_depth': [5,8,10,20],
              'min_samples_split': [5,8,10,15],
              'min_samples_leaf': [8,10,20,25,30],
              'random_state': [0],
              }

# 5-fold cross-validated grid search. Combinations are ranked by negative mean
# squared error, a regression metric; the previously built ROC-AUC scorer was
# never passed to the search and does not apply to a continuous target.
grid_obj = GridSearchCV(clf, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_obj = grid_obj.fit(np.array(X_train), np.array(y_train))

# Keep the estimator configured with the best parameter combination.
clf = grid_obj.best_estimator_

# train_test_model fits clf on X_train / y_train before predicting, so no
# separate clf.fit call is needed here.
y_train_pred,y_test_pred = train_test_model(clf, X_train, y_train, X_test)

In [19]:
clf

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=20, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=500,
             presort='auto', random_state=0, subsample=1.0, verbose=0,
             warm_start=False)

In [18]:
from math import sqrt
# Root-mean-squared error on the training and test splits, computed with
# vectorised NumPy operations instead of a Python loop. np.asarray drops any
# pandas index so targets and predictions are compared by position, and the
# notebook-level y_train / y_test variables are left untouched.
train_residuals = np.asarray(y_train) - np.asarray(y_train_pred)
rmse = float(np.sqrt(np.mean(train_residuals ** 2)))
print('train rmse:',rmse)

test_residuals = np.asarray(y_test) - np.asarray(y_test_pred)
rmse = float(np.sqrt(np.mean(test_residuals ** 2)))
print('test rmse:',rmse)

train rmse: 41.99237135796908
test rmse: 54.79600781865588


In [20]:
# Final run: train on every labelled row (X, y) and predict on the scoring
# set. From here on X_train / y_train / X_test no longer refer to the 80/20
# split created earlier.
# NOTE(review): cleaned_score_data.csv presumably has the same feature columns
# as X -- confirm.
score_data = pd.read_csv('cleaned_score_data.csv')
X_train, y_train, X_test = X, y, score_data

In [21]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters of the final model. They match the best estimator shown
# after the grid search (n_estimators=500, max_depth=5, min_samples_split=5,
# min_samples_leaf=20).
parameters = {'n_estimators': 500,
              # None considers every feature at each split. For this regressor
              # that is what the legacy 'auto' value meant, and 'auto' is no
              # longer accepted by current scikit-learn releases.
              'max_features': None,
              'learning_rate':0.1,
              'max_depth': 5,
              'min_samples_split': 5,
              'min_samples_leaf': 20,
              'random_state': 0,
              }

clf = GradientBoostingRegressor(**parameters)

# train_test_model fits clf on X_train / y_train and returns the predictions
# for X_train and for X_test.
y_train_pred,y_test_pred = train_test_model(clf, X_train, y_train, X_test)

In [22]:
# Root-mean-squared error on the training data, computed with vectorised
# NumPy operations. np.asarray drops any pandas index so targets and
# predictions are compared by position. Using np.sqrt keeps this cell
# independent of the math.sqrt import made in another cell.
train_residuals = np.asarray(y_train) - np.asarray(y_train_pred)
rmse = float(np.sqrt(np.mean(train_residuals ** 2)))
print('train rmse:',rmse)

train rmse: 42.70191168991458


In [23]:
# Only the 'id' column of the raw scoring file is used, so parse just that
# column instead of the whole file.
# NOTE(review): the ids are later paired with the predictions by row position;
# presumably scoringData.csv and cleaned_score_data.csv list rows in the same
# order -- confirm.
airbnb_id = pd.read_csv('scoringData.csv', usecols=['id'])['id']

In [24]:
# Put the ids and the predictions side by side. The prediction column keeps
# the integer label 0.
# NOTE(review): confirm that 0 is the header expected by whatever reads the
# exported file.
predictions = pd.Series(y_test_pred, name=0)
result = pd.concat([airbnb_id, predictions], axis=1)

In [25]:
# Quick sanity check: show the first rows of the id / prediction table.
print(result.head())

     id           0
0  4989  176.894153
1  5054  169.230008
2  5136  286.638572
3  6090  194.212375
4  6990   74.043194


In [26]:
# Write the id / prediction table to a comma-separated file, without the
# DataFrame's row index.
result.to_csv("result——GBTGS.csv",index=False,sep=',')