In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline
# Always make it pretty.
plt.style.use('ggplot')

In [2]:
# Load the modelling table from cleaned_data.csv (path is relative to the
# notebook's working directory). It must contain a 'price' column, which the
# next cell splits off as the target.
df = pd.read_csv('cleaned_data.csv')

In [3]:
# Separate the regression target ('price') from the predictor columns.
y = df['price']
X = df.drop(columns='price')

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
def get_performance_metrics(y_train, y_train_pred, y_test, y_test_pred, threshold=0.5):
    """Print a train-vs-test table of binary classification metrics.

    Parameters
    ----------
    y_train, y_test
        True labels for the training and test splits.
    y_train_pred, y_test_pred
        Predicted scores for the two splits. They are compared elementwise
        with ``> threshold`` to obtain hard labels, so they must support that
        comparison (e.g. NumPy arrays or pandas Series).
    threshold
        Scores strictly greater than this value count as the positive class.

    Returns
    -------
    None
        The table (rows: AUC, Accuracy, Precision, Recall, f1-score; columns:
        train, test) is printed, not returned.
    """
    def _score_split(y_true, y_score):
        # AUC uses the raw scores; every other metric uses thresholded labels.
        auc_value = roc_auc_score(y_true, y_score)
        y_label = y_score > threshold
        return [
            auc_value,
            accuracy_score(y_true, y_label),
            precision_score(y_true, y_label),
            recall_score(y_true, y_label),
            f1_score(y_true, y_label),
        ]

    train_scores = _score_split(y_train, y_train_pred)
    test_scores = _score_split(y_test, y_test_pred)

    table = pd.DataFrame(
        {
            'metrics': ['AUC', 'Accuracy', 'Precision', 'Recall', 'f1-score'],
            'train': train_scores,
            'test': test_scores,
        },
        columns=['metrics', 'train', 'test'],
    )
    all_metrics = table.set_index('metrics')
    print(all_metrics)

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_train, y_train_pred, y_test, y_test_pred):
    """Draw the train and test ROC curves on one figure and show it.

    Parameters
    ----------
    y_train, y_test
        True labels for the training and test splits.
    y_train_pred, y_test_pred
        Predicted scores for the two splits, passed unchanged to
        ``roc_auc_score`` and ``roc_curve``.

    Each curve's legend entry carries its AUC to four decimals; a dashed
    diagonal from (0, 0) to (1, 1) is drawn for reference. Returns None.

    Note: ``roc_auc_score`` is not imported in this cell; it has to be in
    scope from the metrics cell that imports it.
    """
    # Compute all curve data before any figure is created.
    train_auc = roc_auc_score(y_train, y_train_pred)
    train_fpr, train_tpr, _ = roc_curve(y_train, y_train_pred)

    test_auc = roc_auc_score(y_test, y_test_pred)
    test_fpr, test_tpr, _ = roc_curve(y_test, y_test_pred)

    line_width = 2
    curves = (
        (train_fpr, train_tpr, 'green', 'ROC Train (AUC = %0.4f)', train_auc),
        (test_fpr, test_tpr, 'darkorange', 'ROC Test (AUC = %0.4f)', test_auc),
    )

    plt.figure()
    for fpr, tpr, colour, label_template, auc_value in curves:
        plt.plot(fpr, tpr, color=colour,
                 lw=line_width, label=label_template % auc_value)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [7]:
def train_test_model(clf, X_train, y_train, X_test):
    """Fit ``clf`` on the training data and predict for both feature sets.

    Parameters
    ----------
    clf
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    X_train, y_train
        Features and targets used for fitting.
    X_test
        Features to predict for after fitting.

    Returns
    -------
    tuple
        ``(y_train_pred, y_test_pred)``: the result of ``clf.predict`` on
        ``X_train`` and on ``X_test``, in that order.
    """
    clf.fit(X_train, y_train)
    # Tuple elements are evaluated left to right: training predictions first.
    return clf.predict(X_train), clf.predict(X_test)

In [25]:
# http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
from sklearn.neural_network import MLPRegressor

# Hyper-parameters for the multi-layer perceptron regressor.
parameters = dict(
    solver='adam',
    activation='relu',
    # Increasing alpha increases the penalty; see
    # http://scikit-learn.org/stable/auto_examples/neural_networks/plot_mlp_alpha.html#sphx-glr-auto-examples-neural-networks-plot-mlp-alpha-py
    alpha=1e-3,
    hidden_layer_sizes=(100, 50),
    # NOTE(review): scikit-learn documents learning_rate as only used with
    # solver='sgd' -- confirm whether it is meant to have an effect here.
    learning_rate='adaptive',
    random_state=1,
)
clf = MLPRegressor(**parameters)

# Fit on the training split and collect predictions for both splits.
y_train_pred, y_test_pred = train_test_model(clf, X_train, y_train, X_test)

In [26]:
from math import sqrt

# Root-mean-squared error on the training split, computed with vectorised
# NumPy arithmetic instead of a per-element Python loop.
y_train = np.asarray(y_train)
# ravel() guards against a column-shaped (n, 1) prediction array, which would
# otherwise broadcast against the 1-D targets into an (n, n) matrix.
rss = np.sum((y_train - np.ravel(y_train_pred)) ** 2)
mse = rss / len(y_train)
rmse = sqrt(mse)
print('train rmse:', rmse)

# Same computation on the held-out test split.
y_test = np.asarray(y_test)
rss = np.sum((y_test - np.ravel(y_test_pred)) ** 2)
mse = rss / len(y_test)
rmse = sqrt(mse)
print('test rmse:', rmse)

train rmse: 64.56713881098233
test rmse: 64.10542949054316


In [22]:
# Load the rows to be scored, then re-point the split variables: the model is
# now fitted on all of X / y and predicts for the scoring rows.
score_data = pd.read_csv('cleaned_score_data.csv')
X_train, y_train, X_test = X, y, score_data

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters for the gradient-boosted regression trees.
parameters = {
    'n_estimators': 250,
    # None makes every feature available at each split. For this regressor the
    # former 'auto' setting meant the same thing, but newer scikit-learn
    # releases deprecated and then removed 'auto', so None is used instead.
    'max_features': None,
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_samples_split': 5,
    'min_samples_leaf': 20,
    'random_state': 0,
}

clf = GradientBoostingRegressor(**parameters)

# Fit on X_train / y_train and predict for both X_train and X_test.
y_train_pred, y_test_pred = train_test_model(clf, X_train, y_train, X_test)

In [24]:
# Root-mean-squared error between y_train and y_train_pred, computed with
# vectorised NumPy arithmetic instead of a per-element Python loop.
y_train = np.asarray(y_train)
# ravel() guards against a column-shaped (n, 1) prediction array, which would
# otherwise broadcast against the 1-D targets into an (n, n) matrix.
rss = np.sum((y_train - np.ravel(y_train_pred)) ** 2)
mse = rss / len(y_train)
# np.sqrt is used so this cell does not depend on `sqrt` having been imported
# by a different cell; float() keeps rmse a plain Python float.
rmse = float(np.sqrt(mse))
print('train rmse:', rmse)

train rmse: 36.48928908975434


In [25]:
# Read scoringData.csv and keep only its 'id' column (as a Series); the whole
# file is parsed before the column is selected.
airbnb_id = pd.read_csv('scoringData.csv')['id']

In [26]:
# Put the ids and the predictions side by side as two columns. The prediction
# column is not given a name, so it carries the default label 0.
# NOTE(review): concat pairs rows by index label; this assumes airbnb_id and
# the prediction frame share the same default 0..n-1 index -- TODO confirm.
result = pd.concat([airbnb_id,pd.DataFrame(y_test_pred)],axis=1)

In [27]:
# Preview the first rows of the id / prediction table as plain text.
print(result.head())

     id           0
0  4989  170.666176
1  5054  171.092851
2  5136  260.411259
3  6090  231.937689
4  6990   76.821501


In [28]:
# Write the id / prediction table as comma-separated text, without the index.
# NOTE: the file name contains two '—' dash characters, not ASCII hyphens.
result.to_csv("result——GBT.csv",index=False,sep=',')