# Destination Country for Airbnb's New User 

**GOAL OF ANALYSIS**

To develop a predictive model to predict the fare prices of taxi in New York City.

In [1]:
# Importing required Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read Test and Train
airbnb_train  = pd.read_csv("airbnb_new.train.csv")
airbnb_test = pd.read_csv("airbnb_new.test.csv")


In [3]:
# Copy of original data
train_original = airbnb_train.copy()
test_original = airbnb_test.copy()

In [4]:
airbnb_train.columns

Index(['age', 'country_destination', 'created_month', 'created_date',
       'first.active_month', 'first.active_date', 'signup_methodFACEBOOK',
       'signup_methodGOOGLE', 'signup_methodWEIBO', 'languageCS',
       ...
       'first_browserYANDEX.BROWSER', 'created_year2011', 'created_year2012',
       'created_year2013', 'created_year2014', 'first.active_year2010',
       'first.active_year2011', 'first.active_year2012',
       'first.active_year2013', 'first.active_year2014'],
      dtype='object', length=131)

In [5]:
"dimension of train data: {}".format(airbnb_train.shape), "dimension of test data: {}".format(airbnb_test.shape)


('dimension of train data: (213451, 131)',
 'dimension of test data: (62095, 130)')

**Our target varibale is "country_destination"**

# Model Building

Splitting test train 

In [6]:
# Split data for independent and target varibale.
X = airbnb_train.loc[:, airbnb_train.columns != 'country_destination']
y = airbnb_train['country_destination']

In [10]:
# Train and test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=airbnb_train['country_destination'], random_state=0)

In [23]:
"dimension of X_train data: {}".format(X_train.shape), "dimension of X_test data: {}".format(X_test.shape)


('dimension of X_train data: (160088, 130)',
 'dimension of X_test data: (53363, 130)')

In [21]:
# Check for start
y_train.value_counts(normalize=True)

8     0.583473
12    0.292227
10    0.047286
5     0.023531
7     0.013280
6     0.010888
4     0.010538
2     0.006690
3     0.004972
9     0.003573
1     0.002524
11    0.001018
Name: country_destination, dtype: float64

In [17]:
y_test.value_counts(normalize=True)

8     0.583475
12    0.292225
10    0.047299
5     0.023537
7     0.013286
6     0.010888
4     0.010532
2     0.006690
3     0.004966
9     0.003561
1     0.002530
11    0.001012
Name: country_destination, dtype: float64

In [22]:
# Feature Scaling 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Logistic regression

In [19]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier = classifier.fit(X_train, y_train)
# Checking default parameters
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [21]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[    0,     0,     0,     0,     0,     0,     0,   119,     0,
            0,     0,    16],
       [    0,     0,     0,     0,     0,     0,     0,   282,     0,
            0,     0,    75],
       [    0,     0,     0,     0,     0,     0,     0,   208,     0,
            0,     0,    57],
       [    0,     0,     0,     0,     0,     0,     0,   471,     0,
            0,     0,    91],
       [    0,     0,     0,     0,     0,     0,     0,  1037,     0,
            0,     0,   219],
       [    0,     0,     0,     0,     0,     0,     0,   464,     0,
            0,     0,   117],
       [    0,     0,     0,     0,     0,     0,     0,   602,     0,
            0,     0,   107],
       [    0,     0,     0,     0,     0,     0,     0, 29015,     0,
            0,     0,  2121],
       [    0,     0,     0,     0,     0,     0,     0,   169,     0,
            0,     0,    21],
       [    0,     0,     0,     0,     0,     0,     0,  2171,     0,
            0,     0

In [22]:
print("Training set accuracy: {:.3f}".format(classifier.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(classifier.score(X_test, y_test)))

Training set accuracy: 0.590
Test set accuracy: 0.592


In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

NameError: name 'y_pred' is not defined

### KNN classifier 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

training_accuracy = []
test_accuracy = []
# try n_neighbors from 1 to 10
neighbors_settings = range(1, 12)

for n_neighbors in neighbors_settings:
    # build the model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(knn.score(X_train, y_train))
    # record test set accuracy
    test_accuracy.append(knn.score(X_test, y_test))

plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()
plt.savefig('knn_compare_model')

In [None]:
knn = KNeighborsClassifier(n_neighbors= 11)
knn.fit(X_train, y_train)

print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))

In [None]:
y_pred = knn.predict(X_test)
print ('\n clasification report:\n\n', classification_report(y_test, y_pred))

### Naive Bayes classifier 

In [None]:
 # training a Naive Bayes classifier 
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB().fit(X_train, y_train) 

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

[train_data, test_data, train_label, test_label] = train_test_split(train_users_scaled, label_df, test_size = 0.3, random_state = 817)

gnb = GaussianNB()
gnb.fit(train_data, train_label.values.ravel())

print('Accuracy score for Navie Bayes:')
print(gnb.score(test_data, test_label))

In [None]:
# Predict on test set
gnb_predictions = gnb.predict(X_test) 

In [None]:
# accuracy on X_test 
accuracy = gnb.score(X_test, y_test) 
print accuracy 

In [None]:
from sklearn.metrics import confusion_matrix 
# creating a confusion matrix 
cm = confusion_matrix(y_test, gnb_predictions) 
print '\n confussion matrix:\n',confusion_matrix(y_test, gnb_predictions)

In [None]:
# wclf = SVC(kernel='linear', C= 1, class_weight={1: 10})
# wclf.fit(X, y)
# weighted_prediction = wclf.predict(X_test)

print 'Accuracy:', accuracy_score(y_test, gnb_predictions)

In [None]:
print 'Recall:', recall_score(y_test, gnb_predictions,
                              average='weighted')
print 'Precision:', precision_score(y_test, gnb_predictions,
                                    average='weighted')
print 'F1 score:', f1_score(y_test, gnb_predictions,average='weighted')

In [None]:
from sklearn.metrics import classification_report
print '\n clasification report:\n', classification_report(y_test, gnb_predictions)

### Decision Tree 

In [31]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.932
Accuracy on test set: 0.508


In [51]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=50,min_samples_split=100,random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.658
Accuracy on test set: 0.611


In [11]:
from sklearn.model_selection import GridSearchCV

# Provide range for max_depth from 1 to 20 with an interval of 2 and  
# min_samples_split from 1 to 100 with an interval of 10 

paramgrid = {'max_depth': list(range(1, 20, 2)),'min_samples_split': list(range(10, 100, 10)) }

In [14]:
# instantiate and fit the grid
tree = DecisionTreeClassifier(random_state=0)
grid_search=GridSearchCV(tree,paramgrid, scoring= 'accuracy',return_train_score=False)

In [15]:
# fit the grid with data
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# examine the best model
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [33]:
tree = DecisionTreeClassifier(max_depth=1,min_samples_split=10,random_state=0)
tree.fit(X_train, y_train)X_test, y_testX_test, y_test

print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.583
Accuracy on test set: 0.583


In [43]:
loan_features =  airbnb_train.columns
print("Feature importances:\n{}".format(tree.feature_importances_))

Feature importances:
[3.34196607e-01 4.23069541e-02 7.16802644e-02 3.87686127e-02
 7.32587935e-02 1.23219554e-01 3.11771411e-04 0.00000000e+00
 3.26344149e-04 2.33588458e-04 2.45160161e-03 2.00295698e-04
 8.30186892e-03 2.12352500e-03 1.95040724e-04 2.70961003e-03
 0.00000000e+00 8.17081059e-05 0.00000000e+00 0.00000000e+00
 9.86231874e-04 7.86768882e-04 2.58072459e-03 4.16647486e-04
 2.40561348e-04 2.16888568e-04 1.06247980e-03 1.15140913e-03
 7.40587149e-04 1.42221150e-04 3.53634544e-04 3.44883092e-03
 1.47005616e-02 4.72828783e-03 4.71416664e-03 3.07567433e-03
 7.45443362e-03 7.93176008e-03 7.98859032e-03 4.53074727e-03
 3.04542126e-03 8.27689804e-05 4.37028943e-03 3.57360139e-04
 3.80890329e-03 1.17482608e-03 7.43397128e-03 0.00000000e+00
 6.29896966e-04 3.35181169e-04 7.56030113e-03 1.06489100e-03
 2.91622955e-03 0.00000000e+00 1.29906946e-03 0.00000000e+00
 2.41132492e-04 4.90802081e-04 7.54078565e-03 4.15887986e-03
 8.22840203e-03 1.12880811e-02 4.27392685e-03 4.86238374e-03
 2.

In [None]:
print 'Recall:', recall_score(y_test, gnb_predictions,
                              average='weighted')
print 'Precision:', precision_score(y_test, gnb_predictions,
                                    average='weighted')
print 'F1 score:', f1_score(y_test, gnb_predictions,average='weighted')

In [45]:
from sklearn.metrics import classification_report
print ('\n clasification report:\n', classification_report(y_test, gnb_predictions))

NameError: name 'gnb_predictions' is not defined

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 
forest_class = RandomForestClassifier(random_state = 42)

n_estimators = [100, 500]
min_samples_split = [10, 20]

param_grid_forest = {'n_estimators' : n_estimators, 'min_samples_split' : min_samples_split}

rand_search_forest = GridSearchCV(forest_class, param_grid_forest, cv = 4, refit = True,
                                 n_jobs = -1, verbose=2)

rand_search_forest.fit(X_train,y_train)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
random_estimator = rand_search_forest.best_estimator_

y_pred_random_estimator = random_estimator.predict_proba(X_train)

In [None]:
y_pred = random_estimator.predict_proba(X_test) 

# We take the 5 highest probabilities for each person
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

# Generating a csv file with the predictions 
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('output_randomForest.csv',index=False)