__LOGISTIC REGRESSION ON AIRBNB DATA__

__CAPSTONE PROJECT #1 MACHINE LEARNING APPLICATION__


In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import sklearn.model_selection

# First three colours of the active seaborn palette, kept as short aliases.
c0, c1, c2 = sns.color_palette()[:3]

# Colormaps shared by the plotting helpers below.
# Pale colours for background regions, saturated ones for the points themselves.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# NOTE: this rebinds `cm`, shadowing the `matplotlib.cm` module imported above.
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

def points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light, 
                cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
    """Scatter train/test points on ``ax``, optionally over the classifier's decision mesh.

    The classifier is evaluated on a 100x100 grid spanning the first two feature
    columns of the combined train and test points, padded by 0.5 on every side.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives every artist drawn by this function.
    Xtr, Xte : 2-D arrays
        Training / test features; only columns 0 and 1 are used.
    ytr, yte : arrays
        Training / test labels. 1 is subtracted before colour-mapping
        (presumably labels are numeric and 1-based -- TODO confirm with callers).
    clf : classifier
        Must expose ``predict``, and ``predict_proba`` when ``zfunc`` is given.
    mesh : bool
        Draw the background mesh of grid predictions when true.
    colorscale : colormap
        Colormap for the background mesh.
    cdiscrete : colormap
        Colormap for the scattered points.
    alpha : float
        Opacity applied to both the mesh and the points.
    psize : number
        Marker size for training points; test points use ``psize + 10``.
    zfunc : callable or False
        When given, called as ``zfunc(p0, p1)`` with the grid's class-0 and
        class-1 probabilities; its result replaces ``clf.predict`` for the mesh.
    predicted : bool
        Colour points by ``clf.predict`` instead of by the true labels.

    Returns
    -------
    tuple
        ``(ax, xx, yy)`` where ``xx`` and ``yy`` are the meshgrid arrays.
    """
    X = np.concatenate((Xtr, Xte))
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    # Flatten the grid once into an (n_points, 2) matrix for the classifier.
    grid = np.c_[xx.ravel(), yy.ravel()]

    if zfunc:
        # Evaluate the probabilities a single time and hand both columns to zfunc.
        proba = clf.predict_proba(grid)
        Z = zfunc(proba[:, 0], proba[:, 1])
    else:
        Z = clf.predict(grid)
    ZZ = Z.reshape(xx.shape)
    if mesh:
        # Draw on the axes that was passed in (not pyplot's current axes) and
        # honour the caller-supplied colormap.
        ax.pcolormesh(xx, yy, ZZ, cmap=colorscale, alpha=alpha)
    if predicted:
        showtr = clf.predict(Xtr)
        showte = clf.predict(Xte)
    else:
        showtr = ytr
        showte = yte
    # Training points: circles with a black edge.
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cdiscrete,
               s=psize, alpha=alpha, edgecolor="k")
    # Testing points: slightly larger squares.
    ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cdiscrete,
               alpha=alpha, marker="s", s=psize+10)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    return ax,xx,yy

def points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light, 
                     cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):
    """Scatter predicted labels on ``ax`` and overlay class-1 probability contours.

    Points are drawn by ``points_plot`` with ``mesh=False`` and ``predicted=True``,
    so they are coloured by ``clf.predict``. The second column of
    ``clf.predict_proba`` is then evaluated on the same grid and drawn as filled
    contours plus labelled contour lines.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives every artist drawn by this function.
    Xtr, Xte, ytr, yte, clf, colorscale, cdiscrete, psize, alpha
        Forwarded unchanged to ``points_plot``.
    ccolor : colormap
        Colormap for the probability contours.

    Returns
    -------
    matplotlib Axes
        The same ``ax`` that was passed in.
    """
    ax, xx, yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False,
                             colorscale=colorscale, cdiscrete=cdiscrete,
                             psize=psize, alpha=alpha, predicted=True)
    grid = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict_proba(grid)[:, 1].reshape(xx.shape)
    # Use the Axes methods so the contours land on `ax` rather than on
    # whichever axes pyplot currently considers active.
    ax.contourf(xx, yy, Z, cmap=ccolor, alpha=.2)
    cs2 = ax.contour(xx, yy, Z, cmap=ccolor, alpha=.6)
    ax.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14)
    return ax

# Note: this message is printed before the CSV below is read.
print("Loaded")
# Load the dataset; the path is relative to the notebook's working directory.
airbnb = pd.read_csv('data/dfmergefinal.csv')

Loaded


In [2]:
airbnb.shape

(3340486, 30)

In [3]:
airbnb.head()

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance,action,action_type,action_detail,device_type,secs_elapsed,age_bucket,population_in_thousands,year
0,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,62.0,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,other,,,,,,,lookup,,,Windows Desktop,319.0,,,
1,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,62.0,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,other,,,,,,,search_results,click,view_search_results,Windows Desktop,67753.0,,,
2,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,62.0,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,other,,,,,,,lookup,,,Windows Desktop,301.0,,,
3,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,62.0,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,other,,,,,,,search_results,click,view_search_results,Windows Desktop,22141.0,,,
4,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,62.0,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,other,,,,,,,lookup,,,Windows Desktop,435.0,,,


In [4]:
# Prediction target. The double brackets keep it as a one-column DataFrame
# rather than a Series.
country_destination = airbnb[['country_destination']]

In [5]:
country_destination.head()

Unnamed: 0,country_destination
0,other
1,other
2,other
3,other
4,other


In [15]:
# Feature frame: everything except the target column.
airbnb_df = airbnb.drop(columns=['country_destination'])

In [16]:
airbnb_df.shape

(3340486, 29)

In [17]:
# Replace the numeric age with a categorical life-stage bucket.
# Bins are right-closed: (0, 2], (2, 10], (10, 17], (17, 35], (35, 65], (65, 150];
# values outside these edges come out of pd.cut as NaN.
age_bin_edges = [0, 2, 10, 17, 35, 65, 150]
age_bin_labels = ['Toddler', 'Child', 'Young Adult', 'Adult', 'Middle Age', 'Elderly']
airbnb_df['age'] = pd.cut(airbnb_df['age'], bins=age_bin_edges, labels=age_bin_labels)

In [18]:
airbnb_df.head()

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance,action,action_type,action_detail,device_type,secs_elapsed,age_bucket,population_in_thousands,year
0,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,319.0,,,
1,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,search_results,click,view_search_results,Windows Desktop,67753.0,,,
2,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,301.0,,,
3,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,search_results,click,view_search_results,Windows Desktop,22141.0,,,
4,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,435.0,,,


In [19]:
airbnb_df.shape

(3340486, 29)

In [20]:
# Remove the user identifier and the date/timestamp columns from the features.
id_and_date_columns = [
    'user_id',
    'date_account_created',
    'timestamp_first_active',
    'date_first_booking',
]
airbnb_df = airbnb_df.drop(columns=id_and_date_columns)

In [21]:
airbnb_df.shape

(3340486, 25)

In [22]:
airbnb_df.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance,action,action_type,action_detail,device_type,secs_elapsed,age_bucket,population_in_thousands,year
0,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,319.0,,,
1,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,search_results,click,view_search_results,Windows Desktop,67753.0,,,
2,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,301.0,,,
3,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,search_results,click,view_search_results,Windows Desktop,22141.0,,,
4,MALE,Middle Age,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,435.0,,,


In [23]:
# The meaning of `signup_flow` is unclear, so its value distribution is inspected
# here (NaN included) before the column is dropped in the next cell.
airbnb_df['signup_flow'].value_counts(dropna=False)

0     2650986
25     263652
12     219121
23     133799
24      69657
8        3225
21         46
Name: signup_flow, dtype: int64

In [24]:
# Drop `signup_flow` from the feature frame.
airbnb_df = airbnb_df.drop(columns=['signup_flow'])

In [25]:
airbnb_df.shape

(3340486, 24)

In [26]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [27]:
airbnb_df['secs_elapsed'].isna().sum()

47250

In [28]:
# Impute missing `secs_elapsed` values with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on the
# column selection is chained assignment, which is not guaranteed to modify
# `airbnb_df` (and does not under pandas Copy-on-Write).
airbnb_df['secs_elapsed'] = airbnb_df['secs_elapsed'].fillna(airbnb_df['secs_elapsed'].mean())

In [29]:
airbnb_df['secs_elapsed'].isna().sum()

0

In [30]:
# Standardise `secs_elapsed` in place. Double brackets are used because the
# scaler works on a 2-D (n_rows, 1) input.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# made later in the notebook, so held-out rows contribute to the scaling
# statistics -- confirm this is acceptable.
airbnb_df[['secs_elapsed']] = StandardScaler().fit_transform(airbnb_df[['secs_elapsed']])

In [31]:
airbnb_df.shape

(3340486, 24)

In [32]:
airbnb_df['secs_elapsed']

0         -2.240747e-01
1          5.017683e-01
2         -2.242685e-01
3          1.081192e-02
4         -2.228261e-01
5         -1.445952e-01
6         -2.262705e-01
7         -2.185637e-01
8         -3.170196e-03
9         -2.201567e-01
10         4.105024e-01
11        -2.264858e-01
12        -2.124499e-01
13         5.785462e-01
14        -2.247636e-01
15        -1.399560e-01
16        -2.255386e-01
17         1.662284e+00
18        -2.265827e-01
19        -2.109860e-01
20         6.685850e-01
21        -2.236334e-01
22        -2.007281e-01
23        -2.226001e-01
24        -1.193003e-01
25        -2.269702e-01
26        -2.060239e-01
27         6.579504e-01
28        -2.264105e-01
29        -2.202644e-01
               ...     
3340456   -2.273792e-01
3340457   -2.268087e-01
3340458    1.129661e+00
3340459   -2.226754e-01
3340460   -2.236765e-01
3340461    1.499461e+00
3340462   -2.180147e-01
3340463   -1.200430e-01
3340464   -2.204258e-01
3340465    2.628129e-01
3340466   -2.028

In [33]:
airbnb_df.head()

Unnamed: 0,gender,age,signup_method,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance,action,action_type,action_detail,device_type,secs_elapsed,age_bucket,population_in_thousands,year
0,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,-0.224075,,,
1,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,search_results,click,view_search_results,Windows Desktop,0.501768,,,
2,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,-0.224268,,,
3,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,search_results,click,view_search_results,Windows Desktop,0.010812,,,
4,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,,,,,,lookup,,,Windows Desktop,-0.222826,,,


In [34]:
# Exploratory preview of one-hot encoding: each categorical column is expanded,
# with an extra `<column>_nan` indicator because dummy_na=True.
# The result is only displayed here; it is not assigned to a variable.
pd.get_dummies(airbnb_df, dummy_na=True)

Unnamed: 0,lat_destination,lng_destination,distance_km,destination_km2,language_levenshtein_distance,secs_elapsed,age_bucket,population_in_thousands,year,gender_FEMALE,gender_MALE,gender_OTHER,gender_nan,age_Toddler,age_Child,age_Young Adult,age_Adult,age_Middle Age,age_Elderly,age_nan,signup_method_basic,signup_method_facebook,signup_method_google,signup_method_nan,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,language_hu,language_id,language_is,language_it,language_ja,language_ko,language_nl,language_no,language_pl,language_pt,language_ru,language_sv,language_th,language_tr,language_zh,language_nan,affiliate_channel_api,affiliate_channel_content,...,action_detail_send_message,action_detail_set_password,action_detail_set_password_page,action_detail_signup,action_detail_signup_login_page,action_detail_signup_modal,action_detail_similar_listings,action_detail_special_offer_field,action_detail_terms_and_privacy,action_detail_toggle_archived_thread,action_detail_toggle_starred_thread,action_detail_translate_listing_reviews,action_detail_translations,action_detail_trip_availability,action_detail_unavailable_dates,action_detail_update_listing,action_detail_update_listing_description,action_detail_update_user,action_detail_update_user_profile,action_detail_user_friend_recommendations,action_detail_user_listings,action_detail_user_profile,action_detail_user_profile_content_update,action_detail_user_reviews,action_detail_user_social_connections,action_detail_user_tax_forms,action_detail_user_wishlists,action_detail_view_listing,action_detail_view_search_results,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips,action_detail_nan,device_type_Android App Unknown Phone/Tablet,device_type_Android Phone,device_type_Blackberry,device_type_Chromebook,device_type_Linux Desktop,device_type_Mac 
Desktop,device_type_Opera Phone,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch,device_type_nan
0,,,,,,-2.240747e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,,,,,,5.017683e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,,,,,,-2.242685e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,,,,,,1.081192e-02,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,,,,,,-2.228261e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5,,,,,,-1.445952e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,,,,,,-2.262705e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
7,,,,,,-2.185637e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,,,,,,-3.170196e-03,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
9,,,,,,-2.201567e-01,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [35]:
# Remove the destination geography / language-distance columns from the features.
destination_columns = [
    'lat_destination',
    'lng_destination',
    'distance_km',
    'destination_km2',
    'language_levenshtein_distance',
]
airbnb_df = airbnb_df.drop(columns=destination_columns)

In [36]:
airbnb_df.shape

(3340486, 19)

In [38]:
airbnb_df.head()

Unnamed: 0,gender,age,signup_method,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,destination_language,action,action_type,action_detail,device_type,secs_elapsed,age_bucket,population_in_thousands,year
0,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,lookup,,,Windows Desktop,-0.224075,,,
1,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,search_results,click,view_search_results,Windows Desktop,0.501768,,,
2,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,lookup,,,Windows Desktop,-0.224268,,,
3,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,search_results,click,view_search_results,Windows Desktop,0.010812,,,
4,MALE,Middle Age,basic,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,,lookup,,,Windows Desktop,-0.222826,,,


In [39]:
airbnb_df.shape

(3340486, 19)

In [40]:
# One-hot encode the categorical columns of the feature frame. dummy_na=True adds
# a `<column>_nan` indicator per encoded column; numeric columns are carried
# through unchanged.
airbnb_df_dummied = pd.get_dummies(airbnb_df, dummy_na=True)

In [41]:
airbnb_df_dummied.shape

(3340486, 549)

In [42]:
airbnb_df_dummied.head()

Unnamed: 0,secs_elapsed,age_bucket,population_in_thousands,year,gender_FEMALE,gender_MALE,gender_OTHER,gender_nan,age_Toddler,age_Child,age_Young Adult,age_Adult,age_Middle Age,age_Elderly,age_nan,signup_method_basic,signup_method_facebook,signup_method_google,signup_method_nan,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,language_hu,language_id,language_is,language_it,language_ja,language_ko,language_nl,language_no,language_pl,language_pt,language_ru,language_sv,language_th,language_tr,language_zh,language_nan,affiliate_channel_api,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,...,action_detail_send_message,action_detail_set_password,action_detail_set_password_page,action_detail_signup,action_detail_signup_login_page,action_detail_signup_modal,action_detail_similar_listings,action_detail_special_offer_field,action_detail_terms_and_privacy,action_detail_toggle_archived_thread,action_detail_toggle_starred_thread,action_detail_translate_listing_reviews,action_detail_translations,action_detail_trip_availability,action_detail_unavailable_dates,action_detail_update_listing,action_detail_update_listing_description,action_detail_update_user,action_detail_update_user_profile,action_detail_user_friend_recommendations,action_detail_user_listings,action_detail_user_profile,action_detail_user_profile_content_update,action_detail_user_reviews,action_detail_user_social_connections,action_detail_user_tax_forms,action_detail_user_wishlists,action_detail_view_listing,action_detail_view_search_results,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips,action_detail_nan,device_type_Android App Unknown Phone/Tablet,device_type_Android 
Phone,device_type_Blackberry,device_type_Chromebook,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Opera Phone,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch,device_type_nan
0,-0.224075,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0.501768,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,-0.224268,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0.010812,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,-0.222826,,,,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [43]:
airbnb_df_dummied['age_bucket'].value_counts(dropna=False)

NaN    3340486
Name: age_bucket, dtype: int64

In [44]:
airbnb_df_dummied['population_in_thousands'].value_counts(dropna=False)

NaN    3340486
Name: population_in_thousands, dtype: int64

In [45]:
airbnb_df_dummied['year'].value_counts(dropna=False)

NaN    3340486
Name: year, dtype: int64

In [46]:
# These three columns are NaN in every row (see the value_counts cells above),
# so they are removed from the encoded feature frame.
all_nan_columns = ['age_bucket', 'population_in_thousands', 'year']
airbnb_dummied = airbnb_df_dummied.drop(columns=all_nan_columns)

In [47]:
airbnb_dummied.shape

(3340486, 546)

In [48]:
airbnb_dummied.head()

Unnamed: 0,secs_elapsed,gender_FEMALE,gender_MALE,gender_OTHER,gender_nan,age_Toddler,age_Child,age_Young Adult,age_Adult,age_Middle Age,age_Elderly,age_nan,signup_method_basic,signup_method_facebook,signup_method_google,signup_method_nan,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,language_fr,language_hu,language_id,language_is,language_it,language_ja,language_ko,language_nl,language_no,language_pl,language_pt,language_ru,language_sv,language_th,language_tr,language_zh,language_nan,affiliate_channel_api,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,affiliate_channel_seo,affiliate_channel_nan,affiliate_provider_baidu,...,action_detail_send_message,action_detail_set_password,action_detail_set_password_page,action_detail_signup,action_detail_signup_login_page,action_detail_signup_modal,action_detail_similar_listings,action_detail_special_offer_field,action_detail_terms_and_privacy,action_detail_toggle_archived_thread,action_detail_toggle_starred_thread,action_detail_translate_listing_reviews,action_detail_translations,action_detail_trip_availability,action_detail_unavailable_dates,action_detail_update_listing,action_detail_update_listing_description,action_detail_update_user,action_detail_update_user_profile,action_detail_user_friend_recommendations,action_detail_user_listings,action_detail_user_profile,action_detail_user_profile_content_update,action_detail_user_reviews,action_detail_user_social_connections,action_detail_user_tax_forms,action_detail_user_wishlists,action_detail_view_listing,action_detail_view_search_results,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips,action_detail_nan,device_type_Android App Unknown Phone/Tablet,device_type_Android 
Phone,device_type_Blackberry,device_type_Chromebook,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Opera Phone,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch,device_type_nan
0,-0.224075,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0.501768,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,-0.224268,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0.010812,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,-0.222826,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [50]:
# Prepare the inputs for the train/test split, one step at a time.
# The original one-step split call is kept below, disabled, for reference:
#Xlr, Xtestlr, ylr, ytestlr = train_test_split(airbnb_dummied.values, 
                                            #country_destination.values, random_state=5)

# AJS: build the feature matrix (first argument of train_test_split) and inspect it.
# NOTE(review): `.values` materialises the whole frame as a single dense ndarray
# (3,340,486 x 546 per the printed shape); presumably float64 because
# `secs_elapsed` is a float column, i.e. roughly 14.6 GB -- confirm dtype/memory.
X = airbnb_dummied.values
print("X: ", type(X), X.shape)

X:  <class 'numpy.ndarray'> (3340486, 546)


In [51]:
# what is the shape of the dataframe used to build X?
airbnb_dummied.shape

(3340486, 546)

In [52]:
# Build the target array (second argument of train_test_split).
# `country_destination` is a one-column DataFrame, so `.values` is 2-D with
# shape (n_rows, 1) rather than a flat label vector.
y = country_destination.values

# Sanity check: the array's shape should equal the source frame's shape.
print("y: ", type(y), y.shape)
print("country_destination shape: ", country_destination.shape)

y:  <class 'numpy.ndarray'> (3340486, 1)
country_destination shape:  (3340486, 1)


In [53]:
y

array([['other'],
       ['other'],
       ['other'],
       ...,
       ['US'],
       ['US'],
       ['US']], dtype=object)

In [54]:
# Row-wise split: 80% of rows for training, the remainder for testing;
# random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the head() output near the top shows several consecutive rows
# sharing one user_id and one destination, so rows of the same user can end up
# on both sides of this split -- confirm whether a group-wise split is needed.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(X, y, train_size=0.8, random_state=42)

In [55]:
# Print each piece of the split with its type, shape and length.
# Order: train X, test X, train y, test y.
split_parts = (
    ("Xlr:", Xlr),
    ("Xtestlr", Xtestlr),
    ("ylr", ylr),
    ("ytestlr", ytestlr),
)
for part_label, part in split_parts:
    print("\n")
    print(part_label, part, type(part), part.shape, len(part))



Xlr: [[-0.09229408  0.          1.         ...  1.          0.
   0.        ]
 [-0.08862364  1.          0.         ...  0.          0.
   0.        ]
 [-0.22647505  0.          1.         ...  0.          0.
   0.        ]
 ...
 [-0.21471027  0.          0.         ...  0.          0.
   0.        ]
 [-0.22117928  0.          1.         ...  0.          0.
   0.        ]
 [-0.21807932  0.          1.         ...  0.          0.
   0.        ]] <class 'numpy.ndarray'> (2672388, 546) 2672388


Xtestlr [[-0.22644276  0.          1.         ...  0.          0.
   0.        ]
 [ 0.09314388  0.          0.         ...  1.          0.
   0.        ]
 [-0.2206088   0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.22704553  0.          0.         ...  0.          0.
   0.        ]
 [ 3.403246    1.          0.         ...  0.          0.
   0.        ]
 [-0.18653076  0.          0.         ...  0.          0.
   0.        ]] <class 'numpy.ndarray'> (668098, 546) 668098


y

In [56]:
# AJS: construct the LogisticRegression model.
# A single multinomial model over all destination classes, optimised with the
# newton-cg solver; every other hyperparameter is left at its default.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- check against the installed version.
clf = LogisticRegression(multi_class='multinomial',solver ='newton-cg')



In [None]:
# Fit the model on the training data.
# `ylr` is a column array of shape (n_samples, 1); ravel() flattens it to the
# 1-D label vector passed to fit.
clf.fit(Xlr, ylr.ravel()) 