In [439]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.cross_validation import cross_val_score
from sklearn import grid_search

%matplotlib inline

In this challenge, you are given a list of users along with their demographics, web session records, and some summary statistics. You are asked to predict which country a new user's first booking destination will be. All the users in this dataset are from the USA.

There are 12 possible outcomes of the destination country: 'US', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL','DE', 'AU', 'NDF' (no destination found), and 'other'. Please note that 'NDF' is different from 'other' because 'other' means there was a booking, but is to a country not included in the list, while 'NDF' means there wasn't a booking.

The training and test sets are split by dates. In the test set, you will predict all the new users with first activities after 7/1/2014 (note: this is updated on 12/5/15 when the competition restarted). In the sessions dataset, the data only dates back to 1/1/2014, while the users dataset dates back to 2010. 



In [454]:
#summary statistics of users' age group, gender, country of destination
#extract age of each country_destination as it simliar age gorup people tend to go to places with simliar age groups?
age_gender = pd.read_csv("./data/age_gender_bkts.csv")
# summary statistics of destination countries in this dataset and their locations
countries = pd.read_csv("./data/countries.csv")
#training dataset
train_df = pd.read_csv("./data/train_users_2.csv")
#testing dataset
test_df = pd.read_csv("./data/test_users.csv")
#session data
session_df = pd.read_csv("./data/sessions.csv")

In [303]:
print(age_gender.info())
print(age_gender.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420 entries, 0 to 419
Data columns (total 5 columns):
age_bucket                 420 non-null object
country_destination        420 non-null object
gender                     420 non-null object
population_in_thousands    420 non-null float64
year                       420 non-null float64
dtypes: float64(2), object(3)
memory usage: 19.7+ KB
None
  age_bucket country_destination gender  population_in_thousands  year
0       100+                  AU   male                        1  2015
1      95-99                  AU   male                        9  2015
2      90-94                  AU   male                       47  2015
3      85-89                  AU   male                      118  2015
4      80-84                  AU   male                      199  2015


In [330]:
def convert_age_bucket(row):
    return_val = int()
    if '+' in row['age_bucket']:
        return_val = int(row['age_bucket'][0:-1])
    else:
        #split the string by the minus
        two_nums = row['age_bucket'].split('-')
        return_val = (int(two_nums[0]) + int(two_nums[1]))/2
    return return_val

age_gender['age_bucket'] = age_gender.apply(lambda x : convert_age_bucket(x), axis=1)
        
        

age_gender.head()

Unnamed: 0,age_bucket,country_destination,gender,population_in_thousands,year
0,100,AU,male,1,2015
1,97,AU,male,9,2015
2,92,AU,male,47,2015
3,87,AU,male,118,2015
4,82,AU,male,199,2015


In [249]:
new_age_df = age_gender.pivot_table(index='age_bucket', columns='country_destination',values='population_in_thousands')

In [250]:
new_age_df.head()

country_destination,AU,CA,DE,ES,FR,GB,IT,NL,PT,US
age_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,802.5,1018.0,1762.0,1235.0,1986.5,1934.5,1425.5,450.0,232.0,10547.0
7,764.5,985.5,1735.5,1271.0,1953.0,1960.0,1434.0,461.5,257.0,10416.5
12,725.0,956.0,1846.0,1156.5,1939.5,1730.5,1428.0,505.0,277.0,10558.5
17,752.5,1031.5,2025.0,1057.0,1966.0,1823.5,1452.0,497.0,271.5,10797.5
22,803.5,1181.0,2213.5,1140.5,1993.5,2009.0,1557.5,515.5,280.0,11347.5


In [251]:
train_df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [388]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62096 entries, 0 to 62095
Data columns (total 15 columns):
id                         62096 non-null object
date_account_created       62096 non-null object
timestamp_first_active     62096 non-null int64
date_first_booking         0 non-null float64
gender                     62096 non-null object
age                        33220 non-null float64
signup_method              62096 non-null object
signup_flow                62096 non-null int64
language                   62096 non-null object
affiliate_channel          62096 non-null object
affiliate_provider         62096 non-null object
first_affiliate_tracked    62076 non-null object
signup_app                 62096 non-null object
first_device_type          62096 non-null object
first_browser              62096 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 7.6+ MB


In [152]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62096 entries, 0 to 62095
Data columns (total 15 columns):
id                         62096 non-null object
date_account_created       62096 non-null object
timestamp_first_active     62096 non-null int64
date_first_booking         0 non-null float64
gender                     62096 non-null object
age                        33220 non-null float64
signup_method              62096 non-null object
signup_flow                62096 non-null int64
language                   62096 non-null object
affiliate_channel          62096 non-null object
affiliate_provider         62096 non-null object
first_affiliate_tracked    62076 non-null object
signup_app                 62096 non-null object
first_device_type          62096 non-null object
first_browser              62096 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 7.6+ MB


In [342]:
acol = ['gender','signup_method','signup_flow','language',
        'affiliate_channel','affiliate_provider','first_affiliate_tracked','signup_app','first_device_type',
       'first_browser']

for item in acol:
    print '------for column-----',item
    print train_df[item].value_counts()

------for column----- gender
0    111017
1    102434
dtype: int64
------for column----- signup_method
basic       152897
facebook     60008
google         546
dtype: int64
------for column----- signup_flow
0     164739
25     14659
12      9329
3       8822
2       6881
24      4328
23      2835
1       1047
6        301
8        240
21       196
5         36
20        14
16        11
15        10
10         2
4          1
dtype: int64
------for column----- language
en    206314
zh      1632
fr      1172
es       915
ko       747
de       732
it       514
ru       389
pt       240
ja       225
sv       122
nl        97
tr        64
da        58
pl        54
cs        32
no        30
th        24
el        24
id        22
hu        18
fi        14
is         5
ca         5
hr         2
dtype: int64
------for column----- affiliate_channel
direct           137727
sem-brand         26045
sem-non-brand     18844
other              8961
seo                8663
api                8167
content

In [455]:

# some age values are given as years so we need to convert them to the appropriate ages.
def clean_age(row):
    return_val = int()
    if row['age'] >= 2000:
        return_val = np.nan
    elif row['age'] > 1800:
        return_val = 2015 - row['age']
    elif row['age'] > 120:
        return np.nan
    else:
        return row['age']


train_df['age'] = train_df.apply(lambda x : clean_age(x), axis=1)
test_df['age'] = test_df.apply(lambda x : clean_age(x), axis=1)

def impute_age(row):
    return_val = int()
    if pd.isnull(row.age):
        if row.first_browser in age_device_averages:
            return_val = age_device_averages[row.first_browser]
        else:
            return_val = train_df.age.mean()
    else:
        return_val = row.age
    return return_val
    
    

age_device_averages = pd.pivot_table(train_df[pd.notnull(train_df.age)], values='age', index=['first_browser'], aggfunc=np.mean)


#fill NaN values of age to follow the age 

train_df['age'] = train_df.apply(lambda x : impute_age(x), axis=1)
test_df['age'] = test_df.apply(lambda x : impute_age(x), axis=1)
print(train_df.info())
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id                         213451 non-null object
date_account_created       213451 non-null object
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null object
gender                     213451 non-null object
age                        213451 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 27.7+ MB
None
<class 'pandas.core.fr

In [456]:
catcols = ["date_account_created","gender","signup_method",
           "signup_flow","language","affiliate_channel","signup_app",
          "first_device_type","first_browser"]
#drop date_first_booking as the test set does not have that feature
dropcol = ['date_first_booking','timestamp_first_active','first_affiliate_tracked','affiliate_provider']
train_df.drop(dropcol, axis=1, inplace=True)
train_df.drop('id',axis=1, inplace=True)
test_df.drop(dropcol,axis=1, inplace=True)

#impute first_browser
train_df['first_browser'][train_df.first_browser =='-unknown-'] = 'Chrome'

#gender conversion to be 0 and 1
def convert_gender(row):
    if row.gender == "FEMALE":
        return 0
    if row.gender == "MALE":
        return 1
    else:
        return np.random.randint(2)

train_df['gender'] = train_df.apply(lambda x : convert_gender(x), axis=1)
test_df['gender'] = test_df.apply(lambda x : convert_gender(x), axis=1)


#date_account_created conver to date_month_created
def convert_month(row):
    split_date = row['date_account_created'].split('-')
    month = split_date[1]
    return month

train_df['date_account_created'] = train_df.apply(lambda x : convert_month(x), axis=1)
test_df['date_account_created'] = test_df.apply(lambda x : convert_month(x), axis=1)

#map categorical columns
train_df['country_destination'] = train_df['country_destination'].astype('category')
for col in catcols:
#     train_df[col] = train_df[col].astype('category')
#     test_df[col] = test_df[col].astype('category')
#     train_df[col] = trian_df[col].
    dummies = pd.get_dummies(train_df[col]).rename(columns=lambda x: 'Category_' + str(x))
    dummies2 = pd.get_dummies(test_df[col]).rename(columns=lambda x: 'Category_' + str(x))
    train_df = pd.concat([train_df, dummies], axis=1)
    test_df = pd.concat([test_df, dummies2], axis=1)
    train_df.drop([col], inplace=True, axis=1)
    test_df.drop([col], inplace=True, axis=1)




A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [457]:
labels = train_df['country_destination'].cat.codes.values
train_df.drop('country_destination',inplace=True,axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(train_df, labels, test_size=0.2, random_state=1)

In [458]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train,Y_train)
print classification_report(Y_test,clf.predict(X_test))
print "cross val score: {}".format(cross_val_score(clf, train_df, labels, cv=5).mean())

# treeclf = DecisionTreeClassifier(max_depth=20)
# treeclf.fit(X_train,Y_train)
# print classification_report(Y_test,treeclf.predict(X_test))
# print "cross val score: {}".format(cross_val_score(treeclf, X, y, cv=5).mean())

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       102
          1       0.01      0.00      0.01       282
          2       0.03      0.01      0.01       213
          3       0.01      0.00      0.00       459
          4       0.02      0.01      0.01       963
          5       0.02      0.01      0.01       488
          6       0.02      0.01      0.01       586
          7       0.67      0.80      0.73     24998
          8       0.00      0.00      0.00       140
          9       0.00      0.00      0.00        43
         10       0.44      0.39      0.42     12427
         11       0.07      0.03      0.04      1990

avg / total       0.53      0.58      0.55     42691

cross val score: 0.534345020132


In [450]:
labels

array([ 7,  7, 10, ...,  7,  7,  7], dtype=int8)