In [None]:
import collections
from datetime import datetime,timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Load the raw user table and report its dimensions.
df = pd.read_csv('train_users_2.csv/train_users_2.csv')
print(df.shape)

# Account-creation dates are parsed strictly as day-month-year;
# first-booking dates that cannot be parsed become NaT.
df['date_account_created'] = [datetime.strptime(raw, '%d-%m-%Y') for raw in df['date_account_created']]
df['date_first_booking'] = pd.to_datetime(df['date_first_booking'], errors='coerce')

# Order rows chronologically by account creation and renumber them 0..n-1.
df = df.sort_values('date_account_created').reset_index(drop=True)
df.head()

### Derived Variables

In [None]:
# Calendar features derived from the account-creation date, computed with
# vectorised datetime operations instead of per-row Python callbacks.
created = df['date_account_created']

# Days elapsed since the account-creation date stored at index label 0.
df['days_since_first_day'] = (created - created.loc[0]).dt.days
df['Day'] = created.dt.day
df['Month'] = created.dt.month
df['Year'] = created.dt.year
# Quarter and ISO week number are stored as floats, matching the np.ceil output used previously.
df['Quarter'] = np.ceil(created.dt.month / 3)
df['Week_Year'] = created.apply(lambda x: x.isocalendar()[1]).astype(float)

In [None]:
# Parse the first-activity stamp (digits in YYYYmmddHHMMSS order) into datetimes.
first_active_raw = df['timestamp_first_active'].astype(str)
df['timestamp_first_active'] = [datetime.strptime(stamp, '%Y%m%d%H%M%S') for stamp in first_active_raw]

# Whole-day component of (first activity - account creation).
active_gap = df['timestamp_first_active'] - df['date_account_created']
df['firstactivediff'] = active_gap.dt.days
df['firstactivediff'].describe()

### Analysis, Missing and Outlier Treatment

In [None]:
# ID vs Date: number of user ids per account-creation date
cnt_users = df.groupby('date_account_created')['id'].size()
fig, ax = plt.subplots()
ax.scatter(cnt_users.index, cnt_users)
ax.set_xlabel('Date')
ax.set_ylabel('Count of Users')
plt.show()

In [None]:
# First Booking
# Percentage of users with no first-booking date.
n_rows = df.shape[0]
print(df.date_first_booking.isnull().sum() / n_rows * 100)

# Lag between account creation and first booking.
diff = df['date_first_booking'] - df['date_account_created']
print(diff.describe())

# Number of users whose first booking predates their account creation.
booked_lag = diff[diff.notnull()]
sum(booked_lag < '0 days')

In [None]:
# Distribution of other Categorical columns
count_columns = [
    'signup_method',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]
for column in count_columns:
    print(df.groupby(column).size())

# The destination is reported as a percentage of all rows rather than a raw count.
print(df.groupby('country_destination').size() / len(df) * 100)

In [None]:
# Signup Flow
fig, ax = plt.subplots()
ax.hist(df.signup_flow)
ax.set_xlabel('SignUp Flow')
plt.show()

In [None]:
# Gender
# Rewrite the '-unknown-' placeholder as 'None' so it can be treated as missing.
df['gender'] = df['gender'].map(lambda g: g.replace('-unknown-', 'None'))

# Percentage of rows with unknown gender, then the share of every gender value.
n_rows = df.shape[0]
unknown_count = sum(1 for g in df['gender'] if g == 'None')
print((unknown_count / n_rows) * 100)
print(df.groupby('gender')['id'].size() / n_rows)

In [None]:
# Imputing Missing Values in Gender

# Converting categorical to dummy variables
# NOTE(review): columns are picked by position; presumably 4, 6 and 8..15 are the
# categorical columns of the raw file — confirm against the CSV layout.
cat_positions = np.r_[4,6,8:16]
other_positions = np.r_[0:4,5,7,16:len(df.columns)]
categorical = df.iloc[:, cat_positions]
dummy_categorical = pd.get_dummies(categorical)
df_num_dum = pd.concat([df.iloc[:, other_positions], dummy_categorical], axis=1)
df_all = pd.concat([df_num_dum, df.iloc[:, cat_positions]], axis=1)

# Rows flagged gender_None are the ones to impute; the rest are used to fit the classifier.
test = df_num_dum[df_num_dum['gender_None'] == 1]
train = df_num_dum[df_num_dum['gender_None'] != 1]

# Columns kept out of the predictors: the id, the gender dummies, age and the raw date columns.
non_features = ['id', 'gender_None', 'gender_OTHER', 'gender_FEMALE', 'gender_MALE',
                'age', 'date_account_created', 'timestamp_first_active', 'date_first_booking']
X_train = train[train.columns.difference(non_features)]
X_test = test[test.columns.difference(non_features)]
Y_train = df_all.loc[df_all['gender_None'] != 1, 'gender']
Y_test = df_all.loc[df_all['gender_None'] == 1, 'gender']


In [None]:
def _vif_table(frame):
    """Return a two-column table: the VIF of each column of `frame` and the column name."""
    table = pd.DataFrame()
    table["VIF Factor"] = [variance_inflation_factor(frame.values, i) for i in range(frame.shape[1])]
    table["features"] = frame.columns
    return table


# Removing multicollinear variables
# Pass 1: drop predictors whose VIF is infinite.
vif = _vif_table(X_train)
noinf = list(vif[~np.isinf(vif['VIF Factor'])].features)
filt_Xtrain = X_train[noinf]

# Pass 2: recompute on the survivors and keep VIF below 50.
vif2 = _vif_table(filt_Xtrain)
viffilt = list(vif2[vif2['VIF Factor'] < 50].features)
filt2_Xtrain = filt_Xtrain[viffilt]

# Pass 3: recompute once more and keep VIF below 6.
vif3 = _vif_table(filt2_Xtrain)
filt = list(vif3[vif3['VIF Factor'] < 6].features)
filt3_Xtrain = filt_Xtrain[filt]


# Linear Discriminant Analysis to predict Gender
clf = LinearDiscriminantAnalysis()
clf.fit(filt3_Xtrain, Y_train)

In [None]:
# Prediction and checking
# Predict gender for the rows where it is unknown and show the predicted class shares.
pred = clf.predict(X_test[filt])
pred_gender = pd.DataFrame({'gender_pred': pred}, index=Y_test.index)
pred_share = pred_gender.groupby('gender_pred').size() / pred_gender.shape[0]
print(pred_share)

#Imputing Gender with predicted values
# Predictions are written positionally, in row order, into the rows whose gender is 'None'.
needs_gender = df.gender == 'None'
df.loc[needs_gender, 'gender'] = list(pred)
print(df.groupby('gender').size() / df.shape[0])

In [None]:
# Age
# Checking percentiles of the non-missing ages
print(np.nanpercentile(df.age,[0,1,5,10,50,90,95,99]))
print(len(df.loc[df.age<6,'age']))
# Removing rows with age < 6; rows with a missing age are kept for imputation below.
# .copy() yields an independent frame, so the .loc assignments that follow write to
# it directly rather than to a slice of the previous frame (avoids SettingWithCopyWarning).
df = df[(df.age > 5) | df.age.isnull()].copy()
print(df.shape)

# Capping ages above 105 at 105
df.loc[df.age > 105.0,'age'] = 105.0

# Imputing missing values with the median of the remaining ages
print(df.age.isnull().sum()/df.shape[0]* 100)
df.loc[df.age.isnull(),'age'] = df.age.median()

plt.hist(df.age)
plt.xlabel('Age after imputation')
plt.show()

### Train - Test Splitting

In [None]:
# Date-based hold-out: accounts created before the cutoff are "Train", the rest "Test".
split_cutoff = '2014-02-01 00:00:00'
created_before = df.date_account_created < split_cutoff
created_from = df.date_account_created >= split_cutoff
df.loc[created_before, 'Split'] = "Train"
df.loc[created_from, 'Split'] = "Test"

# Percentage of rows in each split.
df.groupby('Split').size() / len(df) * 100

### Removing multicollinear variables 

In [None]:
# Build the modelling frame by column NAME rather than by position.
# With the cells run in order, position 22 is 'firstactivediff' and 'Split' sits at
# position 23, so the positional selection never carried 'Split' into `inp` even
# though later cells filter on it. Selecting by name keeps the columns the old
# selection picked and adds 'Split'.
# NOTE(review): the names assume positions 4, 6, 8..14, 15, 5 and 7 of the raw CSV
# are the columns listed here — confirm against the file.
categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                    'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                    'first_device_type', 'first_browser']
categorical = df[categorical_cols]
dummy_categorical = pd.get_dummies(categorical)
inp = pd.concat([df[['firstactivediff', 'country_destination', 'age', 'signup_flow', 'Split']],
                 dummy_categorical], axis=1)

# Drop one dummy level per categorical variable so the remaining dummies of a
# variable are not perfectly collinear.
inp = inp[inp.columns.difference(['gender_OTHER','signup_method_basic','language_zh','affiliate_channel_other','affiliate_provider_other','first_affiliate_tracked_untracked','signup_app_iOS','first_device_type_Other/Unknown','first_browser_-unknown-',])]

# VIF of every predictor (the split flag and the target are not predictors).
temp = inp[inp.columns.difference(['Split','country_destination'])]
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(temp.values, i) for i in range(temp.shape[1])]
vif["features"] = temp.columns

### Resampling

In [None]:
from sklearn.utils import resample

# Rebalance to 10,000 rows per destination. "NDF", "US" and "other" are sampled
# without replacement; every other destination is sampled with resample's default
# setting (with replacement).
# The per-class samples are collected in a list and concatenated once:
# DataFrame.append copied the whole frame on each call and no longer exists in pandas 2.
no_replacement = ["NDF","US","other"]
dest = inp['country_destination'].unique()
sampled_parts = []
for i in dest:
    class_rows = inp[inp['country_destination'] == i]
    if i in no_replacement:
        t = resample(class_rows,replace = False,n_samples =10000,random_state = 1234)
    else:
        t = resample(class_rows,n_samples = 10000,random_state = 1234)
    sampled_parts.append(t)
upsample = pd.concat(sampled_parts, ignore_index=True)
print(upsample.shape)


# Predictors are the features whose VIF is below 5; rows are divided by the Split flag.
low_vif_features = vif[vif['VIF Factor']<5].features
is_train = upsample.Split == 'Train'
is_test = upsample.Split == 'Test'
trn_X = upsample.loc[is_train,low_vif_features]
test_X = upsample.loc[is_test,low_vif_features]
trn_Y = upsample.loc[is_train,'country_destination']
test_Y = upsample.loc[is_test,'country_destination']

### NDCG - Evaluation Metric Calculation 

In [None]:
def dcg(y_true, y_score, k=5):
    """Discounted cumulative gain of the k highest-scored items.

    Parameters
    ----------
    y_true : array-like
        Relevance of each item.
    y_score : array-like
        Predicted score of each item, same length as ``y_true``.
    k : int
        Number of top-ranked items to take into account.

    Returns
    -------
    float
        Sum of ``(2**relevance - 1) / log2(rank + 1)`` over the top-k items,
        with rank starting at 1.
    """
    # Item positions sorted by descending score.
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg(ground_truth, predictions, k=5, classes=None):
    """Mean NDCG@k of per-row class scores against one true label per row.

    Parameters
    ----------
    ground_truth : array-like
        True label of every row. When ``classes`` is None, integer labels are
        used directly as column indices into ``predictions``; any other labels
        are mapped to columns by their sorted order.
    predictions : array-like of shape (n_rows, n_classes)
        Score of every class for every row.
    k : int
        Cut-off rank.
    classes : sequence, optional
        Label of each prediction column, in column order. Use it when the
        labels in ``ground_truth`` do not cover every column.

    Returns
    -------
    float
        Mean over rows of DCG@k divided by the ideal DCG@k; NaN for empty input.

    Raises
    ------
    ValueError
        If the inputs have inconsistent shapes, or if the column order cannot
        be inferred because the labels do not cover every column.
    KeyError
        If a label is not found among the known classes.
    """
    truth = np.asarray(ground_truth)
    score_matrix = np.asarray(predictions, dtype=float)
    if score_matrix.ndim != 2:
        raise ValueError("predictions must be 2-dimensional (rows x classes)")
    if len(truth) != len(score_matrix):
        raise ValueError("ground_truth and predictions must have the same number of rows")
    if len(truth) == 0:
        return float('nan')

    n_classes = score_matrix.shape[1]
    if classes is None and np.issubdtype(truth.dtype, np.integer):
        # Already encoded: each label is the index of its column.
        columns = truth
    else:
        if classes is None:
            classes = np.unique(truth)
            if len(classes) != n_classes:
                raise ValueError(
                    "cannot infer the column order: ground_truth has %d distinct labels "
                    "but predictions has %d columns; pass `classes`" % (len(classes), n_classes))
        lookup = {label: position for position, label in enumerate(classes)}
        columns = np.array([lookup[label] for label in truth])

    # One-hot relevance: 1 in the column of the true label, 0 elsewhere.
    relevance = np.zeros_like(score_matrix)
    relevance[np.arange(len(truth)), columns] = 1.0

    scores = []
    # Iterate over each row and normalise its DCG by the ideal DCG.
    for y_true, y_score in zip(relevance, score_matrix):
        actual = dcg(y_true, y_score, k)
        best = dcg(y_true, y_true, k)
        scores.append(float(actual) / float(best))

    return np.mean(scores)

### Logistic Regression 

In [None]:
# Multinomial logistic regression fitted on the resampled training split.
model_lr_resample = linear_model.LogisticRegression(multi_class = 'multinomial',solver ='lbfgs',max_iter = 100)
result_lr_resample = model_lr_resample.fit(trn_X,trn_Y)
pred_lr_resample = model_lr_resample.predict_proba(test_X)
# Accuracy compares hard class labels, not the probability matrix; score()
# predicts labels for test_X and returns the mean accuracy against test_Y.
model_lr_resample.score(test_X,test_Y)

In [None]:
# Using LBFGS Solver
model_lr_resample = linear_model.LogisticRegression(multi_class = 'multinomial',solver ='lbfgs',max_iter = 100)
result_lr_resample = model_lr_resample.fit(trn_X,trn_Y)
pred_lr_resample = model_lr_resample.predict_proba(test_X)
# Score against the labels of the test split; `test_Y_tr` is only assigned
# further down, inside the cross-validation loops.
ndcg(test_Y,pred_lr_resample)

In [None]:
# NDCG Score for 2-Fold CV
cv = inp[inp.columns.difference(['Split','country_destination'])]
target = inp['country_destination']
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=2) # Define the split - into 2 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    pred_cv_lr_prob = model_lr_resample.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_lr_prob))
print(res)
print(np.mean(res))

In [None]:
# NDCG Score for 5-Fold CV
cv = inp[inp.columns.difference(['Split','country_destination'])]
target = inp['country_destination']
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=5) # Define the split - into 5 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    pred_cv_lr_prob = model_lr_resample.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_lr_prob))
print(res)
print(np.mean(res))

In [None]:
# NDCG Score for 10-Fold CV
cv = inp[inp.columns.difference(['Split','country_destination'])]
target = inp['country_destination']
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    pred_cv_lr_prob = model_lr_resample.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_lr_prob))
print(res)
print(np.mean(res))

### XGBoost Classifier

In [None]:
# Gradient-boosted trees with a soft-probability multi-class objective.
# NOTE(review): `xgb` is not imported anywhere in the visible notebook —
# presumably `import xgboost as xgb`; confirm.
# NOTE(review): both `eta` (0.2) and `learning_rate` (0.1) are passed; XGBoost
# documents them as aliases, so confirm which value is intended.
xgb_params = dict(
    eta=0.2,
    base_score=0.5,
    colsample_bylevel=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=200,
    nthread=-1,
    objective='multi:softprob',
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=0,
    silent=True,
    subsample=1,
    eval_metric='ndcg@5',
)
model_xgb = xgb.XGBClassifier(**xgb_params)
model_xgb.fit(trn_X, trn_Y)
pred_xgb = model_xgb.predict_proba(test_X)
ndcg(test_Y, pred_xgb)

In [None]:
# NDCG Score for 2-Fold CV
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=2) # Define the split - into 2 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    pred_cv_xgb_prob = model_xgb.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_xgb_prob))
print(res)
print(np.mean(res))

In [None]:
# NDCG Score for 5-Fold CV
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=5) # Define the split - into 5 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    pred_cv_xgb_prob = model_xgb.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_xgb_prob))
print(res)
print(np.mean(res))

In [None]:
# NDCG Score for 10-Fold CV
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    pred_cv_xgb_prob = model_xgb.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_xgb_prob))
print(res)
print(np.mean(res))

### Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters, fitted on the resampled training split.
model_rf = RandomForestClassifier()
model_rf.fit(trn_X, trn_Y)

# Hard labels for the frequency count, class probabilities for the ranking metric.
pred_rf = model_rf.predict(test_X)
pred_rf_prob = model_rf.predict_proba(test_X)

# How often each destination is predicted.
label_counts = collections.Counter(pred_rf)
print(label_counts)

ndcg(test_Y, pred_rf_prob)

In [None]:
# 10-Fold CV - max depth=4
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    model_rf = RandomForestClassifier(n_estimators=600,criterion='gini', max_depth=4)
    pred_cv_rf_prob = model_rf.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_rf_prob))
print(res)
print(np.mean(res))

In [None]:
# 10-Fold CV - max depth=3
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    model_rf = RandomForestClassifier(n_estimators=600,criterion='gini', max_depth=3)
    pred_cv_rf_prob = model_rf.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_rf_prob))
print(res)
print(np.mean(res))

In [None]:
# 10-Fold CV - max depth=2
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    model_rf = RandomForestClassifier(n_estimators=600,criterion='gini', max_depth=2)
    pred_cv_rf_prob = model_rf.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_rf_prob))
print(res)
print(np.mean(res))

In [None]:
# 10-Fold CV - max depth=1
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    # Depth 1 as the cell title states; the value was previously 4, which
    # duplicated the depth-4 experiment.
    model_rf = RandomForestClassifier(n_estimators=600,criterion='gini', max_depth=1)
    pred_cv_rf_prob = model_rf.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_rf_prob))
print(res)
print(np.mean(res))

### Multinomial Naive-Bayes Classifier

In [None]:
# Multinomial naive Bayes fitted on the resampled training split.
# NOTE(review): MultinomialNB is documented for non-negative features; check
# that none of the selected predictors can be negative.
mlNB = MultinomialNB().fit(trn_X, trn_Y)
pred_NB = mlNB.predict_proba(test_X)
ndcg(test_Y, pred_NB)

In [None]:
#10-fold CV
# Integer codes follow sorted label order.
# NOTE(review): this equals the predict_proba column order only when every
# training fold contains all classes — confirm.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
mlNB = MultinomialNB()
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    pred_cv_NB_prob = mlNB.fit(X_train,y_train).predict_proba(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_NB_prob))
print(np.mean(res))

### Neural Network

In [None]:
# 10-fold CV - network with hidden layers of 20 and 10 units
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten


def _build_nn_20_10(input_dim, n_classes):
    """Return a freshly initialised, compiled network: 20 -> 10 -> n_classes units."""
    net = Sequential()
    net.add(Dense(20, input_dim=input_dim, kernel_initializer='uniform', activation='relu'))
    net.add(Dense(10, kernel_initializer='uniform', activation='relu'))
    # NOTE(review): sigmoid output kept as written; softmax is the usual pairing
    # with categorical_crossentropy — confirm the intent.
    net.add(Dense(n_classes, kernel_initializer='uniform', activation='sigmoid'))
    net.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
    return net


# Integer codes follow sorted label order; the one-hot target columns below use the same order.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    # One column per known class, in encoder order, even when a class is absent from this fold.
    train_Y_cat = pd.get_dummies(y_train).reindex(columns=le.classes_, fill_value=0).astype('float32')
    # A new model per fold: reusing one instance would carry over weights learned
    # on earlier folds, whose training data included this fold's test rows.
    model = _build_nn_20_10(X_train.shape[1], len(le.classes_))
    # fit() returns a History object, so predictions are requested from the model itself.
    model.fit(X_train,train_Y_cat,epochs=50, batch_size=10)
    pred_cv_nn_prob = model.predict(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_nn_prob))
print(np.mean(res))


In [None]:
# 10-fold CV - network with hidden layers of 20, 10 and 10 units
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten


def _build_nn_20_10_10(input_dim, n_classes):
    """Return a freshly initialised, compiled network: 20 -> 10 -> 10 -> n_classes units."""
    net = Sequential()
    net.add(Dense(20, input_dim=input_dim, kernel_initializer='uniform', activation='relu'))
    net.add(Dense(10, kernel_initializer='uniform', activation='relu'))
    net.add(Dense(10, kernel_initializer='uniform', activation='relu'))
    # NOTE(review): sigmoid output kept as written; softmax is the usual pairing
    # with categorical_crossentropy — confirm the intent.
    net.add(Dense(n_classes, kernel_initializer='uniform', activation='sigmoid'))
    net.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
    return net


# Integer codes follow sorted label order; the one-hot target columns below use the same order.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    # One column per known class, in encoder order, even when a class is absent from this fold.
    train_Y_cat = pd.get_dummies(y_train).reindex(columns=le.classes_, fill_value=0).astype('float32')
    # A new model per fold: reusing one instance would carry over weights learned
    # on earlier folds, whose training data included this fold's test rows.
    model = _build_nn_20_10_10(X_train.shape[1], len(le.classes_))
    # fit() returns a History object, so predictions are requested from the model itself.
    model.fit(X_train,train_Y_cat,epochs=50, batch_size=10)
    pred_cv_nn_prob = model.predict(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_nn_prob))
print(np.mean(res))


In [None]:
# 10-fold CV - network with hidden layers of 20, 10 and 8 units
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten


def _build_nn_20_10_8(input_dim, n_classes):
    """Return a freshly initialised, compiled network: 20 -> 10 -> 8 -> n_classes units."""
    net = Sequential()
    net.add(Dense(20, input_dim=input_dim, kernel_initializer='uniform', activation='relu'))
    net.add(Dense(10, kernel_initializer='uniform', activation='relu'))
    net.add(Dense(8, kernel_initializer='uniform', activation='relu'))
    # NOTE(review): sigmoid output kept as written; softmax is the usual pairing
    # with categorical_crossentropy — confirm the intent.
    net.add(Dense(n_classes, kernel_initializer='uniform', activation='sigmoid'))
    net.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
    return net


# Integer codes follow sorted label order; the one-hot target columns below use the same order.
le = LabelEncoder().fit(target)
kf = KFold(n_splits=10) # Define the split - into 10 folds
res=[]
for train_index, test_index in kf.split(cv):
    # kf.split yields row positions, so rows are selected with iloc; cv's index
    # labels are not guaranteed to be 0..n-1.
    X_train, X_test = cv.iloc[train_index], cv.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    # One column per known class, in encoder order, even when a class is absent from this fold.
    train_Y_cat = pd.get_dummies(y_train).reindex(columns=le.classes_, fill_value=0).astype('float32')
    # A new model per fold: reusing one instance would carry over weights learned
    # on earlier folds, whose training data included this fold's test rows.
    model = _build_nn_20_10_8(X_train.shape[1], len(le.classes_))
    # fit() returns a History object, so predictions are requested from the model itself.
    model.fit(X_train,train_Y_cat,epochs=50, batch_size=10)
    pred_cv_nn_prob = model.predict(X_test)
    test_Y_tr = le.transform(y_test)
    res.append(ndcg(test_Y_tr,pred_cv_nn_prob))
print(np.mean(res))
