In [3]:
import numpy as np
import pandas as pd
import pandas_td as td
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
from scipy import interp

# Work after Project Update1

df_train = pd.read_csv('train_users_3.csv')
df_sessions=pd.read_csv('sessions.csv')

#replace NaN with -1
av = df_train.age.values
df_train['age'] = np.where(np.logical_or(av<14, av>100), -1, av)
df_train.age.replace(np.nan, '-1' , inplace=True)
df_train.gender.replace(np.nan, '-1' , inplace=True)
df_train.first_affiliate_tracked.replace(np.nan, '-1' , inplace=True)

'''df_sessions['id']=df_sessions['user_id']
df_sessions=df_sessions.drop(['user_id'],axis=1)
df_sessions.action=df_sessions.action.fillna("NaN")
df_sessions.action_type=df_sessions.action_type.fillna("NaN")
df_sessions.action_detail=df_sessions.action_detail.fillna("NaN")
df_sessions.device_type=df_sessions.device_type.fillna("NaN")
df_sessions.secs_elapsed=df_sessions.secs_elapsed.fillna(0.0)

threshold = 100  # Remove items less than or equal to threshold
vc = df_sessions['action'].value_counts()
vals_to_remove = vc[vc <= threshold].index.values
df_sessions['action'].loc[df_sessions['action'].isin(vals_to_remove)] = 'Other' 
'''
#Feature Engineering
print("Features for Users Dataset")
date_acc=pd.to_datetime(df_train['date_account_created'])

df_train['year_account_created'] = date_acc.dt.year   
df_train['month_account_created']=date_acc.dt.month
df_train['day_account_created']=date_acc.dt.day

val=df_train["timestamp_first_active"].values
year=[]
month=[]
day=[]
hour=[]
minutes=[]
seconds=[]
for x in val:
    year.append(x.astype(str)[:4])
    month.append(x.astype(str)[4:6])
    day.append(x.astype(str)[6:8])
    hour.append(x.astype(str)[8:10])
    minutes.append(x.astype(str)[10:12])
    seconds.append(x.astype(str)[12:14])

df_train['year_first_active']=year
df_train['month_first_active']=month
df_train['day_first_active']=day
df_train['hour_first_active']=hour
df_train['minute_first_active']=minutes
df_train['seconds_first_active']=seconds

df_train['day_first_active']= pd.to_numeric(df_train['day_first_active'])
df_train['month_first_active']= pd.to_numeric(df_train['month_first_active'])
df_train['year_first_active']= pd.to_numeric(df_train['year_first_active'])
df_train['hour_first_active']= pd.to_numeric(df_train['hour_first_active'])
df_train['minute_first_active']= pd.to_numeric(df_train['minute_first_active'])
df_train['seconds_first_active']= pd.to_numeric(df_train['seconds_first_active'])
df_train['age']= pd.to_numeric(df_train['age'])


df_train=df_train.drop('date_account_created',1)
df_train=df_train.drop('timestamp_first_active',1)
df_train=df_train.drop('date_first_booking',1)

country_num_dic = {'NDF': 0, 'US': 1, 'other': 2, 'FR': 3, 'IT': 4, 'GB': 5, 'ES': 6, 'CA': 7, 'DE': 8, 'NL': 9, 'AU': 10, 'PT': 11}
num_country_dic = {y:x for x,y in country_num_dic.items()}
df_train['country_destination']= df_train['country_destination'].map(country_num_dic)

'''f_act = df_sessions.action.value_counts().argsort()
f_act_detail = df_sessions.action_detail.value_counts().argsort()
f_act_type = df_sessions.action_type.value_counts().argsort()
f_dev_type = df_sessions.device_type.value_counts().argsort()
      
print("Features for Sessions Dataset")
grpd_sessions=df_sessions.groupby(['id'])
samples=[]
cont=0
for g in grpd_sessions:
    if cont%10000 == 0:
        print("%s of %s users' session data calculated" %(cont, len(grpd_sessions)))
    grp=g[1]
    l=[]
    l.append(g[0])
    l.append(len(grp))
    secs=grp.secs_elapsed.fillna(0).values
    
    c_act = [0] * len(f_act)
    for i,v in enumerate(grp.action.values):
        c_act[f_act[v]] += 1
    _, c_act_uqc = np.unique(grp.action.values, return_counts=True)
    c_act += [len(c_act_uqc), np.mean(c_act_uqc)]
    l = l + c_act
    
    c_act_detail = [0] * len(f_act_detail)
    for i,v in enumerate(grp.action_detail.values):
        c_act_detail[f_act_detail[v]] += 1 
    _, c_act_det_uqc = np.unique(grp.action_detail.values, return_counts=True)
    c_act_detail += [len(c_act_det_uqc), np.mean(c_act_det_uqc)]
    l = l + c_act_detail
    
    #action_type features
    #(how many times each value occurs, numb of unique values, mean and std
    #+ log of the sum of secs_elapsed for each value)
    l_act_type = [0]*len(f_act_type)
    c_act_type = [0]*len(f_act_type)
    for i,v in enumerate(grp.action_type.values):
        l_act_type[f_act_type[v]] += secs[i]   
        c_act_type[f_act_type[v]] += 1  
    l_act_type = np.log(1 + np.array(l_act_type)).tolist()
    _, c_act_type_uqc = np.unique(grp.action_type.values, return_counts=True)
    c_act_type += [len(c_act_type_uqc), np.mean(c_act_type_uqc)]
    l = l + c_act_type + l_act_type    
    
    #device_type features
    #(how many times each value occurs, numb of unique values, mean and std)
    c_dev_type  = [0]*len(f_dev_type)
    for i,v in enumerate(grp.device_type .values):
        c_dev_type[f_dev_type[v]] += 1 
    c_dev_type.append(len(np.unique(grp.device_type.values)))
    _, c_dev_type_uqc = np.unique(grp.device_type.values, return_counts=True)
    c_dev_type += [len(c_dev_type_uqc), np.mean(c_dev_type_uqc)]        
    l = l + c_dev_type    
    
    secs_features=[0]*4
    if(len(secs)>0):
        secs_features[0]=np.log(1+np.sum(secs))
        secs_features[1]=np.log(1+np.mean(secs))
        secs_features[2]=np.log(1+np.median(secs))
        secs_features[3]=np.log(1+np.std(secs))
    l=l+secs_features
    samples.append(l)
    cont+=1

col_names = []    
for i in range(len(samples[0])-1):
    col_names.append('secs_elapsed_' + str(i)) 
samples = np.array(samples)
samp_ar = np.float_(samples[:, 1:])
samp_id = samples[:, 0]   
       
df_agg_sess = pd.DataFrame(samp_ar, columns=col_names)
df_agg_sess['id'] = samp_id
df_agg_sess.index = df_agg_sess.id    

#Merge Users and Sessions
df_train = pd.merge(df_train, df_agg_sess,on ='id', how='left') 
df_train=df_train.fillna(-1)'''

#One-Hot Encoding
print("One hot Encoding")
categorical=['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 
             'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for x in categorical:
    categorical_dummy=pd.get_dummies(df_train[x])
    df_train=df_train.drop([x],axis=1)
    df_train=pd.concat((df_train,categorical_dummy), axis=1)

#Test-Train Split
print("Splitting into Training and Test")
df_labels=df_train['country_destination']
df_labels = label_binarize(df_labels, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
df_labels=pd.DataFrame(df_labels)
df_train_labels=df_labels.loc[0:(np.floor(2*len(df_train)/3))]
df_test_labels=df_labels.loc[(np.floor(2*len(df_train)/3)):]
n_classes = df_labels.shape[1]
      
df_train=df_train.drop("id",axis=1)
df_test=df_train.loc[(np.floor(2*len(df_train)/3)):]
df_train_new=df_train.loc[0:(np.floor(2*len(df_train)/3))]
df_train_features=df_train_new.drop("country_destination",axis=1)
df_test_features=df_test.drop("country_destination",axis=1)

print("Feature Set")
df_train_features.info()
print("building model now")

#Models - Decision Tree, Linear SVC
#model=OneVsRestClassifier(DecisionTreeClassifier(min_samples_leaf=150)).fit(df_train_features,df_train_labels)

#model = svm.LinearSVC(C=1).fit(df_train_features, df_train_labels)

#gnb = GaussianNB()
#model=OneVsRestClassifier(gnb).fit(df_train_features,df_train_labels)

sgd=linear_model.SGDClassifier(loss="log",alpha=.01, n_iter=150)
model=OneVsRestClassifier(sgd).fit(df_train_features, df_train_labels)

#rfc = RandomForestClassifier(n_estimators=15, max_depth=None,min_samples_split=1, random_state=0, min_samples_leaf=10)
#model=(OneVsRestClassifier(rfc)).fit(df_train_features, df_train_labels)

#etc=ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
#model=OneVsRestClassifier(etc).fit(df_train_features, df_train_labels)  

#model=OneVsRestClassifier(AdaBoostClassifier(n_estimators=150, learning_rate=.5, random_state=0)).fit(df_train_features,df_train_labels)
print("printing results without Sessions")      
#Multiclass    
sgd=linear_model.SGDClassifier(loss="log",alpha=.01, n_iter=150)
model=OneVsRestClassifier(sgd).fit(df_train_features, df_train_labels)

y_score=model.decision_function(df_test_features)
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
y_test_labels=np.array(df_test_labels, dtype=pd.Series)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_labels[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    print("ROC for class",i ,"is:",roc_auc[i].round(3))


Features for Users Dataset
One hot Encoding
Splitting into Training and Test
Feature Set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142301 entries, 0 to 142300
Columns: 157 entries, age to wOSBrowser
dtypes: float64(148), int64(9)
memory usage: 170.5 MB
building model now
printing results without Sessions
ROC for class 0 is: 0.72
ROC for class 1 is: 0.694
ROC for class 2 is: 0.602
ROC for class 3 is: 0.601
ROC for class 4 is: 0.569
ROC for class 5 is: 0.614
ROC for class 6 is: 0.57
ROC for class 7 is: 0.616
ROC for class 8 is: 0.642
ROC for class 9 is: 0.621
ROC for class 10 is: 0.647
ROC for class 11 is: 0.504


In [29]:
#sgd=linear_model.SGDClassifier(loss="hinge",alpha=.01, n_iter=250)
#model=OneVsRestClassifier(sgd).fit(df_train_features, df_train_labels)
model=OneVsRestClassifier(AdaBoostClassifier(n_estimators=150, learning_rate=.5, random_state=0)).fit(df_train_features,df_train_labels)

print("printing results")      
#Multiclass           
y_score=model.decision_function(df_test_features)
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
y_test_labels=np.array(df_test_labels, dtype=pd.Series)
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_labels[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    print("ROC for class",i ,"is:",roc_auc[i].round(3))



printing results
ROC for class 0 is: 0.824
ROC for class 1 is: 0.781
ROC for class 2 is: 0.619
ROC for class 3 is: 0.592
ROC for class 4 is: 0.547
ROC for class 5 is: 0.55
ROC for class 6 is: 0.465
ROC for class 7 is: 0.444
ROC for class 8 is: 0.532
ROC for class 9 is: 0.421
ROC for class 10 is: 0.487
ROC for class 11 is: 0.446
