#Importing and Unzipping the data

In [1]:
#Importing packages necessary
import os
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/NumPy deprecation and chained-assignment
# warnings raised by the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Colab-only: mount Google Drive at /content/drive so the data and pickle files
# used by the later cells can be read from it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Use the absolute mount path (the drive is mounted at /content/drive above) so the
# cell can be re-run: a relative 'drive/My Drive' only resolves while the current
# directory is still /content and fails once we have already changed into the drive.
os.chdir('/content/drive/My Drive')

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import calendar
import seaborn as sns
from scipy.sparse import hstack, vstack
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import sklearn
from sklearn.metrics import make_scorer, ndcg_score
import pickle

In [9]:
'''Extract features like weekday, year, month from date features available'''

def extract_weekday(format, datecolumn):
  '''Return the English weekday name of every value in datecolumn.

  Each value is converted with str() and parsed by datetime.strptime using
  the given format string. The result is a plain list in input order.
  '''
  names = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
  result = []
  for value in datecolumn:
    parsed = datetime.strptime(str(value), format)
    # datetime.weekday() is 0 for Monday ... 6 for Sunday, matching `names`.
    result.append(names[parsed.weekday()])
  return result

def extract_month(format, datecolumn):
  '''Return the month name (from calendar.month_name) of every value in datecolumn.

  Values are stringified and parsed with datetime.strptime(format); the
  result is a list in input order.
  '''
  def _month_name(value):
    parsed = datetime.strptime(str(value), format)
    return calendar.month_name[parsed.month]

  return list(map(_month_name, datecolumn))

def extract_year(format, datecolumn):
  '''Return the four-digit year (int) of every value in datecolumn.

  Values are stringified and parsed with datetime.strptime(format).
  '''
  result = []
  for value in datecolumn:
    parsed = datetime.strptime(str(value), format)
    result.append(parsed.year)
  return result

def extract_date(format, datecolumn):
  '''Return the day of the month (int, 1-31) of every value in datecolumn.

  Values are stringified and parsed with datetime.strptime(format).
  '''
  def _day_of_month(value):
    return datetime.strptime(str(value), format).day

  return list(map(_day_of_month, datecolumn))

#https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python
def extract_weeknum(format, datecolumn):
  '''Return the ISO-8601 week number of every value in datecolumn.

  Values are stringified and parsed with datetime.strptime(format). Because
  ISO weeks are used, the first days of January can belong to week 52/53 of
  the previous year.
  '''
  weeks = []
  for value in datecolumn:
    # isocalendar() yields (ISO year, ISO week, ISO weekday).
    _, iso_week, _ = datetime.strptime(str(value), format).isocalendar()
    weeks.append(iso_week)
  return weeks


In [10]:
def difference_of_days(dac,tfa):
  '''Whole days from "timestamp first active" to "date account created".

  dac is parsed as '%Y-%m-%d' (midnight of that day) and tfa as
  '%Y%m%d%H%M%S'; both are stringified first. The value returned is the
  `.days` component of (dac - tfa), which rounds towards negative infinity:
  an account created on the same calendar day as a first activity that
  happened after midnight yields -1, not 0.
  '''
  created_at = datetime.strptime(str(dac), '%Y-%m-%d')
  first_active_at = datetime.strptime(str(tfa), '%Y%m%d%H%M%S')
  return (created_at - first_active_at).days


In [11]:
def session_features(feature_name, feature_values,X):
  '''Add a per-user session feature to X by looking up each row's id.

  feature_values is any mapping-like object with a .get(key, default) method
  keyed by user id. Ids that are not present get the placeholder -1. The
  column is written onto X itself, and X is returned.
  '''
  looked_up = []
  for user_id in X['id']:
    looked_up.append(feature_values.get(user_id, -1))

  X[feature_name] = looked_up
  return X

In [30]:
def impute_nulls(feature,feature_type,model,X,cat_cols):

  '''Replace the "missing" placeholders of one feature with model predictions.

  No fitting happens here: `model` is an already fitted pair that is only
  used for inference.

  Parameters
  ----------
  feature : str
      Column of X to impute.
  feature_type : {'numerical', 'categorical'}
      Selects the placeholder that marks a missing value: -1 for numerical
      features, the string 'nan' for categorical ones.
  model : sequence
      model[0] must expose .transform(frame) (the encoder for cat_cols) and
      model[1] must expose .predict(encoded) (the imputing estimator).
  X : pandas.DataFrame
      Data to impute; modified in place.
  cat_cols : list of str
      Columns of X handed to the encoder as predictors.

  Returns
  -------
  pandas.DataFrame
      The same X object, with the placeholder rows of `feature` overwritten.

  Raises
  ------
  ValueError
      If feature_type is neither 'numerical' nor 'categorical' (this used to
      surface as an UnboundLocalError further down).
  '''

  if feature_type == 'numerical':
    missing_mask = X[feature] == -1
  elif feature_type == 'categorical':
    missing_mask = X[feature] == 'nan'
  else:
    raise ValueError(
      "feature_type must be 'numerical' or 'categorical', got %r" % (feature_type,))

  null_test = X[missing_mask]
  if len(null_test) == 0:
    # Nothing to impute; skip the encoder/estimator entirely.
    return X

  null_test_enc = model[0].transform(null_test[cat_cols])
  imputed_null_test = model[1].predict(null_test_enc)

  # Write back through the boolean mask rather than index labels so the
  # assignment stays correct even if X has duplicate index labels.
  X.loc[missing_mask, feature] = imputed_null_test

  return X

In [33]:
def preprocess(X):

  '''Turn raw user rows into the feature columns used by the encoder and model.

  The frame is modified in place (new columns are added and `age`, `gender`,
  `first_affiliate_tracked` and `signup_flow` are overwritten) and is also
  returned.

  Steps, in order:
    1. Clean `age`: values above 120 or below 10 become missing, except values
       in 1900-1999, which are converted to `year_account_created - value`.
       Missing ages are then filled with 1.
    2. Derive weekday / month / year / day-of-month / ISO-week columns from
       `date_account_created` and `timestamp_first_active`, plus the day gap
       between the two.
    3. Aggregate `sessions.csv` per user (mean/total elapsed seconds, row
       count, number of distinct device types). Users without sessions get -1.
    4. Look up per-user categorical summaries from `Session_dfs.pkl`; users not
       present get 'nan' (categorical) or -1 (numerical).
    5. Replace those placeholders with predictions from the models stored in
       `ImputingModels.pkl`.

  Files read from the current working directory: sessions.csv,
  Session_dfs.pkl, ImputingModels.pkl.

  Parameters
  ----------
  X : pandas.DataFrame
      Raw user rows. Must contain `id`, `age`, `gender`,
      `date_account_created`, `timestamp_first_active`,
      `first_affiliate_tracked`, `signup_flow` and the remaining raw columns
      listed in `cat_cols` below.

  Returns
  -------
  pandas.DataFrame
      The same X object with all engineered columns added.
  '''

  # Implausible ages become missing; 1900-1999 is kept for now and converted
  # further down. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
  X['age'] = [np.nan if ((x>120 and x not in range(1900,2000)) or x<10) else x for x in X['age']]

  X['day_account_created'] = extract_weekday('%Y-%m-%d', X['date_account_created'])
  X['month_account_created'] = extract_month('%Y-%m-%d', X['date_account_created'])
  X['year_account_created'] = extract_year('%Y-%m-%d', X['date_account_created'])
  X['date__account_created'] = extract_date('%Y-%m-%d', X['date_account_created'])
  X['week_account_created'] = extract_weeknum('%Y-%m-%d', X['date_account_created'])

  # Ages in 1900-1999 are converted to "signup year minus that value".
  # Done column-wise with a mask: the previous per-row X.loc[x] lookup was slow
  # and failed when X had duplicate index labels.
  birth_year_rows = X['age'].isin(list(range(1900, 2000)))
  X.loc[birth_year_rows, 'age'] = (
    X.loc[birth_year_rows, 'year_account_created'] - X.loc[birth_year_rows, 'age'])

  X['day_first_active'] = extract_weekday('%Y%m%d%H%M%S', X['timestamp_first_active'])
  X['month_first_active'] = extract_month('%Y%m%d%H%M%S', X['timestamp_first_active'])
  X['year_first_active'] = extract_year('%Y%m%d%H%M%S', X['timestamp_first_active'])
  X['date_first_active'] = extract_date('%Y%m%d%H%M%S', X['timestamp_first_active'])
  X['week_first_active'] = extract_weeknum('%Y%m%d%H%M%S', X['timestamp_first_active'])
  X['days_between_tfa_dac'] = [
    difference_of_days(dac, tfa)
    for dac, tfa in zip(X['date_account_created'], X['timestamp_first_active'])]

  # 1 is the sentinel for "age missing"; age_gender_missing below relies on it.
  X['age'] = X['age'].fillna(1)
  X['gender'] = X['gender'].astype('str')
  # 2 = both age and gender missing, 1 = exactly one of them, 0 = neither.
  X['age_gender_missing'] = [(2 if (x==1 and y=='-unknown-') else (1 if (x==1 or y=='-unknown-') else 0)) for x,y in zip(X['age'],X['gender'])]
  X['first_affiliate_tracked'] = X['first_affiliate_tracked'].astype('str')
  X['signup_flow'] = X['signup_flow'].astype('str')

  sessions = pd.read_csv('sessions.csv')
  # NOTE(review): looks like an upper outlier fence Q3 + 1.5*IQR with Q1=229 and
  # Q3=8444 measured elsewhere - confirm where these constants come from.
  threshold = 8444+(1.5*(8444-229))
  # The comparison also drops rows whose secs_elapsed is missing.
  sessions = sessions[sessions['secs_elapsed']<threshold]
  # Cast once (the original repeated these four casts a second time further down).
  # After the cast a missing device_type is the string 'nan' and is counted by
  # nunique() below.
  sessions['action_type'] = sessions['action_type'].astype('str')
  sessions['action'] = sessions['action'].astype('str')
  sessions['action_detail'] = sessions['action_detail'].astype('str')
  sessions['device_type'] = sessions['device_type'].astype('str')

  elapsed_by_user = sessions.groupby('user_id')['secs_elapsed']
  mean_elapsed_times = elapsed_by_user.mean().to_dict()
  X = session_features('mean_session_time',mean_elapsed_times, X)
  total_elapsed_times = elapsed_by_user.sum().to_dict()
  X = session_features('total_session_time',total_elapsed_times, X)
  sessions_per_user = elapsed_by_user.count().to_dict()
  X = session_features('sessions_per_user',sessions_per_user, X)
  # Converted to a dict like the other lookups (it used to be passed as a Series).
  unique_devices_per_user = sessions.groupby('user_id')['device_type'].nunique().to_dict()
  X = session_features('unique_devices_per_user',unique_devices_per_user, X)
  # Users without sessions (-1) get the column mean; the mean is taken with the
  # -1 placeholders still included, as before. Assigning the result back avoids
  # the chained inplace replace, which does not update X under pandas Copy-on-Write.
  X['unique_devices_per_user'] = X['unique_devices_per_user'].replace(
    -1, X['unique_devices_per_user'].mean())

  # NOTE(security): pickle.load executes code embedded in the file; only load
  # pickles produced by this project.
  with open('Session_dfs.pkl', 'rb') as infile:
    most_occuring_categories, df = pickle.load(infile)

  most_occuring_action = most_occuring_categories['action'].to_dict()
  most_occuring_action_type = most_occuring_categories['action_type'].to_dict()
  most_occuring_action_detail = most_occuring_categories['action_detail'].to_dict()
  most_used_device = most_occuring_categories['device_type'].to_dict()

  X['most_occuring_action'] = [most_occuring_action.get(x,'nan') for x in X['id']]
  X['most_occuring_action_type'] = [most_occuring_action_type.get(x,'nan') for x in X['id']]
  X['most_occuring_action_detail'] = [most_occuring_action_detail.get(x,'nan') for x in X['id']]
  X['most_used_device'] = [most_used_device.get(x,'nan') for x in X['id']]

  # NOTE(review): df.loc[x][1..3] presumably selects the action_type / action /
  # secs_elapsed fields of the user's longest session - confirm against the code
  # that wrote Session_dfs.pkl. Integer [] access on a labelled row is deprecated
  # in recent pandas; switch to column names once the layout is confirmed.
  X['max_elapsed_action_type'] = [df.loc[x][1] if x in df.index else 'nan' for x in X['id']]
  X['max_elapsed_action'] = [df.loc[x][2] if x in df.index else 'nan' for x in X['id']]
  X['max_elapsed_time'] = [df.loc[x][3] if x in df.index else -1 for x in X['id']]

  # Predictor columns handed to every imputing model.
  cat_cols = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
        'first_device_type', 'first_browser', 'day_account_created', 'month_account_created', 'year_account_created',
        'date__account_created', 'week_account_created', 'day_first_active',
        'month_first_active', 'year_first_active', 'date_first_active',
        'week_first_active','age_gender_missing']

  with open('ImputingModels.pkl', 'rb') as infile:
    models = pickle.load(infile)

  sessions_numcols = ['mean_session_time','total_session_time','sessions_per_user','max_elapsed_time']
  sessions_catcols = ['max_elapsed_action_type','max_elapsed_action','most_occuring_action','most_occuring_action_type','most_occuring_action_detail',
                    'most_used_device']
  # models[i] is used for sessions_cols[i], so this order (categorical first,
  # then numerical) must match the order in which the models were pickled.
  sessions_cols = sessions_catcols+sessions_numcols
  for i,feature in enumerate(sessions_cols):
    feature_type = 'categorical' if feature in sessions_catcols else 'numerical'
    X = impute_nulls(feature,feature_type,models[i],X,cat_cols)

  return X

In [34]:
def encode_transform(X):

  '''Encode a preprocessed frame into the feature matrix the model consumes.

  The fitted categorical encoder is read from `Final_Encoder_and_XGBModel.pkl`
  (first element of the pickled triple) in the current working directory. The
  encoded categorical block and the raw numerical block are stacked
  horizontally, categorical columns first.

  Parameters
  ----------
  X : pandas.DataFrame
      Must contain every column listed in `num_cols` and `cat_cols` below.

  Returns
  -------
  scipy sparse matrix
      Result of scipy.sparse.hstack((encoded categoricals, numericals)).
  '''

  num_cols = ['age','days_between_tfa_dac','mean_session_time','total_session_time','sessions_per_user','max_elapsed_time', 'unique_devices_per_user']
  cat_cols = ['gender',
        'signup_method', 'signup_flow', 'language', 'affiliate_channel',
        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
        'first_device_type', 'first_browser',
        'day_account_created', 'month_account_created', 'year_account_created',
        'date__account_created', 'week_account_created', 'day_first_active',
        'month_first_active', 'year_first_active', 'date_first_active',
        'week_first_active', 'age_gender_missing', 'most_occuring_action',
        'most_occuring_action_type', 'most_occuring_action_detail',
        'most_used_device', 'max_elapsed_action_type', 'max_elapsed_action']

  # NOTE(security): pickle.load executes code embedded in the file; only load
  # pickles produced by this project.
  # The context manager closes the handle even if unpickling fails. Only the
  # encoder is needed here; the model and label encoder are ignored.
  with open('Final_Encoder_and_XGBModel.pkl', 'rb') as infile:
    encoder, _unused_model, _unused_label_encoder = pickle.load(infile)

  X_cat = encoder.transform(X[cat_cols])
  X_num = np.array(X[num_cols])
  X_transformed = hstack((X_cat, X_num))

  return X_transformed

In [31]:
def final_fun_1(X):
  '''Predict the five most likely destination countries for each raw user row.

  The rows are preprocessed and encoded, scored with the classifier stored in
  `Final_Encoder_and_XGBModel.pkl`, and the five classes with the highest
  predicted probability are decoded with the pickled label encoder.

  Parameters
  ----------
  X : pandas.DataFrame
      Raw user rows as accepted by preprocess().

  Returns
  -------
  pandas.DataFrame
      Columns `id` and `country`, five rows per input row, ordered from most
      to least probable country.
  '''

  #Preprocess
  X = preprocess(X)
  X_transformed = encode_transform(X)

  # Load the classifier and label encoder here instead of relying on notebook
  # globals named `model` / `le`, which no cell in this notebook defines (the
  # function raised NameError after Restart & Run All).
  # NOTE(security): pickle.load executes code embedded in the file; only load
  # pickles produced by this project.
  with open('Final_Encoder_and_XGBModel.pkl', 'rb') as infile:
    _unused_encoder, model, le = pickle.load(infile)

  y_pred = model.predict_proba(X_transformed)
  # argsort of the negated probabilities gives class indices in descending
  # probability; keep the first five and decode them back to country labels.
  predictions = np.array([le.inverse_transform((-1*x).argsort()[:5]) for x in y_pred]).flatten()
  # Repeat every id five times so ids line up with the flattened predictions.
  test_ids = np.array([[x]*5 for x in X['id']]).flatten()
  submission = pd.DataFrame(np.stack((test_ids, predictions), axis=1), columns=['id','country'])

  return submission

In [32]:
test = pd.read_csv('test_users.csv')
# Take an explicit copy of the three demo rows: the pipeline adds and overwrites
# columns on the frame it receives, which should not happen on a slice of `test`.
X = test.iloc[400:403].copy()
final_fun_1(X)


Unnamed: 0,id,country
0,79vwk4652q,NDF
1,79vwk4652q,US
2,79vwk4652q,other
3,79vwk4652q,FR
4,79vwk4652q,IT
5,1lyute4nlz,NDF
6,1lyute4nlz,US
7,1lyute4nlz,other
8,1lyute4nlz,FR
9,1lyute4nlz,ES


#Function 2

In [53]:
def final_fun_2(X,y):

  '''Score the model on labelled raw rows and return the NDCG.

  The rows are preprocessed and encoded, class probabilities are predicted
  with the classifier stored in `Final_Encoder_and_XGBModel.pkl`, and the
  true labels are one-hot encoded in the label encoder's class order before
  being passed to sklearn.metrics.ndcg_score.

  Parameters
  ----------
  X : pandas.DataFrame
      Raw user rows (without the target) as accepted by preprocess().
  y : array-like of str
      True destination country for every row of X.

  Returns
  -------
  float
      ndcg_score(one-hot true labels, predicted probabilities).

  Raises
  ------
  ValueError
      Raised by OneHotEncoder if y contains a label the label encoder does not know.
  '''

  X = preprocess(X)
  X_transformed = encode_transform(X)

  # Load the classifier and label encoder here instead of relying on notebook
  # globals (`model` is not defined by any cell, `train` only by a later one).
  # NOTE(security): pickle.load executes code embedded in the file; only load
  # pickles produced by this project.
  with open('Final_Encoder_and_XGBModel.pkl', 'rb') as infile:
    _unused_encoder, model, le = pickle.load(infile)

  y_pred = model.predict_proba(X_transformed)

  # predict_proba columns are indexed by the label encoder's classes (final_fun_1
  # decodes column indices with le.inverse_transform), so the one-hot columns
  # are pinned to le.classes_ rather than refitted on a global training frame.
  ohe = OneHotEncoder(categories=[list(le.classes_)])
  y_true = ohe.fit_transform(np.asarray(y).reshape(-1, 1))
  ndcg_scr = ndcg_score(y_true.toarray(), y_pred)
  return ndcg_scr

In [54]:
train = pd.read_csv('train_users_2.csv')
# Same three demo rows for features and target. The target is selected by name
# so the cell does not depend on `country_destination` being the last column.
X = train.drop('country_destination',axis=1).iloc[400:403]
y = train['country_destination'].iloc[400:403].to_numpy()
final_fun_2(X,y)

0.7956176024115139