# Data mining techniques: Assignment 2

This environment makes use of Python version 3.6.3

In [1]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

import sys
import tqdm

# Set data variables

In [2]:
trainPath = "/home/kevin/data_mining/data/assignment2/Split_Data_DM/train.csv"
validPath = "/home/kevin/data_mining/data/assignment2/Split_Data_DM/validation.csv"
testPath = "/home/kevin/data_mining/data/assignment2/Split_Data_DM/test.csv"

traincols = list(pd.read_csv(trainPath,nrows=1).columns.values)
validcols = list(pd.read_csv(validPath,nrows=1).columns.values)
testcols = list(pd.read_csv(testPath,nrows=1).columns.values)

# Feature Engineering

## Extract relevant date_time values

In [3]:
def extract_date_values(path):
    attributes = ["date_time"]
    df = pd.read_csv(path,usecols=attributes)

    df_dates = pd.to_datetime(df["date_time"])

    time_df = pd.DataFrame({"date_year":df_dates.dt.year,
                            "date_month":df_dates.dt.month,
                            "date_hour":df_dates.dt.hour})

    return time_df

def retrieve_relevance(path):
    attributes = ["booking_bool","click_bool"]
    df = pd.read_csv(path,usecols=attributes)
    
    relevance = np.zeros(len(df))
    
    relevance[df["booking_bool"].values == 1] = 5
    relevance[(df["click_bool"].values == 1) & (df["booking_bool"].values == 0)] = 1
    
    return relevance

## Export engineered full data set

In [4]:
"""
train_full = "/home/kevin/data_mining/data/assignment2/initial_train.csv"
cols = list(pd.read_csv(train_full,nrows=1).columns.values)

omitted = ["date_time"]
attributes = [elem for elem in cols if elem not in omitted]
df = pd.read_csv(train_full,usecols=attributes)

time_df = extract_date_values(train_full)

for column in time_df.columns.values:
    df[column] = time_df[column]

del time_df

relevance = retrieve_relevance(train_full)

df["relevance"] = relevance

del relevance

df.head(10)
"""

'\ntrain_full = "/home/kevin/data_mining/data/assignment2/initial_train.csv"\ncols = list(pd.read_csv(train_full,nrows=1).columns.values)\n\nomitted = ["date_time"]\nattributes = [elem for elem in cols if elem not in omitted]\ndf = pd.read_csv(train_full,usecols=attributes)\n\ntime_df = extract_date_values(train_full)\n\nfor column in time_df.columns.values:\n    df[column] = time_df[column]\n\ndel time_df\n\nrelevance = retrieve_relevance(train_full)\n\ndf["relevance"] = relevance\n\ndel relevance\n\ndf.head(10)\n'

In [5]:
#df.to_csv("/home/kevin/data_mining/data/assignment2/full_train.csv",sep=',',index=False)

# Missing values imputation

In [6]:
# Initialize missing values dictionary
imp = {}

## Origin-destination distance

In [7]:
attributes = ["visitor_location_country_id","prop_country_id","orig_destination_distance"]
df = pd.read_csv(trainPath,usecols=attributes)

max_n = max(np.max(df["visitor_location_country_id"]),np.max(df["prop_country_id"]))

mean_distances = np.empty((max_n,max_n))
mean_distances[:,:] = np.nan

dist_sum = np.zeros((max_n,max_n))
dist_count = np.zeros((max_n,max_n))

dist_nans = np.isnan(df["orig_destination_distance"])

print("Number of NaNs before:",np.sum(dist_nans))

sys.stdout.flush()

i = np.maximum(df["visitor_location_country_id"],df["prop_country_id"]) - 1
j = np.minimum(df["visitor_location_country_id"],df["prop_country_id"]) - 1

# Fill NAs with distance pairs
with tqdm.tqdm(total=np.sum(~dist_nans)) as pbar:
    for n in np.arange(len(df))[~dist_nans]:
        dist_sum[i[n],j[n]] += df.loc[n,"orig_destination_distance"]
        dist_count[i[n],j[n]] += 1
        
        pbar.update()

mean_indices = (dist_count != 0)
mean_distances[mean_indices] = dist_sum[mean_indices] / dist_count[mean_indices]

df.loc[dist_nans,"orig_destination_distance"] = mean_distances[i[dist_nans],j[dist_nans]]
    
dist_nans = np.isnan(df["orig_destination_distance"])
print("Number of NaNs after pairs:",np.sum(dist_nans))

# Fill NAs with medians per visitor location ID
visitor_medians = np.zeros(np.max(df["visitor_location_country_id"]))
visitor_medians[:] = np.nan

vis_loc_ids = df["visitor_location_country_id"] - 1
for n in range(len(visitor_medians)):
    vals = (vis_loc_ids == n) & ~dist_nans
    nans = (vis_loc_ids == n) & dist_nans
    
    dist_median = np.median(df.loc[vals,"orig_destination_distance"])
    df.loc[nans,"orig_destination_distance"] = dist_median
    visitor_medians[n] = dist_median

dist_nans = np.isnan(df["orig_destination_distance"])
print("Number of NaNs after visitor ID imputation:",np.sum(dist_nans))
    
# Fill remaining NAs with the median of all distances
median_distance = np.median(df.loc[~dist_nans,"orig_destination_distance"])
df.loc[dist_nans,"orig_destination_distance"] = median_distance

print("Number of NaNs after:",np.sum(np.isnan(df["orig_destination_distance"])))

imp["mean_distances"] = mean_distances
imp["visitor_medians"] = visitor_medians
imp["median_distance"] = median_distance

del df

Number of NaNs before: 1127012


100%|██████████| 2343823/2343823 [03:10<00:00, 12299.85it/s]


Number of NaNs after pairs: 331951


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Number of NaNs after visitor ID imputation: 15353
Number of NaNs after: 0


# Hotel description columns imputation values

In [8]:
attributes = ["prop_location_score2","srch_query_affinity_score"]
df = pd.read_csv(trainPath,usecols=attributes)

imp["prop_review_score"] = 0
imp["comp"] = 0

nans = np.isnan(df["prop_location_score2"])
imp["prop_location_score2"] = np.percentile(df.loc[~nans,"prop_location_score2"],25)

nans = np.isnan(df["srch_query_affinity_score"])
imp["srch_query_affinity_score"] = np.percentile(df.loc[~nans,"srch_query_affinity_score"],25)

del df

# Export train data to CSV

In [9]:
def construct_full_dataset(path,cols,omitted,imp):
    attributes = [elem for elem in cols if elem not in omitted]
    df = pd.read_csv(path,usecols=attributes)

    # Impute origin-distance values

    dist_nans = np.isnan(df["orig_destination_distance"])
    i = np.maximum(df["visitor_location_country_id"],df["prop_country_id"]) - 1
    j = np.minimum(df["visitor_location_country_id"],df["prop_country_id"]) - 1
    df.loc[dist_nans,"orig_destination_distance"] = imp["mean_distances"][i[dist_nans],j[dist_nans]]
    
    dist_nans = np.isnan(df["orig_destination_distance"])
    vis_loc_ids = df["visitor_location_country_id"] - 1
    for n in range(len(visitor_medians)):
        nans = (vis_loc_ids == n) & dist_nans
        df.loc[nans,"orig_destination_distance"] = imp["visitor_medians"][n]
    
    dist_nans = np.isnan(df["orig_destination_distance"])
    df.loc[dist_nans,"orig_destination_distance"] = imp["median_distance"]
    
    # Impute the rest of the missing values
    
    nans = np.isnan(df["prop_review_score"])
    df.loc[nans,"prop_review_score"] = imp["prop_review_score"]

    nans = np.isnan(df["prop_location_score2"])
    df.loc[nans,"prop_location_score2"] = imp["prop_location_score2"]

    nans = np.isnan(df["srch_query_affinity_score"])
    df.loc[nans,"srch_query_affinity_score"] = imp["srch_query_affinity_score"]

    for n in range(1,9):
        nans = np.isnan(df["comp%i_rate" % (n)])
        df.loc[nans,"comp%i_rate" % (n)] = imp["comp"]

        nans = np.isnan(df["comp%i_inv" % (n)])
        df.loc[nans,"comp%i_inv" % (n)] = imp["comp"]

        nans = np.isnan(df["comp%i_rate_percent_diff" % (n)])
        df.loc[nans,"comp%i_rate_percent_diff" % (n)] = imp["comp"]

    # Extract and append relevant date attributes
    #time_df = extract_date_values(path)

    #for column in time_df.columns.values:
    #    df[column] = time_df[column]

    #del time_df

    return df

# Export data to CSV

## Train data

In [10]:
omitted = []
train = construct_full_dataset(trainPath,traincols,omitted,imp)

print("NaNs: %i" % np.sum(train.isnull().values))

train.head()

NaNs: 9962291


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,date_hour,date_month,date_year,relevance,ignored_bool
0,49766,5,219,,,219,6642,3,4.0,0,...,0.0,0.0,0,,0,14,6,2013,0,1
1,49766,5,219,,,219,10827,0,4.0,1,...,0.0,0.0,0,,0,14,6,2013,0,1
2,49766,5,219,,,219,21167,2,3.5,0,...,0.0,0.0,0,,0,14,6,2013,0,1
3,49766,5,219,,,219,33591,4,4.0,1,...,0.0,0.0,0,,0,14,6,2013,0,1
4,49766,5,219,,,219,45731,3,4.5,0,...,0.0,0.0,0,,0,14,6,2013,0,1


In [11]:
train.to_csv("/home/kevin/data_mining/data/assignment2/Split_Data_DM/mod_train.csv",sep=',',index=False)

## Validation data

In [12]:
omitted = []
valid = construct_full_dataset(validPath,validcols,omitted,imp)

print("NaNs: %i" % np.sum(valid.isnull().values))

valid.head()

NaNs: 2134884


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,date_hour,date_month,date_year,relevance,ignored_bool
0,49766,5,219,,,219,43888,3,4.0,1,...,0.0,0.0,0,,0,14,6,2013,0,1
1,49766,5,219,,,219,123391,4,5.0,1,...,0.0,0.0,0,,0,14,6,2013,0,1
2,49768,5,219,,,219,10162,3,5.0,1,...,0.0,0.0,0,,0,15,6,2013,0,1
3,49768,5,219,,,219,76560,3,3.5,1,...,0.0,0.0,0,,0,15,6,2013,0,1
4,49768,5,219,,,219,98616,4,4.0,1,...,0.0,0.0,0,,0,15,6,2013,0,1


In [13]:
valid.to_csv("/home/kevin/data_mining/data/assignment2/Split_Data_DM/mod_valid.csv",sep=',',index=False)

## Test data

In [14]:
omitted = []
test = construct_full_dataset(testPath,testcols,omitted,imp)

print("NaNs: %i" % np.sum(test.isnull().values))

test.head()

NaNs: 2134622


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,date_hour,date_month,date_year,relevance,ignored_bool
0,1,12,187,,,219,893,3,3.5,1,...,0.0,0.0,0,,0,8,4,2013,0,1
1,1,12,187,,,219,10404,4,4.0,1,...,0.0,0.0,0,,0,8,4,2013,0,1
2,1,12,187,,,219,21315,3,4.5,1,...,0.0,0.0,0,,0,8,4,2013,0,1
3,1,12,187,,,219,27348,2,4.0,1,...,0.0,5.0,0,,0,8,4,2013,0,1
4,1,12,187,,,219,29604,4,3.5,1,...,0.0,0.0,0,,0,8,4,2013,0,1


In [15]:
test.to_csv("/home/kevin/data_mining/data/assignment2/Split_Data_DM/mod_test.csv",sep=',',index=False)