# Data mining techniques: Assignment 2

This environment makes use of Python version 3.6.3

In [1]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

import sys
import tqdm

# Set data variables

In [2]:
trainPath = "/home/kevin/data_mining/data/assignment2/Split_Data_DM/train.csv"
validPath = "/home/kevin/data_mining/data/assignment2/Split_Data_DM/validation.csv"
testPath = "/home/kevin/data_mining/data/assignment2/Split_Data_DM/test.csv"

traincols = list(pd.read_csv(trainPath,nrows=1).columns.values)
validcols = list(pd.read_csv(validPath,nrows=1).columns.values)
testcols = list(pd.read_csv(testPath,nrows=1).columns.values)

# Feature Engineering

## Extract relevant date_time values

In [3]:
def extract_date_values(path):
    attributes = ["date_time"]
    df = pd.read_csv(path,usecols=attributes)

    df_dates = pd.to_datetime(df["date_time"])

    time_df = pd.DataFrame({"date_year":df_dates.dt.year,
                            "date_month":df_dates.dt.month,
                            "date_hour":df_dates.dt.hour})

    return time_df

# Missing values imputation

In [4]:
# Initialize missing values dictionary
imp = {}

## Origin-destination distance

In [5]:
attributes = ["visitor_location_country_id","prop_country_id","orig_destination_distance"]
df = pd.read_csv(trainPath,usecols=attributes)

max_n = max(np.max(df["visitor_location_country_id"]),np.max(df["prop_country_id"]))

mean_distances = np.empty((max_n,max_n))
mean_distances[:,:] = np.nan

dist_sum = np.zeros((max_n,max_n))
dist_count = np.zeros((max_n,max_n))

dist_nans = np.isnan(df["orig_destination_distance"])

print("Number of NaNs before:",np.sum(dist_nans))

sys.stdout.flush()

i = np.maximum(df["visitor_location_country_id"],df["prop_country_id"]) - 1
j = np.minimum(df["visitor_location_country_id"],df["prop_country_id"]) - 1

# Fill NAs with distance pairs
with tqdm.tqdm(total=np.sum(~dist_nans)) as pbar:
    for n in np.arange(len(df))[~dist_nans]:
        dist_sum[i[n],j[n]] += df.loc[n,"orig_destination_distance"]
        dist_count[i[n],j[n]] += 1
        
        pbar.update()

mean_indices = (dist_count != 0)
mean_distances[mean_indices] = dist_sum[mean_indices] / dist_count[mean_indices]

df.loc[dist_nans,"orig_destination_distance"] = mean_distances[i[dist_nans],j[dist_nans]]
    
dist_nans = np.isnan(df["orig_destination_distance"])
print("Number of NaNs after pairs:",np.sum(dist_nans))

# Fill NAs with medians per visitor location ID
visitor_medians = np.zeros(np.max(df["visitor_location_country_id"]))
visitor_medians[:] = np.nan

vis_loc_ids = df["visitor_location_country_id"] - 1
for n in range(len(visitor_medians)):
    vals = (vis_loc_ids == n) & ~dist_nans
    nans = (vis_loc_ids == n) & dist_nans
    
    dist_median = np.median(df.loc[vals,"orig_destination_distance"])
    df.loc[nans,"orig_destination_distance"] = dist_median
    visitor_medians[n] = dist_median

dist_nans = np.isnan(df["orig_destination_distance"])
print("Number of NaNs after visitor ID imputation:",np.sum(dist_nans))
    
# Fill remaining NAs with the median of all distances
median_distance = np.median(df.loc[~dist_nans,"orig_destination_distance"])
df.loc[dist_nans,"orig_destination_distance"] = median_distance

print("Number of NaNs after:",np.sum(np.isnan(df["orig_destination_distance"])))

imp["mean_distances"] = mean_distances
imp["visitor_medians"] = visitor_medians
imp["median_distance"] = median_distance

del df

Number of NaNs before: 1126222


100%|██████████| 2344622/2344622 [03:17<00:00, 11856.97it/s]


Number of NaNs after pairs: 331833


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Number of NaNs after visitor ID imputation: 15296
Number of NaNs after: 0


# Hotel description columns imputation values

In [6]:
attributes = ["prop_location_score2","srch_query_affinity_score"]
df = pd.read_csv(trainPath,usecols=attributes)

imp["prop_review_score"] = 1
imp["comp"] = 0

nans = np.isnan(df["prop_location_score2"])
imp["prop_location_score2"] = np.percentile(df.loc[~nans,"prop_location_score2"],25)

nans = np.isnan(df["srch_query_affinity_score"])
imp["srch_query_affinity_score"] = np.percentile(df.loc[~nans,"srch_query_affinity_score"],25)

del df

# Export train data to CSV

In [7]:
def construct_full_dataset(path,cols,omitted,imp):
    attributes = [elem for elem in cols if elem not in omitted]
    df = pd.read_csv(path,usecols=attributes)

    # Impute origin-distance values

    dist_nans = np.isnan(df["orig_destination_distance"])
    i = np.maximum(df["visitor_location_country_id"],df["prop_country_id"]) - 1
    j = np.minimum(df["visitor_location_country_id"],df["prop_country_id"]) - 1
    df.loc[dist_nans,"orig_destination_distance"] = imp["mean_distances"][i[dist_nans],j[dist_nans]]
    
    dist_nans = np.isnan(df["orig_destination_distance"])
    vis_loc_ids = df["visitor_location_country_id"] - 1
    for n in range(len(visitor_medians)):
        nans = (vis_loc_ids == n) & dist_nans
        df.loc[nans,"orig_destination_distance"] = imp["visitor_medians"][n]
    
    dist_nans = np.isnan(df["orig_destination_distance"])
    df.loc[dist_nans,"orig_destination_distance"] = imp["median_distance"]
    
    # Impute the rest of the missing values
    
    nans = np.isnan(df["prop_review_score"])
    df.loc[nans,"prop_review_score"] = imp["prop_review_score"]

    nans = np.isnan(df["prop_location_score2"])
    df.loc[nans,"prop_location_score2"] = imp["prop_location_score2"]

    nans = np.isnan(df["srch_query_affinity_score"])
    df.loc[nans,"srch_query_affinity_score"] = imp["srch_query_affinity_score"]

    for n in range(1,9):
        nans = np.isnan(df["comp%i_rate" % (n)])
        df.loc[nans,"comp%i_rate" % (n)] = imp["comp"]

        nans = np.isnan(df["comp%i_inv" % (n)])
        df.loc[nans,"comp%i_inv" % (n)] = imp["comp"]

        nans = np.isnan(df["comp%i_rate_percent_diff" % (n)])
        df.loc[nans,"comp%i_rate_percent_diff" % (n)] = imp["comp"]

    # Extract and append relevant date attributes
    time_df = extract_date_values(path)

    for column in time_df.columns.values:
        df[column] = time_df[column]

    del time_df

    return df

## Construct dataframe to export

In [8]:
"""
# Load dataframe

omitted = ["date_time","visitor_hist_starrating","visitor_hist_adr_usd","gross_bookings_usd"]
attributes = [elem for elem in traincols if elem not in omitted]

train = pd.read_csv(trainPath,usecols=attributes)

# Impute missing values

train["orig_destination_distance"] = train_distances

nans = np.isnan(train["prop_review_score"])
train.loc[nans,"prop_review_score"] = review_imp

nans = np.isnan(train["prop_location_score2"])
train.loc[nans,"prop_location_score2"] = location_score2_imp

nans = np.isnan(train["srch_query_affinity_score"])
train.loc[nans,"srch_query_affinity_score"] = srch_query_affinity_imp

for n in range(1,9):
    nans = np.isnan(train["comp%i_rate" % (n)])
    train.loc[nans,"comp%i_rate" % (n)] = comp_imp
    
    nans = np.isnan(train["comp%i_inv" % (n)])
    train.loc[nans,"comp%i_inv" % (n)] = comp_imp
    
    nans = np.isnan(train["comp%i_rate_percent_diff" % (n)])
    train.loc[nans,"comp%i_rate_percent_diff" % (n)] = comp_imp

# Extract and append relevant date attributes

train_time_df = extract_date_values(path)

for column in train_time_df.columns.values:
    train[column] = train_time_df[column]
    
del train_time_df

train.head()
"""

'\n# Load dataframe\n\nomitted = ["date_time","visitor_hist_starrating","visitor_hist_adr_usd","gross_bookings_usd"]\nattributes = [elem for elem in traincols if elem not in omitted]\n\ntrain = pd.read_csv(trainPath,usecols=attributes)\n\n# Impute missing values\n\ntrain["orig_destination_distance"] = train_distances\n\nnans = np.isnan(train["prop_review_score"])\ntrain.loc[nans,"prop_review_score"] = review_imp\n\nnans = np.isnan(train["prop_location_score2"])\ntrain.loc[nans,"prop_location_score2"] = location_score2_imp\n\nnans = np.isnan(train["srch_query_affinity_score"])\ntrain.loc[nans,"srch_query_affinity_score"] = srch_query_affinity_imp\n\nfor n in range(1,9):\n    nans = np.isnan(train["comp%i_rate" % (n)])\n    train.loc[nans,"comp%i_rate" % (n)] = comp_imp\n    \n    nans = np.isnan(train["comp%i_inv" % (n)])\n    train.loc[nans,"comp%i_inv" % (n)] = comp_imp\n    \n    nans = np.isnan(train["comp%i_rate_percent_diff" % (n)])\n    train.loc[nans,"comp%i_rate_percent_diff" %

In [9]:
#train.to_csv("/home/kevin/data_mining/data/assignment2/full_train.csv",sep=',',index=False)

#del train

# Export data to CSV

## Train data

In [10]:
omitted = ["date_time","visitor_hist_starrating","visitor_hist_adr_usd","gross_bookings_usd"]
train = construct_full_dataset(trainPath,traincols,omitted,imp)

print("NaNs: %i" % np.sum(train.isnull().values))

train.head()

NaNs: 0


Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,ignored_bool,relevance,date_hour,date_month,date_year
0,49765,5,219,219,107500,4,4.5,0,2.56,0.0222,...,0.0,0.0,0.0,0,0,1,0,14,6,2013
1,49765,5,219,219,109533,3,4.0,0,3.43,0.1024,...,0.0,0.0,0.0,0,0,1,0,14,6,2013
2,49765,5,219,219,114103,4,3.5,1,3.95,0.0646,...,0.0,0.0,0.0,0,0,1,0,14,6,2013
3,49765,5,219,219,116661,3,3.0,0,3.53,0.0998,...,0.0,0.0,0.0,0,0,1,0,14,6,2013
4,49765,5,219,219,119030,2,2.5,0,4.03,0.1256,...,0.0,0.0,0.0,0,0,1,0,14,6,2013


In [11]:
train.to_csv("/home/kevin/data_mining/data/assignment2/Split_Data_DM/mod_train.csv",sep=',',index=False)

## Validation data

In [12]:
omitted = ["date_time","visitor_hist_starrating","visitor_hist_adr_usd","gross_bookings_usd"]
valid = construct_full_dataset(validPath,validcols,omitted,imp)

print("NaNs: %i" % np.sum(valid.isnull().values))

valid.head()

NaNs: 0


Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,ignored_bool,relevance,date_hour,date_month,date_year
0,49765,5,219,219,107528,3,3.5,0,2.77,0.0061,...,0.0,0.0,0.0,0,0,1,0,14,6,2013
1,49765,5,219,219,113016,3,3.5,0,3.22,0.0103,...,0.0,0.0,0.0,0,0,1,0,14,6,2013
2,49765,5,219,219,120827,3,3.5,0,4.03,0.0969,...,0.0,0.0,0.0,0,0,1,0,14,6,2013
3,49765,5,219,219,135415,3,4.0,0,3.97,0.0833,...,0.0,0.0,0.0,1,1,0,5,14,6,2013
4,49766,5,219,219,21167,2,3.5,0,2.89,0.514,...,0.0,0.0,0.0,0,0,1,0,14,6,2013


In [13]:
valid.to_csv("/home/kevin/data_mining/data/assignment2/Split_Data_DM/mod_valid.csv",sep=',',index=False)

## Test data

In [14]:
omitted = ["date_time","visitor_hist_starrating","visitor_hist_adr_usd","gross_bookings_usd"]
test = construct_full_dataset(testPath,testcols,omitted,imp)

print("NaNs: %i" % np.sum(test.isnull().values))

test.head()

NaNs: 0


Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,ignored_bool,relevance,date_hour,date_month,date_year
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,...,0.0,0.0,0.0,0,0,1,0,8,4,2013
1,1,12,187,219,10404,4,4.0,1,2.2,0.0149,...,0.0,0.0,0.0,0,0,1,0,8,4,2013
2,1,12,187,219,21315,3,4.5,1,2.2,0.0245,...,0.0,0.0,0.0,0,0,1,0,8,4,2013
3,1,12,187,219,27348,2,4.0,1,2.83,0.0125,...,-1.0,0.0,5.0,0,0,1,0,8,4,2013
4,1,12,187,219,29604,4,3.5,1,2.64,0.1241,...,0.0,0.0,0.0,0,0,1,0,8,4,2013


In [15]:
test.to_csv("/home/kevin/data_mining/data/assignment2/Split_Data_DM/mod_test.csv",sep=',',index=False)