# Data mining techniques: Assignment 2

This environment makes use of Python version 3.6.3

In [1]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

import sys
import tqdm

# Set data variables

In [3]:
trainPath = "/home/kevin/data_mining/data/assignment2/train.csv"
testPath = "/home/kevin/data_mining/data/assignment2/test.csv"

traincols = list(pd.read_csv(trainPath,nrows=1).columns.values)
testcols = list(pd.read_csv(testPath,nrows=1).columns.values)

# Feature Engineering

## Extract relevant date_time values

In [5]:
attributes = ["date_time"]
df = pd.read_csv(trainPath,usecols=attributes)

df_dates = pd.to_datetime(df["date_time"])

train_time_df = pd.DataFrame({"date_year":df_dates.dt.year,
                        "date_month":df_dates.dt.month,
                        "date_hour":df_dates.dt.hour})

del df,df_dates

train_time_df.head()

Unnamed: 0,date_hour,date_month,date_year
0,8,4,2013
1,8,4,2013
2,8,4,2013
3,8,4,2013
4,8,4,2013


# Missing values imputation

## Origin-destination distance

In [6]:
attributes = ["visitor_location_country_id","prop_country_id","orig_destination_distance"]
df = pd.read_csv(trainPath,usecols=attributes)

max_n = max(np.max(df["visitor_location_country_id"]),np.max(df["prop_country_id"]))

mean_distances = np.empty((max_n,max_n))
mean_distances[:,:] = np.nan

dist_sum = np.zeros((max_n,max_n))
dist_count = np.zeros((max_n,max_n))

dist_nans = np.isnan(df["orig_destination_distance"])

print("Number of NaNs before:",np.sum(dist_nans))

sys.stdout.flush()

i = np.maximum(df["visitor_location_country_id"],df["prop_country_id"]) - 1
j = np.minimum(df["visitor_location_country_id"],df["prop_country_id"]) - 1

# Fill NAs with distance pairs
with tqdm.tqdm(total=np.sum(~dist_nans)) as pbar:
    for n in np.arange(len(df))[~dist_nans]:
        dist_sum[i[n],j[n]] += df.loc[n,"orig_destination_distance"]
        dist_count[i[n],j[n]] += 1
        
        pbar.update()

mean_indices = (dist_count != 0)
mean_distances[mean_indices] = dist_sum[mean_indices] / dist_count[mean_indices]

df.loc[dist_nans,"orig_destination_distance"] = mean_distances[i[dist_nans],j[dist_nans]]
    
dist_nans = np.isnan(df["orig_destination_distance"])
print("Number of NaNs after pairs:",np.sum(dist_nans))

# Fill NAs with medians per visitor location ID
visitor_medians = np.zeros(np.max(df["visitor_location_country_id"]))
vis_loc_ids = df["visitor_location_country_id"] - 1
for n in range(len(visitor_medians)):
    vals = (vis_loc_ids == n) & ~dist_nans
    nans = (vis_loc_ids == n) & dist_nans
    
    dist_median = np.median(df.loc[vals,"orig_destination_distance"])
    df.loc[nans,"orig_destination_distance"] = dist_median
    visitor_medians[n] = dist_median

dist_nans = np.isnan(df["orig_destination_distance"])
print("Number of NaNs after visitor ID imputation:",np.sum(dist_nans))
    
# Fill remaining NAs with the median of all distances
median_distance = np.median(df.loc[~dist_nans,"orig_destination_distance"])
df.loc[dist_nans,"orig_destination_distance"] = median_distance

# Store the computed training distances
train_distances = df["orig_destination_distance"]

print("Number of NaNs after:",np.sum(np.isnan(df["orig_destination_distance"])))

del df

Number of NaNs before: 1607782


100%|██████████| 3350565/3350565 [04:39<00:00, 11980.54it/s]


Number of NaNs after pairs: 471728


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Number of NaNs after visitor ID imputation: 20210
Number of NaNs after: 0


# Hotel description columns imputation values

In [7]:
attributes = ["prop_location_score2","srch_query_affinity_score"]
df = pd.read_csv(trainPath,usecols=attributes)

review_imp = 1
location_score2_imp = np.percentile(df["prop_location_score2"],25)
srch_query_affinity_imp = np.percentile(df["srch_query_affinity_score"],25)

del df

  interpolation=interpolation)


# Export data to CSV

## Construct dataframe to export

In [8]:
omitted = ["date_time","visitor_hist_starrating","visitor_hist_adr_usd","gross_bookings_usd"]
attributes = [elem for elem in traincols if elem not in omitted]

train = pd.read_csv(trainPath,usecols=attributes)

train.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,...,,,,,,0.0,0.0,,0,0
1,1,12,187,219,10404,4,4.0,1,2.2,0.0149,...,,,,,,0.0,0.0,,0,0
2,1,12,187,219,21315,3,4.5,1,2.2,0.0245,...,,,,,,0.0,0.0,,0,0
3,1,12,187,219,27348,2,4.0,1,2.83,0.0125,...,,,,,,-1.0,0.0,5.0,0,0
4,1,12,187,219,29604,4,3.5,1,2.64,0.1241,...,,,,,,0.0,0.0,,0,0


## Impute missing values

In [9]:
train["orig_destination_distance"] = train_distances

del train_distances

In [10]:
nans = np.isnan(train["prop_review_score"])
train.loc[nans,"prop_review_score"] = review_imp

In [11]:
nans = np.isnan(train["prop_location_score2"])
train.loc[nans,"prop_location_score2"] = location_score2_imp

In [12]:
nans = np.isnan(train["srch_query_affinity_score"])
train.loc[nans,"srch_query_affinity_score"] = srch_query_affinity_imp

In [13]:
for n in range(1,9):
    nans = np.isnan(train["comp%i_rate" % (n)])
    train.loc[nans,"comp%i_rate" % (n)] = 0
    
    nans = np.isnan(train["comp%i_inv" % (n)])
    train.loc[nans,"comp%i_inv" % (n)] = 0
    
    nans = np.isnan(train["comp%i_rate_percent_diff" % (n)])
    train.loc[nans,"comp%i_rate_percent_diff" % (n)] = 0

In [14]:
train.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,1,12,187,219,10404,4,4.0,1,2.2,0.0149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,1,12,187,219,21315,3,4.5,1,2.2,0.0245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,1,12,187,219,27348,2,4.0,1,2.83,0.0125,...,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,5.0,0,0
4,1,12,187,219,29604,4,3.5,1,2.64,0.1241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


## Append date attributes

In [15]:
for column in train_time_df.columns.values:
    train[column] = train_time_df[column]
    
del train_time_df

train.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,date_hour,date_month,date_year
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,...,0.0,0.0,0.0,0.0,0.0,0,0,8,4,2013
1,1,12,187,219,10404,4,4.0,1,2.2,0.0149,...,0.0,0.0,0.0,0.0,0.0,0,0,8,4,2013
2,1,12,187,219,21315,3,4.5,1,2.2,0.0245,...,0.0,0.0,0.0,0.0,0.0,0,0,8,4,2013
3,1,12,187,219,27348,2,4.0,1,2.83,0.0125,...,0.0,0.0,-1.0,0.0,5.0,0,0,8,4,2013
4,1,12,187,219,29604,4,3.5,1,2.64,0.1241,...,0.0,0.0,0.0,0.0,0.0,0,0,8,4,2013


## Export data frame to CSV file

In [28]:
train.to_csv("/home/kevin/data_mining/data/assignment2/full_train.csv",sep=',',index=False)

In [34]:
outfile = open("/home/kevin/data_mining/data/assignment2/full_train.csv")

for n in range(2):
    print(outfile.readline())
    
outfile.close()

srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,booking_bool,date_hour,date_month,date_year

1,12,187,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,,5070.248555555556,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,