-- PostgreSQL

-- Create the table 'train_expedia' and import the data from the CSV file into it
-- Staging table that receives the raw rows of train.csv.
-- The COPY that loads this table names no column list, so the column order
-- declared here has to match the column order of the CSV file.
CREATE TABLE train_expedia (
    date_time                 TIMESTAMP,
    site_name                 INTEGER,
    posa_continent            INTEGER,
    user_location_country     INTEGER,
    user_location_region      INTEGER,
    user_location_city        INTEGER,
    orig_destination_distance DOUBLE PRECISION,
    user_id                   INTEGER,
    is_mobile                 SMALLINT,
    is_package                INTEGER,
    channel                   INTEGER,
    -- Kept as fixed-width text. NOTE(review): the names suggest check-in /
    -- check-out dates; confirm the file contents before changing the type.
    srch_ci                   CHARACTER(25),
    srch_co                   CHARACTER(25),
    srch_adults_cnt           INTEGER,
    srch_children_cnt         INTEGER,
    srch_rm_cnt               INTEGER,
    srch_destination_id       INTEGER,
    srch_destination_type_id  INTEGER,
    is_booking                SMALLINT,
    cnt                       BIGINT,
    hotel_continent           INTEGER,
    hotel_country             INTEGER,
    hotel_market              INTEGER,
    hotel_cluster             INTEGER
);

-- Bulk-load the CSV file into train_expedia, skipping its header line.
-- COPY ... FROM '<file>' is read by the database server itself, so the path
-- must be readable by the server process. The '/.../' prefix looks like a
-- placeholder for the real directory -- adjust before running.
COPY train_expedia
FROM '/.../train.csv'
WITH (
    FORMAT csv,
    DELIMITER ',',
    HEADER true
);



-- Create a new table 'train' from train_expedia with the total number of bookings and clicks

-- Aggregate the raw rows into one row per (srch_destination_id, hotel_cluster):
--   bookings = sum of the is_booking flag
--   clicks   = number of rows in the group with a non-NULL is_booking
-- The statement is terminated with ';' so that it is parsed separately from
-- whatever follows it in the script.
CREATE TABLE train AS
SELECT
    srch_destination_id,
    hotel_cluster,
    SUM(is_booking)   AS bookings,
    COUNT(is_booking) AS clicks
FROM train_expedia
GROUP BY
    srch_destination_id,
    hotel_cluster
ORDER BY
    srch_destination_id,
    hotel_cluster;

-- Create a new table 'train1' from train and calculate the relevance
-- Add a relevance score to every row of train:
--   relevance = bookings + 0.05 * clicks
-- train holds exactly one row per (srch_destination_id, hotel_cluster), so the
-- score is computed per row; wrapping the terms in SUM() and grouping by every
-- selected column would only re-aggregate single-row groups.
-- The 0.05 literal makes the result NUMERIC, not an integer.
CREATE TABLE train1 AS
SELECT
    srch_destination_id,
    hotel_cluster,
    bookings,
    clicks,
    bookings + 0.05 * clicks AS relevance
FROM train
ORDER BY
    srch_destination_id,
    hotel_cluster;

-- Create a new table 'train2' from train1 with the 3 columns we need.
-- This is the final table we are going to use in Python.

-- Keep only the three columns the Python part reads, ordered so that the most
-- relevant clusters of each destination come first.
-- No GROUP BY is needed: train1 already has one row per
-- (srch_destination_id, hotel_cluster), so grouping by every selected column
-- would not merge any rows.
CREATE TABLE train2 AS
SELECT
    srch_destination_id,
    hotel_cluster,
    relevance
FROM train1
ORDER BY
    srch_destination_id,
    relevance DESC;

-- Result set to export as train2.csv.
-- Columns are listed explicitly and the rows are ordered here because a table
-- scan without ORDER BY has no guaranteed row order.
SELECT
    srch_destination_id,
    hotel_cluster,
    relevance
FROM train2
ORDER BY
    srch_destination_id,
    relevance DESC;

-- Export train2 as CSV (train2.csv) to use in Python

# Thanks to "dune_dweller" :)
import numpy as np
import pandas as pd

# Read the CSV we created using PostgreSQL, and the srch_destination_id column of the test CSV
train = pd.read_csv('train2.csv')
test = pd.read_csv(
    'test.csv',
    # Load a single column, stored as 32-bit integers.
    usecols=['srch_destination_id'],
    dtype={'srch_destination_id': np.int32},
)

# Function to find most popular hotel clusters by destination
def most_popular(group, n_max=5):
    """Return the most relevant hotel clusters of one group as a string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to rank; must provide 'relevance' and 'hotel_cluster' columns.
    n_max : int, default 5
        Maximum number of clusters to return.

    Returns
    -------
    str
        Up to ``n_max`` hotel_cluster values, highest relevance first,
        separated by single spaces ('' for an empty group).
    """
    relevance = group['relevance'].values
    hotel_cluster = group['hotel_cluster'].values
    # argsort is ascending, so reverse it to put the highest relevance first.
    top_clusters = hotel_cluster[np.argsort(relevance)[::-1]][:n_max]
    # Join the values explicitly instead of slicing np.array_str(): the array
    # repr pads every number to a common width (e.g. ' 5 91  3') and depends
    # on numpy's print options, so it is not a reliable serialisation format.
    return ' '.join(str(cluster) for cluster in top_clusters)

# Get most popular hotel clusters for all destinations.
# One row per srch_destination_id (the index); the single column
# 'hotel_cluster' holds whatever most_popular returns for that destination.
most_pop = pd.DataFrame(
    train.groupby(['srch_destination_id']).apply(most_popular)
).rename(columns={0: 'hotel_cluster'})
most_pop.head()

# Predict for test data
# Left join: every test row is kept; destinations that have no entry in
# most_pop end up with a missing hotel_cluster.
test = pd.merge(
    test,
    most_pop,
    left_on='srch_destination_id',
    right_index=True,
    how='left',
)
test.head()

# Number of test rows that got no prediction from the join
test['hotel_cluster'].isna().sum()

# Fill nas with hotel clusters that are most popular overall
# The five hotel clusters with the largest total relevance over all
# destinations, highest first.
most_pop_all = train.groupby('hotel_cluster')['relevance'].sum().nlargest(5).index
# Join the labels explicitly instead of slicing np.array_str(): the array repr
# pads numbers to a common width and depends on numpy's print options.
most_pop_all = ' '.join(str(cluster) for cluster in most_pop_all)
most_pop_all

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the attribute `test.hotel_cluster` is a chained in-place update, which is not
# guaranteed to modify `test` itself (it does not under pandas copy-on-write).
test['hotel_cluster'] = test['hotel_cluster'].fillna(most_pop_all)
test.hotel_cluster.isnull().sum()

# Save the submission file
# Write a two-column CSV: the frame's index under the header 'id', followed by
# the hotel_cluster column under its own name.
test['hotel_cluster'].to_csv(
    'postgresql_and_pandas.csv',
    index_label='id',
    header=True,
)