In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import sklearn
import sklearn.metrics

from sklearn.decomposition import PCA
from sklearn.preprocessing import label_binarize


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

### Abstract

In this Jupyter Notebook we will be demonstrating the lifecycle of the Data Science & Analysis pipeline through a Machine Learning example **(TODO: _TYPES OF ML - RANDOM FOREST, BAYES, SVM, ETC_)**. The pipeline can be broken down into five core components, each of which will be covered in this report. [The dataset we will be using is from a retired competition on Kaggle.com, courteously provided by Expedia.](https://www.kaggle.com/c/expedia-hotel-recommendations) Here is the challenge we are tasked with:
            
_Expedia is a proprietary Search Engine for Hotel & Travel bookings. Every time a user initiates a search session, data is kept about the user's search. Data is also provided from Expedia's in-house hotel clustering algorithm. This data groups similar hotels into 100 different clusters, which simplifies the machine learning portion of our task into a classification problem. Thus, given all this data, design a classifier to predict which hotel cluster a user will end up booking a hotel from, based on the user's search patterns._

### Data Collection

- Retrieve the test, training, and destinations datasets from [Kaggle](https://www.kaggle.com/c/expedia-hotel-recommendations/data).
    - test.csv - Test data that we will evaluate our classifiers with. Contains n = 2528243 observations, but we were able to load all the data without sampling it (unlike the training dataset).
    - train.csv - Data about searches for hotels by users. These data are our feature vectors, complete with hotel cluster classification (labeling). Contains n = 37670293 observations. We'll need to sample these data down to a manageable size.
    - destinations.csv - Hotel feature data. Contains 149 anonymized features for n = 62106 hotels.
 



In [None]:
# Load the complete test set. test.csv is read in 1e6-row chunks and the pieces are
# then stitched back together into a single DataFrame.

# Collect the chunks in a plain list. Seeding the list with an empty DataFrame is
# unnecessary: it contributes no rows and only forces concat to align the real
# chunks against a column-less frame.
chunks = list(pd.read_csv('../input/test.csv', sep=',', chunksize=int(1e6)))

test_df = pd.concat(chunks)
test_df.head()

In [None]:
# summary statistics for test data set, as reported by DataFrame.describe()
# (displayed as the cell output)

test_df.describe()

In [None]:
# Draw a reproducible random sample of (at most) `sample_size` rows from train.csv.
# The file is streamed in 1e6-row chunks and only a small sample of each chunk is kept,
# so the full training set never has to be held in memory at once.

sample_size = int(1e5)                         # total rows wanted in the sample
bin_num = 38                                   # number of chunks the quota is spread over
bin_sample_size = int(sample_size // bin_num)  # rows drawn from each chunk

# One seeded generator shared by every draw: Restart & Run All reproduces the same
# sample, while each chunk still gets its own row positions.
sample_rng = np.random.RandomState(42)

# Track the outstanding quota separately so `sample_size` keeps its configured value.
rows_left = sample_size
chunks = []

for chunk in pd.read_csv('../input/train.csv', sep=',', chunksize=int(1e6)):
    if rows_left <= 0:
        # Quota already filled: stop here rather than request a zero/negative sample
        # when the file yields more chunks than `bin_num` anticipated.
        break

    # Once fewer than two bins' worth of rows remain, take everything that is left so
    # the remainder of the integer division above is not lost.
    if rows_left < 2 * bin_sample_size:
        take = rows_left
    else:
        take = bin_sample_size

    # A short chunk cannot supply more rows than it contains.
    take = min(take, len(chunk))

    chunks.append(chunk.sample(take, random_state=sample_rng))
    rows_left -= take


train_df = pd.concat(chunks)
train_df.head()

In [None]:
# summary statistics for the sampled training data set (displayed as the cell output)
train_df.describe()

In [None]:
# Load destinations.csv, reading it in 1e6-row chunks like the other loaders.
# The chunks are concatenated once at the end: calling pd.concat inside the loop
# re-copies every previously loaded row on each iteration.

dest_df = pd.concat(list(pd.read_csv('../input/destinations.csv', sep=',', chunksize=int(1e6))))

# summary statistics for the destinations data (displayed as the cell output)
dest_df.describe()

### Explore the Data

**TODO:**
+ _Randomly sample from train multiple times_
+ _Sample training and testing from original training_
+ _Stuff about data & attributes, some graphs_
+ _Convert date to date time, compare test & train dates/split by date_
+ _Compare user IDs in test & train, only use user ids in train that are in test_
+ _Benchmark our classifiers with premade packages_

In [None]:
# Baseline "classifier": always predict the single most frequent hotel cluster
# found in the training sample.

# actual cluster of every sampled row, as a one-column frame named 'actual'
freq_df = train_df['hotel_cluster'].copy().to_frame(name='actual')

# the most popular clusters (head() keeps the first five), most frequent first
cluster_counts = train_df['hotel_cluster'].value_counts()
freq_predictions = cluster_counts.head().index.tolist()

# constant prediction repeated once per row; `temp` keeps the same values as a plain list
n_rows = freq_df.shape[0]
top_cluster = freq_predictions[0]
temp = [top_cluster] * n_rows
freq_df['freq_predict'] = [top_cluster] * n_rows
freq_df.head()


In [None]:
# Score the constant most-frequent-cluster baseline against the actual labels.

cluster_ids = list(range(0, 100))  # the full class list handed to label_binarize: 0..99

# binarize the actual clusters (y) and the constant predictions (x) over the same classes
y = label_binarize(freq_df['actual'].values, classes=cluster_ids)
x = label_binarize(temp, classes=cluster_ids)

sklearn.metrics.average_precision_score(y, x)

#### Linear Regression

- _Check r coeff to see if any variables have interactions_

In [None]:
# check Pearson's Correlation Coefficient (r) values for every search feature against each other
#
# Only the notable pairs are kept (0.4 <= |r| < 1); every other cell is blanked to NaN
# so the stronger relationships stand out in the displayed matrix.

# Restrict to numeric/bool columns explicitly: recent pandas raises on string columns
# inside DataFrame.corr() instead of silently skipping them.
rcorrs = train_df.select_dtypes(include=['number', 'bool']).corr()

abs_r = rcorrs.abs()
weak = abs_r < 0.4                        # too weak to be of interest
perfect = np.isclose(abs_r.values, 1.0)   # |r| of 1: a feature against itself, or an exact duplicate

# Mask in one vectorised step. Writing into the rows yielded by iterrows() is not
# guaranteed to modify the frame, and the np.NaN alias no longer exists in NumPy 2.
# Cells that are already NaN fail both tests and simply stay NaN.
rcorrs = rcorrs.mask(weak | perfect)

rcorrs

#### Destinations - Principal Component Analysis

In [None]:
# Set up a PCA that keeps 5 principal components (per the section heading, this is
# meant for the destinations data; it is not fitted here).
# The scikit-learn keyword is `n_components` (plural); `n_component` raises TypeError.
dest_pca = PCA(n_components=5)


In [None]:
# scratch check: the integers 1 through 99, printed as a list
p = [*range(1, 100)]
print(p)