In [None]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Have IPython echo the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Load the review ratings from CSV and preview the first rows.
reviews_csv = 'Kaggle_Data/tripadvisor_review_from_uci.csv'
travel_data = pd.read_csv(reviews_csv)

travel_data.head()

In [None]:
# Count the missing values in each column of the loaded data.
travel_data.isna().sum()


In [None]:
# Import the visualisation libraries (plotly, seaborn) and set up plotly's
# offline mode for notebook output.
import plotly
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): the wildcard import pulls every plotly.graph_objs name into the
# notebook namespace; the visible cells only use go.Heatmap.
from plotly.graph_objs import *
# NOTE(review): `tools` and `sns` are not referenced in the visible cells.
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Pairwise correlations between the numeric columns of the review data.
# Restricting to numeric dtypes keeps .corr() from failing on text columns
# (e.g. an identifier column, if present): from pandas 2.0 on, non-numeric
# columns are no longer dropped silently and raise instead.
correl = travel_data.select_dtypes(include='number').corr()

# In a plotly Heatmap, rows of z follow y and columns of z follow x, so the
# row labels (index) go on y and the column labels on x.
trace = go.Heatmap(z=correl.values,
                   x=correl.columns.values,
                   y=correl.index.values)
data = [trace]
plotly.offline.iplot(data, filename='basic-heatmap')

In [None]:
# List the column names of the loaded review data.
travel_data.columns

In [None]:
# Feature columns used for the analysis.
# 'Mall' is left out because its standard deviation is too small.
cols = [
    'Church', 'Resort', 'Beach', 'Park', 'Theatre',
    'Museum', 'Zoo', 'Restaurant', 'Pubs/bars',
]

from sklearn.preprocessing import StandardScaler

# Standardize the selected columns and keep the result as a DataFrame with the
# same column names and row index as the source data. The fitted scaler `sc`
# is reused later to transform our own data point.
sc = StandardScaler()
travel_features = travel_data[cols]
subset_data = pd.DataFrame(
    sc.fit_transform(travel_features),
    columns=travel_features.columns,
    index=travel_features.index,
)

In [None]:
# Preview the standardized feature columns.
subset_data.head()

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the standardized ratings and keep the projected
# coordinates in a frame that shares the row index of subset_data.
# A 3-component variant was also considered: PCA(n_components = 3).
pca = PCA(n_components=2)
pca_coords = pca.fit_transform(subset_data)
data_pca = pd.DataFrame(pca_coords, index=subset_data.index)

In [None]:
# Scatter of all rows in the plane of the first two principal components,
# each point annotated with its row label so individual rows can be looked up.
fig, ax = plt.subplots()
ax.plot(data_pca[0], data_pca[1], 'ro', alpha=0.5)

# Walk (label, PC1, PC2) triples directly. The previous data_pca.loc[i][k] form
# used a positional counter as an index label (only valid for a default
# 0..n-1 index) and built a throw-away row Series twice per point.
for row_label, pc1, pc2 in zip(data_pca.index, data_pca[0], data_pca[1]):
    ax.text(pc1, pc2, str(row_label))

ax.set(xlabel='Principal component 1',
       ylabel='Principal component 2',
       title='Reviews projected onto the first two principal components')

plt.show()

In [None]:
# Display the PCA-projected coordinates (one row per row of subset_data).
data_pca

In [None]:
# Load our own ratings from CSV and preview them.
# 'Mall' is not used: our Mall value was too small compared to the other data
# points (that column's standard deviation is low), so it was dropped.
own_ratings_csv = 'Kaggle_Data/own_trip_advisor_avg.csv'
our_data = pd.read_csv(own_ratings_csv)

our_data.head()

In [None]:
# Scale our own ratings with the scaler already fitted on the review data
# (transform only, no re-fitting), keeping column names and row index.
our_features = our_data[cols]
test_data = pd.DataFrame(
    sc.transform(our_features),
    columns=our_features.columns,
    index=our_features.index,
)
test_data.head()

In [None]:
# Project our scaled ratings into the PCA space fitted on the review data.
our_components = pca.transform(test_data)
our_pca = pd.DataFrame(our_components, index=test_data.index)
our_pca.head()

In [None]:
# Euclidean distance, in PCA space, from every row of data_pca to our own point.
# The component columns are taken from our_pca, so this works for any
# n_components (no separate 2-D / 3-D formula) and ignores the 'dist' column
# itself when the cell is re-run.
# our_pca is expected to hold a single row, which broadcasts against all rows.
component_cols = list(our_pca.columns)
offsets = data_pca[component_cols].values - our_pca[component_cols].values
data_pca['dist'] = np.sqrt((offsets ** 2).sum(axis=1))

data_pca

In [None]:
# Smallest distance to our own point, and the index label of the row attaining it.
# idxmin() returns the index label, which is what .loc lookups on data_pca expect.
# np.argmin on a Series yields a position instead (and older pandas versions
# returned a label), so the two only agree for a default 0..n-1 index.
data_pca['dist'].min()
data_pca['dist'].idxmin()

In [None]:
# Inspect one row of data_pca by index label.
# NOTE(review): 829 is hard-coded; presumably the nearest row reported by the
# min/argmin cell on some run. Confirm, or look it up from the computed result.
data_pca.loc[829]

In [None]:
# Inspect another row of data_pca by index label.
# NOTE(review): 795 is hard-coded and its origin is not shown in this notebook;
# possibly the nearest row under a different configuration. Confirm.
data_pca.loc[795]

In [None]:
# Per-column mean and standard deviation of the selected features plus 'Mall'
# (the column excluded from `cols` for its small standard deviation).
stats_cols = cols + ['Mall']
travel_data[stats_cols].mean()
travel_data[stats_cols].std()