## EXPLORATORY DATA ANALYSIS

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from scipy.stats import t
import xgboost as xgb
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.preprocessing import binarize

In [None]:
# Load the tourism CSV. low_memory=False makes pandas parse the file in a
# single pass, so each column's dtype is inferred from all of its values.
df = pd.read_csv(
    'tourism.csv',
    encoding='unicode-escape',
    dtype=None,
    low_memory=False,
)

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Swap the month-range quarter labels for short Q1-Q4 codes wherever they
# occur in the frame (the two lists are matched position by position).
month_ranges = ['January-March', 'April-June', 'July-September', 'October-December']
quarter_codes = ['Q1', 'Q2', 'Q3', 'Q4']
df = df.replace(month_ranges, quarter_codes)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts; memory_usage='deep' inspects object
# columns to report their real memory footprint.
df.info(memory_usage='deep')

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
 # Store the label-valued columns as pandas categoricals.
 categorical_cols = ['quarter', 'dur_stay', 'mode', 'mode_detail',
                     'purpose', 'sex', 'age', 'package']
 df[categorical_cols] = df[categorical_cols].astype('category')

In [None]:
# List the distinct values in the year column (useful for spotting labels
# that are not plain numbers).
df.year.unique()

In [None]:
# Normalise the '2019P' label to plain '2019'.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on `df.year` mutates an intermediate Series and
# is not guaranteed to update `df` (it is a no-op under pandas Copy-on-Write).
df['year'] = df['year'].replace('2019P', '2019')

In [None]:
# Cast the year column to 64-bit integers so it can be compared numerically.
df = df.astype({'year': 'int64'})

In [None]:
# Re-check dtypes and deep memory usage after the preceding conversions.
df.info(memory_usage='deep')

In [None]:
# Give the three measure columns short, attribute-friendly names.
col_names = {
    'Visits (000s)': 'visits',
    'Spend (£m)': 'spend',
    'Nights (000s)': 'nights',
}
df = df.rename(columns=col_names)

In [None]:
# Show the distinct quarter labels currently in the column.
df.quarter.unique()

In [None]:
# Preview the first rows after renaming the columns.
df.head()

In [None]:
# Spend divided by visits for each row; undefined results (NaN) become 0.
# The filled Series is assigned straight to the column instead of calling
# `df.per_tourist.fillna(0, inplace=True)`, which works on an intermediate
# Series and is not guaranteed to update `df` (no-op under Copy-on-Write).
# NOTE(review): a row with visits == 0 and spend > 0 yields inf, which
# fillna(0) does not touch — confirm whether such rows exist.
df['per_tourist'] = (df['spend'] / df['visits']).fillna(0)

In [None]:
# Preview the frame with the new per_tourist column.
df.head()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# First rows of the frame.
df.head()

In [None]:
# Last rows of the frame.
df.tail()

In [None]:
# First rows of the frame, shown again before the question list.
df.head()

1. What was the year with highest number of tourists?
2. Which market had the highest total quarterly visits to London?
3. What is the most popular mode of travel in 2017 for tourists from France?
4. Which market was the source of the most visits for Business purpose in 2017?
5. What are the portions of visits according to purpose during each year?
6. Which market had the highest number of visitors who traveled by sea?
7. What age group traveled to London the most in 2014-2018 period?
8. Is there any relationship between purpose of visit and money spent in the UK? Which graphs could best explain this relationship?
9. Is there any significant difference in the use of Air transportation between female and male visitors?
10. Explain overnight visits for the decade 2002-2012 based on purpose of visit. Which graph do you think best illustrates the case?



## What was the year with highest number of tourists?

In [None]:
# Sum visits per year and keep the five largest totals.
yearly_visits = df.groupby('year')['visits'].sum()
top5_yearly = yearly_visits.sort_values(ascending=False).head(5)

# Highlight the top year in brown; the other four bars are pink.
bar_colors = ('brown',) + ('pink',) * 4

sns.set()
_ = plt.figure(figsize=(8, 5))
_ = plt.bar(top5_yearly.index, top5_yearly.values, width=0.6, color=bar_colors)
_ = plt.xlabel('Years')
_ = plt.ylabel('Number of Visitors (in 000s)')
_ = plt.title('Years with Highest Number of Visitors')

In [None]:
# Column dtypes and non-null counts.
df.info()

## Which market had the highest total quarterly visits to London?

In [None]:
# Visits summed per (quarter, market), pivoted so each market is a column.
quarterly = df.groupby(['quarter', 'market']).visits.sum().unstack()

# For every quarter: the market with the largest total, and that total.
countries = quarterly.idxmax(axis=1).rename('Country')
visits = quarterly.max(axis=1).rename('Visits')

# Build one "<quarter> <market>" label per bar.
quarterly = pd.concat([countries, visits], axis=1).reset_index()
quarterly['QCountry'] = quarterly.quarter.astype(str) + ' ' + quarterly.Country.astype(str)
quarterly = quarterly.drop(columns=['quarter', 'Country'])

_ = plt.figure(figsize=(8, 5))
_ = plt.bar(quarterly.QCountry, quarterly.Visits, color='brown', width=0.6)
_ = plt.xlabel('Countries')
_ = plt.ylabel('Number of Visitors')
# Title previously repeated the word "Highest".
_ = plt.title('Countries with Highest Quarterly Visits')

## What is the most popular mode of travel in 2017 for tourists from France?

In [None]:
#Question3
# Visits from France in 2017, summed per travel mode, largest first.
france_2017 = df[(df['year'] == 2017) & (df['market'] == 'France')]
grouped = france_2017.groupby('mode').visits.sum()
sorted_group = pd.DataFrame(grouped.sort_values(ascending=False))

# Grouping on a categorical column can emit modes with no visits; drop them
# so the pie has no empty wedges.
sorted_group = sorted_group[sorted_group['visits'] > 0]

# Slice labels come from the sorted index, so each wedge always carries the
# name of the mode it measures (a fixed label list would silently mislabel
# slices whenever the size order differs from the list order).
mode_labels = [str(mode) for mode in sorted_group.index]
# One explode offset per slice, whatever the number of modes.
my_explode = (0,) * len(sorted_group)
my_colors = ['lightblue', 'lightsteelblue', 'silver']

_ = plt.pie(sorted_group['visits'], labels=mode_labels, autopct='%1.1f%%',
            radius=1.5, explode=my_explode, shadow=True, colors=my_colors)

_ = plt.title('Modes of Visits')
_ = plt.axis('equal')

## Which market was the source of the most visits for Business purpose in 2017?

In [None]:
# Business visits in 2017, broken down by market (rows) and sex (columns).
business2017 = df[(df.purpose == 'Business') & (df.year == 2017)]
visits_by_market_sex = business2017.groupby(['market', 'sex'])['visits'].sum().unstack()
business2017_market = pd.DataFrame(visits_by_market_sex, columns=['Female', 'Male', 'NA'])

# Total counts Female + Male only; the 'NA' column is discarded.
business2017_market['Total'] = business2017_market['Female'] + business2017_market['Male']
business2017_market = business2017_market.drop(columns='NA')
business2017_top5market = business2017_market.sort_values(by='Total', ascending=False).head()

# Female bars are drawn over the Total bars on the same axes.
top5_markets = business2017_top5market.index
_ = plt.figure(figsize=(8, 5))
_ = plt.bar(top5_markets, business2017_top5market['Total'], color='darkblue')
_ = plt.bar(top5_markets, business2017_top5market['Female'], color='red')

## What are the portions of visits according to purpose during each year?

In [None]:
#Question 5
# Visits per (year, purpose) with 'All' margins on both axes.
purpose_by_year = pd.pivot_table(
    df,
    values='visits',
    index='year',
    columns='purpose',
    aggfunc=np.sum,
    margins=True,
)

# Express every cell as a percentage of its row total, rounded to 1 decimal.
purpose_share = purpose_by_year.div(purpose_by_year['All'], axis=0).mul(100).round(1)

# The margins were only needed as denominators; remove them from both axes.
purpose_by_year = purpose_share.drop('All', axis=1).drop('All', axis=0)

purpose_by_year


In [None]:
_ = plt.figure(figsize=(12, 10))

# One line per purpose. Each line is labelled with its own column name so the
# legend cannot drift out of sync with the plotting order (a hand-written
# legend tuple in a different order mislabels the series).
for column in purpose_by_year.columns:
    _ = plt.plot(purpose_by_year.index, purpose_by_year[column],
                 marker='o', label=str(column))

_ = plt.xlabel('Year')
_ = plt.ylabel('Proportions of Visitors')
_ = plt.title('% of Visitors(#) During 2002-2019')
_ = plt.legend()
plt.show()


In [None]:
_ = plt.figure(figsize=(12, 10))

# Plot only the three selected purposes. The label is taken from the column
# being drawn: the lines are plotted as Business, Holiday, VFR, so a legend
# written as ('VFR', 'Holiday', 'Business') swaps the Business and VFR names.
for column in ['Business', 'Holiday', 'VFR']:
    _ = plt.plot(purpose_by_year.index, purpose_by_year[column],
                 marker='o', label=column)

_ = plt.xlabel('Year')
_ = plt.ylabel('Proportions of Visitors')
_ = plt.title('% of Visitors(#) During 2002-2019')
_ = plt.legend()
plt.show()

## Which market had the highest number of visitors traveled by the sea?

In [None]:
# Rows whose travel mode is 'Sea', summed per market; keep the five largest.
arrived_by_sea = df[df['mode'] == 'Sea']
sea_totals = arrived_by_sea.groupby('market')['visits'].sum()
sea_visit = sea_totals.sort_values(ascending=False).head()

_ = plt.figure(figsize=(8, 5))
_ = plt.bar(sea_visit.index, sea_visit.values, color='blue')
_ = plt.xlabel('Countries')
_ = plt.ylabel('Number of Visitors')
_ = plt.title('Visits by Sea')
plt.show()

## What age group traveled to London the most in 2014-2018 period?

In [None]:
# Keep 2014 through 2018 (year is an integer column, so the inclusive range
# matches "> 2013 and < 2019"), then sum visits per age group.
in_period = df['year'].between(2014, 2018)
travel14_18 = df[in_period]
age14_18 = travel14_18.groupby('age')['visits'].sum()

_ = plt.figure(figsize=(8, 5))
_ = plt.bar(age14_18.index, age14_18.values, color='darkblue')
_ = plt.xlabel('Age Groups')
_ = plt.ylabel('Number of Visitors (in 000s)')
_ = plt.title('Visits by Age Groups')
plt.show()

In [None]:
# Total spend and total visits for each purpose of visit.
spent_by_purpose = df.groupby('purpose')['spend'].sum()
visits_by_purpose = df.groupby('purpose')['visits'].sum()

# Side-by-side bars: spend on the left axes, visits on the right.
fig, axs = plt.subplots(1, 2, figsize=(14, 6))
spend_ax, visits_ax = axs
spend_ax.bar(spent_by_purpose.index, spent_by_purpose.values, color='brown')
visits_ax.bar(visits_by_purpose.index, visits_by_purpose.values, color='brown')

## STATISTICAL ANALYSIS

In [None]:
# Shapiro-Wilk normality test on spend, judged at the 0.05 level.
data = df['spend']
stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))
verdict = 'Probably Gaussian' if p > 0.05 else 'Probably not Gaussian'
print(verdict)

In [None]:
# Two-sided confidence interval for the mean of spend, using Student's t.
spend = df['spend']
n_obs = len(spend)

mean_val = np.mean(spend)
# ddof=1 gives the sample standard deviation, which is what the t-based
# standard error assumes.
std = np.std(spend, ddof=1)

degree_f = n_obs - 1
alpha = 0.05  # 1 - alpha = 95% confidence
# Use the upper-tail quantile so `critical` is positive. t.ppf(0.05, ...) is
# negative, which swaps the two limits, and it is a one-tailed quantile.
# `stats.t` is referenced through the module so the cell keeps working after
# a later cell rebinds the bare name `t`.
critical = stats.t.ppf(1 - alpha / 2, degree_f)
SE = std / np.sqrt(n_obs)
up_lim = mean_val + (critical * SE)
low_lim = mean_val - (critical * SE)
print(up_lim)
print(low_lim)

H0: Males and females spend the same

Ha: Males and females spend differently

In [None]:
# Independent two-sample t-test of spend: Male rows versus Female rows.
sample0 = df[df['sex'] == 'Male'].spend
sample1 = df[df['sex'] == 'Female'].spend
# The statistic is bound to `t_stat`, not `t`: `t` is the imported
# scipy.stats.t distribution, and rebinding it breaks any cell that calls
# t.ppf when re-run afterwards.
t_stat, p = ttest_ind(sample0, sample1)

print("p_value is: ", p)
if p<0.05:
    print('reject null hypothesis')
else:
    # A large p-value is a lack of evidence against H0, not proof of it.
    print('fail to reject null hypothesis')

H0: Different age groups spend the same

Ha: Different age groups spend differently

In [None]:
#ANOVA test
# One-way ANOVA of spend across the seven listed age bands.
df_anova = df[['age', 'spend']]
ages = pd.unique(df_anova.age.values)
# Map every age label found in the data to that group's spend values.
d_data = {age: df_anova.loc[df_anova.age == age, 'spend'] for age in ages}

age_bands = ['0 -15', '16-24', '25-34', '35-44', '45-54', '55-64', '65+']
F, p = stats.f_oneway(*[d_data[band] for band in age_bands])
print("p-value for significance is: ", p)
verdict = "reject null hypothesis" if p<0.05 else "accept null hypothesis"
print(verdict)

### CLASSIFICATION

#### XG BOOST

In [None]:
# Total spend per market; show the three largest.
spend_by_market = df.groupby('market')['spend'].sum()
spend_by_market.sort_values(ascending=False).head(3)

In [None]:
# Restrict the modelling data to the three selected markets.
countries_of_interest = ['USA', 'Germany', 'France']
in_selected_market = df['market'].isin(countries_of_interest)
model_data = df[in_selected_market]

In [None]:
# (rows, columns) left after filtering to the selected markets.
model_data.shape

In [None]:
# Drop the columns not used as model inputs and renumber the rows from 0.
unused_cols = ['market', 'sample', 'per_tourist', 'mode']
model_data = model_data.drop(columns=unused_cols).reset_index(drop=True)
# Year becomes a string so it is handled as a label rather than a number.
model_data['year'] = model_data['year'].astype(str)

In [None]:
# Preview the prepared modelling frame.
model_data.head()

In [None]:
# Dtypes and non-null counts of the modelling frame.
model_data.info()

In [None]:
# Class balance of the target as proportions. Read it from the frame itself:
# `y` is only assigned further down, so using it here raises NameError when
# the notebook is run top to bottom.
model_data['package'].value_counts(normalize=True)

In [None]:
# Encode the target labels as 0/1.
classes = {'Independent': 0, 'Non-Independent': 1}
# Only the target column is recoded. A frame-wide replace would also rewrite
# any other column that happened to contain one of these strings.
model_data['package'] = model_data['package'].replace(classes)

In [None]:
# Keep the encoded target as a categorical column.
model_data = model_data.astype({'package': 'category'})

In [None]:
# Target: the package column. Features: every other column, with
# label-valued columns expanded into indicator columns.
y = model_data['package']
feature_frame = model_data.drop(columns='package')
X = pd.get_dummies(feature_frame)

In [None]:
# (rows, columns) of the encoded feature matrix.
X.shape

In [None]:
# Length of the target vector.
y.shape

In [None]:
# Hold out 25% of the rows for testing; the seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# XGBoost classifier with library defaults.
clf = xgb.XGBClassifier()

In [None]:
# Candidate values for each XGBoost hyper-parameter, keyed by parameter name.
param_grid = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
# Randomized search over param_grid: 100 sampled settings, 3-fold CV,
# fixed seed, all CPU cores.
xgb_search_options = dict(n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
xgb_random = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    **xgb_search_options,
)

In [None]:
# Run the hyper-parameter search on the training split.
xgb_random.fit(X_train, y_train)

In [None]:
# Class predictions for the held-out rows from the fitted search object.
pred = xgb_random.predict(X_test)

In [None]:
# Share of held-out rows classified correctly. Called through the imported
# `metrics` module: the bare name `accuracy_score` is never imported in this
# notebook and raises NameError.
metrics.accuracy_score(y_test, pred)

In [None]:
# Hyper-parameter setting that scored best in the search.
xgb_random.best_params_

In [None]:
# Confusion matrix of true labels (rows) against predictions (columns).
# Called through the imported `metrics` module: the bare name
# `confusion_matrix` is never imported in this notebook (NameError).
matrix = metrics.confusion_matrix(y_test, pred)
matrix

In [None]:
# Number of held-out rows.
y_test.shape

In [None]:
# Unpack the 2x2 confusion matrix row by row:
# [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
TN, FP, FN, TP = matrix.ravel()

In [None]:
# Show a local screenshot inline at 500x500.
# NOTE(review): the absolute path exists only on the author's machine, so
# this cell fails elsewhere — consider a path relative to the notebook.
Image(filename= "/Users/azarnajafli/Pictures/Screenshots/Screen Shot 2020-05-11 at 2.02.16 AM.png", width = 500, height = 500)

In [None]:
# Precision of the default-threshold predictions on the held-out rows.
precision = metrics.precision_score(y_test, pred)
print(precision)

In [None]:
# Specificity (true-negative rate): TN / (TN + FP).
specifity = TN/(TN + FP)
print(specifity)

In [None]:
# Recall of the default-threshold predictions on the held-out rows.
recall = metrics.recall_score(y_test, pred)
print(recall)

In [None]:
# Second column of predict_proba: the predicted probability of class 1.
y_pred_prob = xgb_random.predict_proba(X_test)[:, 1]

In [None]:
# ROC curve for the tuned classifier.
# The curve must be built from continuous scores. Hard 0/1 predictions give a
# single operating point, so the plotted "curve" and its AUC would describe
# one threshold only. The class-1 probabilities are recomputed here so the
# cell does not depend on the shape of any earlier variable.
roc_scores = xgb_random.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, roc_scores)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall at every probability threshold for class 1.
probs_y = xgb_random.predict_proba(X_test)
positive_scores = probs_y[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)

pr_auc = metrics.auc(recall, precision)

plt.title("Precision-Recall vs Threshold Chart")
# precision/recall have one more entry than thresholds; drop the last point.
for curve, line_style, curve_name in ((precision, "b--", "Precision"),
                                      (recall, "r--", "Recall")):
    plt.plot(thresholds, curve[: -1], line_style, label=curve_name)
plt.ylabel("Precision, Recall")
plt.xlabel("Threshold")
plt.legend(loc="lower left")
plt.ylim([0,1])

In [None]:
# Inspect the predicted class-1 probabilities.
y_pred_prob

In [None]:
# Reshape to a single column (n, 1), the 2-D layout passed to binarize below.
y_pred_prob = y_pred_prob.reshape(-1, 1)

In [None]:
# Classify as 1 when the predicted probability exceeds 0.3.
# `threshold` is keyword-only in current scikit-learn, so passing 0.3
# positionally raises TypeError.
y_pred_class = binarize(y_pred_prob, threshold=0.3)

In [None]:
# Inspect the thresholded class predictions.
y_pred_class

In [None]:
# Confusion matrix for the thresholded predictions.
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# Recall for the thresholded predictions.
metrics.recall_score(y_test, y_pred_class)

In [None]:
# Precision for the thresholded predictions.
metrics.precision_score(y_test, y_pred_class)

In [None]:
# Accuracy of the thresholded predictions, to sit next to the recall and
# precision computed for them in this section.
# Two fixes: `accuracy_score` is reached through the imported `metrics`
# module (the bare name is never imported), and the thresholded
# `y_pred_class` is scored instead of the default-threshold `pred`, whose
# accuracy was already reported earlier. ravel() flattens the (n, 1) column.
metrics.accuracy_score(y_test, y_pred_class.ravel())

### RANDOM FOREST

In [None]:
# Search space for the random-forest hyper-parameter search.

# Number of trees: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was deprecated and then removed from scikit-learn; for classifiers
# it meant the same as 'sqrt', so listing both only duplicated one setting
# and now fails parameter validation. 'log2' is the other string option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10..110 in steps of 10, plus unlimited.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

In [None]:
# Random-forest classifier with library defaults.
clf_rf = RandomForestClassifier()

In [None]:
# Randomized search over random_grid: 100 sampled settings, 3-fold CV,
# fixed seed, all CPU cores.
rf_search_options = dict(n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(
    estimator=clf_rf,
    param_distributions=random_grid,
    **rf_search_options,
)

In [None]:
# Run the random-forest hyper-parameter search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Class predictions for the held-out rows from the fitted search object.
preds_rf = rf_random.predict(X_test)

In [None]:
# Confusion matrix for the random-forest predictions.
metrics.confusion_matrix(y_test, preds_rf)