#Preparation

#####Importing libraries and mounting Google Drive

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from datetime import timedelta
import random
from sklearn.metrics import accuracy_score
import csv

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


#####Loading the dataset as Panda DataFrame

In [2]:
raw_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/FlickrCrawling/Sarbagita/CSV/Clustering Result/2009-2018_Clustered using DBSCAN (0.4, 1800).csv')

# changing the dates_taken format to datetime
raw_df['dates_taken'] = pd.to_datetime(raw_df['dates_taken'])
print("Shape of the DataFrame: ", raw_df.shape)
raw_df.head()

Shape of the DataFrame:  (32331, 8)


Unnamed: 0,photo_id,Source.Name,owner_nsid,owner_location,dates_taken,location_latitude,location_longitude,cluster
0,6763436949,2009_dirty.csv,21779304@N06,AUSTRALIA,2009-10-01 12:10:00,-8.647217,115.127716,0
1,6762697459,2009_dirty.csv,21779304@N06,AUSTRALIA,2009-10-01 12:11:00,-8.647217,115.127716,0
2,6762697655,2009_dirty.csv,21779304@N06,AUSTRALIA,2009-10-01 12:14:00,-8.647217,115.127716,0
3,6762698495,2009_dirty.csv,21779304@N06,AUSTRALIA,2009-10-01 12:15:00,-8.647217,115.127716,0
4,6762698685,2009_dirty.csv,21779304@N06,AUSTRALIA,2009-10-01 12:16:00,-8.647217,115.127716,0


#####Data Preparation

In [5]:
# delete outlier that doesn't belong in any cluster
clean_df = raw_df[raw_df.cluster != -1] 
print("Shape of dataframe after deleting outlier that doesn't belong in any cluster:", clean_df.shape)

# sorting the data by owner_nsid and dates_taken
clean_df = clean_df.sort_values(['owner_nsid', 'dates_taken'], ascending=[True, True])
clean_df = clean_df.reset_index(drop=True) # reseting the index

# deleting data from photos taken with timedelta less than defined dwell time
clean_df["check"] = ""
min_dwell_time = timedelta(hours = 1, minutes = 20)

for i, row in clean_df.iloc[1:].iterrows():
	check_val = True
	if clean_df.owner_nsid[i] == clean_df.owner_nsid[i-1] and clean_df.cluster[i] == clean_df.cluster[i-1] and clean_df.dates_taken[i] - clean_df.dates_taken[i-1] < min_dwell_time:
		check_val = False
	clean_df.at[i, 'check'] = check_val

clean_df = clean_df[clean_df.check != False]
clean_df = clean_df.drop('check', axis=1)
print("Shape of dataframe after removing photo taken at similar time:", clean_df.shape)

# delete data from owner_nsid who only have one data
byowner = clean_df.groupby('owner_nsid')
clean_df = byowner.filter(lambda x: len(x) > 1)
clean_df = clean_df.reset_index(drop=True) # reseting the index
print("Shape of dataframe after deleting data from owner_nsid who only have one photo:", clean_df.shape)

Shape of dataframe after deleting outlier that doesn't belong in any cluster: (30352, 8)
Shape of dataframe after removing photo taken at similar time: (6852, 8)
Shape of dataframe after deleting data from owner_nsid who only have one photo: (6082, 8)


#Markov Chain

#####Creating transition list

In [6]:
max_dwell_time = timedelta(days = 3)
trip_name = 1
clean_df['trip'] = ''
transition_list = []

# identifying trip by finding photo taken between less than 4 days
for i, row in clean_df.iloc[1:].iterrows():
	if clean_df.owner_nsid[i] == clean_df.owner_nsid[i-1] and clean_df.dates_taken[i] - clean_df.dates_taken[i-1] < max_dwell_time:
		clean_df.loc[i, 'trip'] = trip_name
		trip_name += 1
	elif clean_df.owner_nsid[i] != clean_df.owner_nsid[i-1]:
		clean_df.loc[i, 'trip'] = ''

# getting the transition list from identified trip
for i, row in clean_df.iloc[1:].iterrows():
	if clean_df.loc[i, 'trip'] != '':
		transition = (clean_df.cluster[i-1], clean_df.cluster[i])
		transition_list.append(transition)

print('Total transition recorded:', len(transition_list))
print(transition_list)

Total transition recorded: 3107
[(4, 4), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 4), (0, 5), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 0), (3, 5), (1, 1), (3, 0), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 0), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 4), (4, 0), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (0, 1), (1, 1), (0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 4), (4, 3), (3, 4), (4, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (5, 3), (0, 0), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (0, 1), (1, 1), (1, 0), (0, 1), (2, 2), (2, 0), (3, 0), (0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (1, 0), (0, 0), (0, 0), (0, 0), (0, 3), (3, 1), (1, 1), (0, 0), (0, 0), (0, 2), (2, 2), (3, 3), (1, 1), (1, 1), (1, 4), (1, 1), (1, 1), (0, 0), (0, 0), (0, 2), (2, 0),

#####Splitting the data into training and test datasett

In [7]:
train_ratio = round(0.8*len(transition_list))
test_ratio = round(0.2*len(transition_list))
random.shuffle(transition_list)

train_data = transition_list[:train_ratio]
test_data = transition_list[-test_ratio:]

print('Total object in training dataset:', len(train_data))
print('Total object in test dataset:',len(test_data))

Total object in training dataset: 2486
Total object in test dataset: 621


#####Calculating the transition matrix

In [8]:
pairs = pd.DataFrame(train_data, columns=['From', 'To'])
counts = pairs.groupby('From')['To'].value_counts()
sum = pairs.groupby('From').sum()
probability_matrix = pd.DataFrame(counts / counts.sum()).unstack()
transition_matrix = pd.DataFrame(pd.crosstab(pairs.From, pairs.To).apply(lambda r: r/r.sum(), axis=1))

# print('Probablity Matrix:')
# print(probability_matrix)
print('Transition Matrix:')
# print(transition_matrix)
transition_matrix

# probability_matrix.to_csv('/content/drive/My Drive/Colab Notebooks/FlickrCrawling/Sarbagita/CSV/Probability Matrix.csv')
# transition_matrix.to_csv('/content/drive/My Drive/Colab Notebooks/FlickrCrawling/Sarbagita/CSV/Transition Matrix.csv')

Transition Matrix:


To,0,1,2,3,4,5
From,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.771694,0.073347,0.051653,0.048554,0.028926,0.025826
1,0.091133,0.809113,0.011084,0.013547,0.038177,0.036946
2,0.256684,0.064171,0.59893,0.042781,0.016043,0.02139
3,0.207627,0.076271,0.055085,0.622881,0.016949,0.021186
4,0.156863,0.112745,0.009804,0.044118,0.647059,0.029412
5,0.367089,0.177215,0.101266,0.101266,0.063291,0.189873


#####Evaluating the accuracy of the markov chain model

In [9]:
test_data_df = pd.DataFrame(test_data, columns=['from','to'])
prediction_list = pd.DataFrame(columns=['from','to'])
prediction_list['from'] = test_data_df['from']

# predicting the next cluster visited based on the highest probability in transition matrix
for i, row in prediction_list.iterrows():
    from_cluster = prediction_list.loc[i, 'from']
    prediction_list.loc[i, 'to'] = transition_matrix[from_cluster].idxmax(axis=1)

# calculating the accuracy of the prediction
accurate_prediction = 0

for i, row in prediction_list.iterrows():
    if prediction_list.loc[i, 'to'] == test_data_df.loc[i, 'to'] and prediction_list.loc[i, 'from'] == test_data_df.loc[i, 'from']:
        accurate_prediction += 1

# accuracy = accuracy_score(test_data_df['to'], prediction_list['to'])
accuracy = accurate_prediction / len(prediction_list)

print('Accurate prediction made:', accurate_prediction)
print('Accuracy of Markov Chain Model: '+'{:.1%}'.format(accuracy))

Accurate prediction made: 444
Accuracy of Markov Chain Model: 71.5%
