In [None]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import networkx as nx
!pip install pyvis
from pyvis import network as net

In [None]:
# Load the competition's candidate-pair table; the cells below read its
# `categories_1`, `categories_2` and `match` columns.
pairs = pd.read_csv(
    filepath_or_buffer='../input/foursquare-location-matching/pairs.csv',
)

# 1. Introduction

The main goal of this notebook was to get a better understanding of the relationships between 'categories'. I used the 'pairs.csv' dataframe for this analysis. I measured the relationship from category A to category B as the number of times category B was a point of interest to category A divided by the total number of occurrences of category A. I excluded compound categories (categories consisting of multiple categories), and only the top 500 of the remaining categories were used. This left us with 199270 rows for the analysis. Section 2 gives a brief overview of the relationships, and you will find an interactive map of categories with relationships stronger than 0.1 in Section 3.

In [None]:
# Drop compound categories: a comma in either column means the entry lists
# several categories at once. `na=False` makes a missing category count
# explicitly as "not compound" instead of relying on how NaN propagates
# through the boolean `|`; `regex=False` treats the comma as a literal.
pairs = pairs[
    ~(
        pairs.categories_1.str.contains(',', na=False, regex=False)
        | pairs.categories_2.str.contains(',', na=False, regex=False)
    )
]

# Keep only the pairs that are labelled as true matches.
pairs = pairs[pairs['match'].eq(True)]

# Count every category occurrence across both columns and keep the 500 most
# frequent ones. `pd.concat` replaces `Series.append`, which was removed in
# pandas 2.0 and would raise AttributeError on a current kernel.
top_categories = (
    pd.concat([pairs['categories_1'], pairs['categories_2']])
    .value_counts()
    .head(500)
)
top_categories

In [None]:
# Total number of category occurrences covered by the retained top categories.
n_top_occurrences = top_categories.sum()
print(f'sum of top categories: {n_top_occurrences}')

In [None]:
# Keep only the pairs in which *both* categories belong to the top categories.
pairs_top = pairs.loc[
    lambda frame: frame.categories_1.isin(top_categories.index)
    & frame.categories_2.isin(top_categories.index)
]
print(f'shape of remaining dataframe: {pairs_top.shape}')

In [None]:
# Co-occurrence counts per category, built from the rows of `pairs_top`.
# Layout:
#   dict_relatedness[cat]['count'] -> how often `cat` appears in either column
#   dict_relatedness[cat][other]   -> how often `cat` was paired with `other`
# A pair whose two categories are identical increments both counters twice.
# NOTE: 'count' is a reserved key, so a category literally named 'count'
# would collide with the occurrence counter.
dict_relatedness = {}

# Walk the two columns directly instead of using DataFrame.iterrows(), which
# builds a Series for every row and is much slower on a frame of this size.
category_pairs = zip(pairs_top['categories_1'], pairs_top['categories_2'])

for cat_1, cat_2 in tqdm(category_pairs, total=len(pairs_top)):
    # Skip rows in which either category is missing (i.e. not a string).
    if not (isinstance(cat_1, str) and isinstance(cat_2, str)):
        continue

    # Create the per-category entries on first sight (cat_1 first, then cat_2).
    stats_1 = dict_relatedness.setdefault(cat_1, {'count': 0})
    stats_2 = dict_relatedness.setdefault(cat_2, {'count': 0})

    stats_1['count'] += 1
    stats_2['count'] += 1

    # Record the pairing in both directions.
    stats_1[cat_2] = stats_1.get(cat_2, 0) + 1
    stats_2[cat_1] = stats_2.get(cat_1, 0) + 1
        

# Flatten the nested counts into parallel lists, one entry per
# (category, paired category) combination.
relatedness = []
first = []
second = []
edges = []

for source_cat, stats in tqdm(dict_relatedness.items(), total=len(dict_relatedness)):
    occurrences = stats['count']
    for target_cat, pair_count in stats.items():
        # 'count' holds the occurrence total, not a paired category.
        if target_cat == 'count':
            continue
        # Share of `source_cat` occurrences that were paired with `target_cat`.
        share = pair_count / occurrences
        first.append(source_cat)
        second.append(target_cat)
        # Edge tuples carry the share rounded to two decimals as their weight.
        edges.append((source_cat, target_cat, np.around(share, decimals=2)))
        relatedness.append(share)

# One row per directed relation, strongest relations first.
related_df = (
    pd.DataFrame({'first': first, 'second': second, 'relatedness': relatedness, 'edges': edges})
    .sort_values('relatedness', ascending=False)
    .reset_index(drop=True)
)

# 2. A brief look at the resulting data

In [None]:
# Self-relations: rows in which a category is paired with itself (first five rows).
related_df[related_df['first'].eq(related_df['second'])].head()

In [None]:
# Categories whose relatedness to themselves is at most 0.3.
related_df.loc[
    related_df['first'].eq(related_df['second']) & related_df['relatedness'].le(0.3)
]

In [None]:
# All relations between two different categories.
related_df[related_df['first'].ne(related_df['second'])]

In [None]:
# First ten relations whose source category is 'Diners'.
related_df.loc[related_df['first'].eq('Diners')].head(10)

# 3. Interactive visualization of the network

In [None]:
# Edges for the graph: relations between two different categories whose
# relatedness is at least 0.1.
df_for_network = related_df.loc[
    related_df['first'].ne(related_df['second']) & related_df['relatedness'].ge(0.1)
]
df_for_network.head()

**The following directed graph shows the relationships between categories. It only shows connections with a relatedness of at least 0.1. You can zoom in and drag the nodes around to get a better understanding.**

In [None]:
# Build a directed graph: categories are nodes, and each
# (first, second, weight) tuple in the 'edges' column is a weighted edge.
G = nx.DiGraph()

G.add_nodes_from(df_for_network['first'].tolist())
G.add_nodes_from(df_for_network['second'].tolist())
G.add_weighted_edges_from(df_for_network['edges'].tolist())

# Hand the graph to pyvis and render it to an HTML file shown in the notebook.
g = net.Network(
    height='750px',
    width='100%',
    directed=True,
    notebook=True,
    bgcolor='#222222',
    font_color='white',
)
g.from_nx(G)
g.show('example.html')

# Thank you for reading.

In [None]:
# Persist the relatedness table (without the positional index) for reuse.
related_df.to_csv(
    path_or_buf='related_df_FoursquareLM.csv',
    index=False,
)