In [15]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
import matplotlib
import networkx as nx
from scipy.sparse import *
from pandas.api.types import CategoricalDtype
import numpy as np
import math
import os

%matplotlib inline

In [16]:
def to_stack_df(firm_and_city_df):
    """
    Stack Firm columns to Firm Index

    Turns the wide City x Firm table into a long table with one row per
    (City, Firm) cell.

    Parameters
    ----------
    firm_and_city_df : pandas.DataFrame
        Wide table with a 'City' column; every other column is treated as
        one firm whose cells hold that firm's size in the city.

    Returns
    -------
    pandas.DataFrame
        Long table with exactly the columns 'City', 'Firm', 'Size'.
        The input frame is not modified.
    """
    # set_index returns a new frame, so the caller's table is left untouched.
    wide_df = firm_and_city_df.set_index('City')

    # stack() moves the firm column labels into an inner index level;
    # reset_index() then flattens (City, Firm) back into ordinary columns.
    # NOTE(review): whether empty (NaN) cells are kept or dropped here depends
    # on the installed pandas version's stack() default - confirm if it matters.
    stack_firm_and_city_df = wide_df.stack().reset_index()
    stack_firm_and_city_df.columns = ['City', 'Firm', 'Size']

    return stack_firm_and_city_df

def create_interlocking_world_city_network(stack_firm_and_city_df):
    """
    Create interlocking world city network from stack dataframe (City,Firm,Size)

    Builds a City x Firm matrix of sizes, multiplies it by its own transpose
    to obtain a City x City connectivity matrix, and returns the strictly
    lower triangle of that matrix as an edge list.

    Parameters
    ----------
    stack_firm_and_city_df : pandas.DataFrame
        Long table with the columns 'City', 'Firm' and 'Size'.

    Returns
    -------
    pandas.DataFrame
        One row per stored city pair, with the columns
        'connectivity', 'City_x', 'City_y' (in that order). Because only the
        part below the diagonal is kept, each unordered pair appears at most
        once and a city is never paired with itself.
    """
    base_df = stack_firm_and_city_df

    # Ordered categoricals give every city / firm a stable integer code:
    # its position in the sorted list of unique labels.
    City_c = CategoricalDtype(sorted(base_df.City.unique()), ordered=True)
    Firm_c = CategoricalDtype(sorted(base_df.Firm.unique()), ordered=True)
    city_labels = City_c.categories

    row = base_df.City.astype(City_c).cat.codes
    col = base_df.Firm.astype(Firm_c).cat.codes

    sparse_matrix = csr_matrix(
        (base_df['Size'], (row, col)),
        shape=(city_labels.size, Firm_c.categories.size),
    )

    # Entry (a, b) of the product is the sum over firms of size_a * size_b.
    # '@' is always a matrix product, whatever sparse container is used.
    base_connectivity_matrix = sparse_matrix @ sparse_matrix.transpose()

    # k=-1 drops the diagonal and the (mirror-image) upper triangle.
    co_connectivity_matrix = tril(base_connectivity_matrix, k=-1).tocoo()

    # Translate the integer row/column codes straight back into city labels.
    # Naming the output columns explicitly avoids relying on the column
    # positions that a chain of merges happens to produce.
    iwcn_df = pd.DataFrame({
        'connectivity': co_connectivity_matrix.data,
        'City_x': city_labels.take(co_connectivity_matrix.row).to_numpy(),
        'City_y': city_labels.take(co_connectivity_matrix.col).to_numpy(),
    })

    return iwcn_df

def get_node_degree_centrality(iwcn_df, firm_count=None, city_count=None):
    """
    Return df with node degree centrality metrics for interlocking world city network

    Parameters
    ----------
    iwcn_df : pandas.DataFrame
        Edge list with the columns 'City_x', 'City_y' and 'connectivity'.
    firm_count : int, optional
        Number of firms used to normalise the centrality.
    city_count : int, optional
        Number of cities used to normalise the centrality.

        If either count is omitted, a module-level variable of the same name
        is used when one exists. This keeps older call sites that relied on
        notebook globals working; new code should pass both counts explicitly.

    Returns
    -------
    pandas.DataFrame
        Columns 'city', 'centrality' (connectivity-weighted degree of the
        city in the network) and 'weighted_centrality' (normalised value).

    Raises
    ------
    ValueError
        If a count is neither passed in nor available as a module global.
    """
    # Legacy fallback: previously these two names were read from whatever
    # happened to be defined in the notebook's global namespace.
    if firm_count is None:
        firm_count = globals().get('firm_count')
    if city_count is None:
        city_count = globals().get('city_count')
    if firm_count is None or city_count is None:
        raise ValueError(
            'firm_count and city_count are required to normalise the '
            'centrality; pass them to get_node_degree_centrality()'
        )

    G = nx.from_pandas_edgelist(iwcn_df, 'City_x', 'City_y', ['connectivity'])

    # Weighted degree: for every city, the sum of 'connectivity' over its edges.
    node_degree_centrality_dict = dict(G.degree(weight='connectivity'))
    node_degree_centrality_df = pd.DataFrame({
        'city': list(node_degree_centrality_dict.keys()),
        'centrality': list(node_degree_centrality_dict.values()),
    })

    # NOTE(review): the meaning of the constant 20 is not documented, and by
    # operator precedence the "- 1" applies to the whole product rather than
    # to city_count alone. Formula kept exactly as it was - confirm intent.
    normaliser = (firm_count * 20) * city_count - 1
    node_degree_centrality_df['weighted_centrality'] = node_degree_centrality_df.centrality / normaliser

    return node_degree_centrality_df


def _size_label(level):
    """Render a Size level for use in a column name (0 and 0.0 both give '0')."""
    if isinstance(level, float) and level.is_integer():
        return str(int(level))
    return str(level)


def _size_count_table(stack_firm_and_city_df, key, prefix):
    """Count rows per Size level for every value of `key`; columns are prefix + level."""
    counts = stack_firm_and_city_df.groupby(key)['Size'].value_counts().unstack().fillna(0)
    counts.columns = [prefix + _size_label(level) for level in counts.columns]
    return counts


def create_df_for_regression_model(stack_firm_and_city_df):
    """
    Add per-firm and per-city Size distributions to every (City, Firm) row

    Parameters
    ----------
    stack_firm_and_city_df : pandas.DataFrame
        Long table with the columns 'City', 'Firm' and 'Size'.

    Returns
    -------
    pandas.DataFrame
        The input rows followed by one 'F<level>' column per Size level
        (how many rows of this row's firm have that Size) and one
        'C<level>' column per Size level (same, for this row's city).
        With Size levels 0..5 the columns are F0..F5 and C0..C5.
    """
    # Column names are derived from the Size levels actually present, so data
    # with fewer (or other) levels neither crashes nor gets mislabelled.
    counts_of_branch_in_firms_by_size = _size_count_table(stack_firm_and_city_df, 'Firm', 'F')
    counts_of_branch_in_city_by_size = _size_count_table(stack_firm_and_city_df, 'City', 'C')

    for_regression_model = stack_firm_and_city_df.merge(counts_of_branch_in_firms_by_size, how='left', on='Firm')
    for_regression_model = for_regression_model.merge(counts_of_branch_in_city_by_size, how='left', on='City')

    return for_regression_model


def preprocessing_pipe(firm_and_city_df, out_folder_name):
    """
    Create interlocking world city network, calc node degree centrality, create file for regression model

    Writes four CSV files into '../Data/ProcessingData/<out_folder_name>/'
    (the folder is created if needed): firm_and_city.csv,
    interlocking_world_city_network.csv, node_degree_centrality.csv and
    for_regression_model.csv.

    Parameters
    ----------
    firm_and_city_df : pandas.DataFrame
        Wide table with a 'City' column and one column per firm.
    out_folder_name : str
        Name of the output sub-folder.
    """
    directory = '../Data/ProcessingData/{folder}'.format(folder=out_folder_name)
    os.makedirs(directory, exist_ok=True)

    # Dimensions of the wide City x Firm table: rows are cities, and every
    # column other than 'City' is a firm. Needed to normalise the centrality.
    city_count, firm_count = firm_and_city_df.set_index('City').shape

    stack_firm_and_city_df = to_stack_df(firm_and_city_df)
    stack_firm_and_city_df.to_csv(os.path.join(directory, 'firm_and_city.csv'))

    iwcn_df = create_interlocking_world_city_network(stack_firm_and_city_df)
    iwcn_df.to_csv(os.path.join(directory, 'interlocking_world_city_network.csv'))

    node_degree_centrality_df = get_node_degree_centrality(
        iwcn_df, firm_count=firm_count, city_count=city_count
    )
    node_degree_centrality_df.to_csv(os.path.join(directory, 'node_degree_centrality.csv'))

    for_regression_model = create_df_for_regression_model(stack_firm_and_city_df)
    for_regression_model.to_csv(os.path.join(directory, 'for_regression_model.csv'))

In [17]:
# 2018 snapshot - the firm/city table is read from the 'InPut' sheet
raw_2018_path = '../Data/RawData/raw_data_2018.xlsx'
firm_and_city_18_df = pd.read_excel(raw_2018_path, sheet_name='InPut')
preprocessing_pipe(firm_and_city_18_df, out_folder_name='data_2018')

# 2015 snapshot - read from the workbook's default (first) sheet
raw_2015_path = '../Data/RawData/raw_data_2015.xlsx'
firm_and_city_15_df = pd.read_excel(raw_2015_path)
preprocessing_pipe(firm_and_city_15_df, out_folder_name='data_2015')