In [1]:
import pandas as pd # for dataframes manipulation
#import matplotlib.pyplot as plt

# Import the data
firms = pd.read_csv('SP_500_firms.csv')
close_prices = pd.read_csv('SP_500_close_2015.csv')

# Create subset of the original dataset which includes
# only 10 firms - used for test purposes
firms_small = firms.iloc[:10, :]
close_prices_small = close_prices.iloc[:, :11]

print(firms_small[:5])

  Symbol                 Name                  Sector
0    MMM           3M Company             Industrials
1    ABT  Abbott Laboratories             Health Care
2   ABBV               AbbVie             Health Care
3    ACN        Accenture plc  Information Technology
4   ATVI  Activision Blizzard  Information Technology


In [2]:
print(close_prices_small[:5])

         Date         MMM        ABT       ABBV        ACN       ATVI  \
0  2015-01-02  156.678596  43.160459  61.986410  86.129228  19.765196   
1  2015-01-05  153.145069  43.170070  60.819874  84.674997  19.490271   
2  2015-01-06  151.511999  42.679830  60.518833  84.064223  19.126976   
3  2015-01-07  152.610267  43.025880  62.964797  85.828689  18.714587   
4  2015-01-08  156.267949  43.910238  63.623323  87.137495  18.901144   

          AYI       ADBE         AAP        AES        AET  
0  139.234407  72.339996  158.132353  12.860543  87.354435  
1  135.889914  71.980003  156.047994  12.494440  86.173965  
2  134.187800  70.529999  155.938290  12.212822  86.301853  
3  136.566769  71.110001  159.289228  12.231597  88.033197  
4  141.344618  72.919998  160.685446  12.419342  90.885990  


In [3]:
# ### 1 ###
# Create a function that takes a dataframe with closing prices 
# and returns a dataframe with daily returns
# Date is to define whether the dataframe has a first
def StockReturns(cl_p, Date = True):
    """
    Input:  The dataframe with the prices of the stocks
            A logical argument that equals True if the first column
            in the dataframe represents dates, False otherwise
    Output: A dataframe with the daily returns of the stocks - same 
            number of columns as the input vector and one row less 
            than the input vector
    """    
    if Date:
        d_ret = pd.DataFrame(cl_p.iloc[1:, 0], columns=['Date'])
        j = 1
    else:
        d_ret = pd.DataFrame()
        j = 0
    for i in range(j, cl_p.shape[1]):
        d_ret[cl_p.columns[i]] = (cl_p.iloc[1:,i].values - cl_p.iloc[:-1,i].values) / cl_p.iloc[:-1,i].values
    return d_ret

In [4]:
# Test the previous function on the small dataset

# Daily returns including the Date column    
daily_returns = StockReturns(close_prices_small) 
print(daily_returns.iloc[:5, :5])

# Daily returns excluding the Date column
daily_returns_2 = StockReturns(close_prices_small.iloc[:, 1:], Date = False)
print(daily_returns_2.iloc[:5, :5])   

         Date       MMM       ABT      ABBV       ACN
1  2015-01-05 -0.022553  0.000223 -0.018819 -0.016884
2  2015-01-06 -0.010664 -0.011356 -0.004950 -0.007213
3  2015-01-07  0.007249  0.008108  0.040417  0.020989
4  2015-01-08  0.023967  0.020554  0.010459  0.015249
5  2015-01-09 -0.012284 -0.010508 -0.027355 -0.001113
        MMM       ABT      ABBV       ACN      ATVI
0 -0.022553  0.000223 -0.018819 -0.016884 -0.013910
1 -0.010664 -0.011356 -0.004950 -0.007213 -0.018640
2  0.007249  0.008108  0.040417  0.020989 -0.021561
3  0.023967  0.020554  0.010459  0.015249  0.009969
4 -0.012284 -0.010508 -0.027355 -0.001113 -0.018182


In [5]:
# ### 2 ###
# Create a function that returns a list of correlations between all the
# firms
def Correlations(d_ret):
    """
    Input:  A dataframe with the daily returns of the stocks - The 
            first column can either indicate dates or not
    Output: A list of tuples. Each tuple in the list have 3 elements:
            1. The correlation between two firms
            2 and 3. The firms for which we compute the correlation
    """
    cor = d_ret.corr()
    n = int(cor.shape[0])
    cor_list = []
    for i in range(1, n):
        for j in range(0, i):
            cor_list.append((cor.iloc[i, j], cor.columns.values[i], cor.columns.values[j]))
    return cor_list

In [6]:
# Test the previous function on the small dataset

# Correlation list
correlation_list = Correlations(daily_returns)
print(correlation_list[:5])

[(0.59866616402973749, 'ABT', 'MMM'), (0.32263699601940204, 'ABBV', 'MMM'), (0.48366885347180477, 'ABBV', 'ABT'), (0.63205934885601844, 'ACN', 'MMM'), (0.64408052989752662, 'ACN', 'ABT')]


In [7]:
# Create a function that sorts the list based on the correlation in 
# decreasing order
def SortCorrs(cor_list): 
    """
    Input:  A list of tuples. Each tuple in the list have 3 elements:
            1. The correlation between two firms
            2 and 3. The firms for which we compute the correlation
    Output: Returns the same list of tuples ordered based on the
            first element of the tuples, e.g. the correlation
    """
    return sorted(cor_list, reverse = True)

In [8]:
# Test the previous function

ordered_list = SortCorrs(correlation_list)
print(ordered_list[:5])

[(0.64408052989752662, 'ACN', 'ABT'), (0.63205934885601844, 'ACN', 'MMM'), (0.59866616402973749, 'ABT', 'MMM'), (0.58567123295127221, 'ADBE', 'ABT'), (0.56075923030727226, 'ADBE', 'ACN')]


In [9]:
firmset = set(firms_small['Symbol'])
firmset


{'AAP', 'ABBV', 'ABT', 'ACN', 'ADBE', 'AES', 'AET', 'ATVI', 'AYI', 'MMM'}

In [10]:
firmdict = {}
for firm in firmset:
    firmdict[firm] = [firm, firm]

In [14]:
def findBottomNode(firm, firmDict):
    if (firmDict[firm][1] == firm):
        return firm
    else:
        return(findBottomNode(firmDict[firm][1],firmDict))

In [15]:
def findTopNode(firm, firmDict):
    if (firmDict[firm][0] == firm):
        return firm
    else:
        return(findTopNode(firmDict[firm][0],firmDict))

In [16]:
coritem = ordered_list.pop(0)
coritem

(0.63205934885601844, 'ACN', 'MMM')

In [17]:
lastnodefromsource = findBottomNode(coritem[1],firmdict)
lastnodefromdest = findBottomNode(coritem[2],firmdict)
if (lastnodefromsource == lastnodefromdest):
    pass
else:
    firstnodefromdest = findTopNode(coritem[2],firmdict)
    firmdict[lastnodefromsource][1] = firstnodefromdest
    firmdict[firstnodefromdest][0] = lastnodefromsource

In [18]:
firmdict

{'AAP': ['AAP', 'AAP'],
 'ABBV': ['ABBV', 'ABBV'],
 'ABT': ['ABT', 'ABT'],
 'ACN': ['ACN', 'MMM'],
 'ADBE': ['ADBE', 'ADBE'],
 'AES': ['AES', 'AES'],
 'AET': ['AET', 'AET'],
 'ATVI': ['ATVI', 'ATVI'],
 'AYI': ['AYI', 'AYI'],
 'MMM': ['ACN', 'MMM']}

In [19]:
coritem = ordered_list.pop(0)
coritem

(0.59866616402973749, 'ABT', 'MMM')

In [20]:
lastnodefromsource = findBottomNode(coritem[1],firmdict)
lastnodefromdest = findBottomNode(coritem[2],firmdict)
if (lastnodefromsource == lastnodefromdest):
    pass
else:
    firstnodefromdest = findTopNode(coritem[2],firmdict)
    firmdict[lastnodefromsource][1] = firstnodefromdest
    firmdict[firstnodefromdest][0] = lastnodefromsource

In [21]:
firmdict

{'AAP': ['AAP', 'AAP'],
 'ABBV': ['ABBV', 'ABBV'],
 'ABT': ['ABT', 'ACN'],
 'ACN': ['ABT', 'MMM'],
 'ADBE': ['ADBE', 'ADBE'],
 'AES': ['AES', 'AES'],
 'AET': ['AET', 'AET'],
 'ATVI': ['ATVI', 'ATVI'],
 'AYI': ['AYI', 'AYI'],
 'MMM': ['ACN', 'MMM']}

In [22]:
coritem = ordered_list.pop(0)
coritem

(0.58567123295127221, 'ADBE', 'ABT')

In [23]:
lastnodefromsource = findBottomNode(coritem[1],firmdict)
lastnodefromdest = findBottomNode(coritem[2],firmdict)
if (lastnodefromsource == lastnodefromdest):
    pass
else:
    firstnodefromdest = findTopNode(coritem[2],firmdict)
    firmdict[lastnodefromsource][1] = firstnodefromdest
    firmdict[firstnodefromdest][0] = lastnodefromsource

In [24]:
firmdict

{'AAP': ['AAP', 'AAP'],
 'ABBV': ['ABBV', 'ABBV'],
 'ABT': ['ADBE', 'ACN'],
 'ACN': ['ABT', 'MMM'],
 'ADBE': ['ADBE', 'ABT'],
 'AES': ['AES', 'AES'],
 'AET': ['AET', 'AET'],
 'ATVI': ['ATVI', 'ATVI'],
 'AYI': ['AYI', 'AYI'],
 'MMM': ['ACN', 'MMM']}

In [25]:
# ### 3 ###
# Create the function that performs the clustering algorithm
def ClusteringAlg2(firm_set, ord_list, k = 5):
    '''
    Input:
        - A set of strings (expected to be a set of firm stock ticker strings)
        - An ordered list of items with:
            - A ranked value (expected to be stock correlations)
            - An element from the set (expected to be a firm stock ticker string)
            - Another element from the set (expected to be a firm stock ticker string)
        - An integer, which determines the number of iterations
    '''
    
    ##change the firm_set into a dictionary which has
    ##a key - which is the firm name
    ##a list - including
    ##    a "prev" firm name
    ##    and a "next" firm name
    firmdict = {}
    for firm in firm_set:
        firmdict[firm] = [firm, firm]
    setOfStartNodes = set(firm_set)
    
    ##complete the loop k times
    for i in range(0,k):
        
        #take an item from the ordered list
        coritem = ord_list.pop(0)
        
        ##error handling - check that both the source and destination
        ##elements are in the list of firms
        if(coritem[1] not in firm_set or coritem[2] not in firm_set):
            return("The ordered list must contain elements from the set")
        
        ##check if the firms are already in the same "set"
        ##i.e. have the same bottom node
        lastnodefromsource = findBottomNode(coritem[1],firmdict)
        lastnodefromdest = findBottomNode(coritem[2],firmdict)
        if (lastnodefromsource == lastnodefromdest):
            pass
        
        else:
        ## Otherwise, get the top node of the destination
            firstnodefromdest = findTopNode(coritem[2],firmdict)
        ## set the bottom node of the source to have a
        ## "next" pointer to the top node of the destination       
            firmdict[lastnodefromsource][1] = firstnodefromdest
        ## & visa versa set the "prev" pointer of the top node of the
        ## destination to the bottom node of the source
        ## i.e. join the sets such that there is only one "line"
        ## from start to finish
            firmdict[firstnodefromdest][0] = lastnodefromsource
        ## Remove the start node of the destination from start nodes
        ## as it now has a previous node, and is not a start.
            setOfStartNodes.remove(firstnodefromdest)
    
    return (firmdict, setOfStartNodes)

In [26]:
firmset = set(firms_small['Symbol'])
firmset
ClusteringAlg2(firmset, ordered_list[0:4], k = 4)

({'AAP': ['AAP', 'AAP'],
  'ABBV': ['ABBV', 'AYI'],
  'ABT': ['AYI', 'ABT'],
  'ACN': ['ADBE', 'ATVI'],
  'ADBE': ['ADBE', 'ACN'],
  'AES': ['AES', 'AES'],
  'AET': ['AET', 'AET'],
  'ATVI': ['ACN', 'ATVI'],
  'AYI': ['ABBV', 'ABT'],
  'MMM': ['MMM', 'MMM']},
 {'AAP', 'ABBV', 'ADBE', 'AES', 'AET', 'MMM'})

In [30]:
def ReturnClusters(firmdict, setOfStartNodes):
    startnodes = set(setOfStartNodes)
    listoflists = []
    while (len(startnodes) != 0):
        currentnode = startnodes.pop()
        currentlist = [currentnode]
        while (firmdict[currentnode][1] != currentnode):
            currentnode = firmdict[currentnode][1]
            currentlist.append(currentnode)
        listoflists.append(currentlist)
    return listoflists

In [33]:
ordered_list = SortCorrs(correlation_list)
ordered_list[0:4]
a, b = ClusteringAlg2(firmset, ordered_list[0:4], k = 4)
ReturnClusters(a,b)

[['AET'],
 ['ABBV'],
 ['AAP'],
 ['ATVI'],
 ['AYI'],
 ['ADBE', 'ACN', 'ABT', 'MMM'],
 ['AES']]

In [34]:
# ### 3 - full model.###
# Create the function that performs the clustering algorithm
def ClusteringAlg3(firm_set, ord_list, k = 5):
    '''
    Input:
        - A set of strings (expected to be a set of firm stock ticker strings)
        - An ordered list of items with:
            - A ranked value (expected to be stock correlations)
            - An element from the set (expected to be a firm stock ticker string)
            - Another element from the set (expected to be a firm stock ticker string)
        - An integer, which determines the number of iterations
    '''
    
    ##change the firm_set into a dictionary which has
    ##a key - which is the firm name
    ##a list - including
    ##    a "prev" firm name
    ##    and a "next" firm name
    firmdict = {}
    for firm in firm_set:
        firmdict[firm] = [firm, firm]
    setOfStartNodes = set(firm_set)
    
    ##complete the loop k times
    for i in range(0,k):
        
        #take an item from the ordered list
        coritem = ord_list.pop(0)
        
        ##error handling - check that both the source and destination
        ##elements are in the list of firms
        if(coritem[1] not in firm_set or coritem[2] not in firm_set):
            return("The ordered list must contain elements from the set")
        
        ##check if the firms are already in the same "set"
        ##i.e. have the same bottom node
        lastnodefromsource = findBottomNode(coritem[1],firmdict)
        lastnodefromdest = findBottomNode(coritem[2],firmdict)
        if (lastnodefromsource == lastnodefromdest):
            pass
        
        else:
        ## Otherwise, get the top node of the destination
            firstnodefromdest = findTopNode(coritem[2],firmdict)
        ## set the bottom node of the source to have a
        ## "next" pointer to the top node of the destination       
            firmdict[lastnodefromsource][1] = firstnodefromdest
        ## & visa versa set the "prev" pointer of the top node of the
        ## destination to the bottom node of the source
        ## i.e. join the sets such that there is only one "line"
        ## from start to finish
            firmdict[firstnodefromdest][0] = lastnodefromsource
        ## Remove the start node of the destination from start nodes
        ## as it now has a previous node, and is not a start.
            setOfStartNodes.remove(firstnodefromdest)
    
    return (ReturnClusters(firmdict, setOfStartNodes))

In [32]:
ordered_list = SortCorrs(correlation_list)
ordered_list[0:4]
ClusteringAlg3(firmset, ordered_list[0:4], k = 4)

[['AET'],
 ['ABBV'],
 ['AAP'],
 ['ATVI'],
 ['AYI'],
 ['ADBE', 'ACN', 'ABT', 'MMM'],
 ['AES']]

To do: plot the sets with different colours on the linegraph
