In [101]:
import pandas as pd # for dataframes manipulation
import numpy as np
import matplotlib.pyplot as plt
import csv

# Import the data
# Both CSV files are read from the current working directory.
firms = pd.read_csv('SP_500_firms.csv')
close_prices = pd.read_csv('SP_500_close_2015.csv')

# Positional subsets: the first 504 rows of the firms table and the first
# 497 columns of the price table.
# NOTE(review): the 504 / 497 cut-offs are unexplained magic numbers - confirm
# what they are meant to exclude. firms_small is not referenced again in the
# cells visible here.
firms_small = firms.iloc[:504, :]
close_prices_small = close_prices.iloc[:, :497]

#print(firms_small[:5])
#print(close_prices_small[:5])

# Display the first 20 rows of the columns at positions 1 and 2 of firms
firms.iloc[:20,1:3]

Unnamed: 0,Name,Sector
0,3M Company,Industrials
1,Abbott Laboratories,Health Care
2,AbbVie,Health Care
3,Accenture plc,Information Technology
4,Activision Blizzard,Information Technology
5,Acuity Brands Inc,Industrials
6,Adobe Systems Inc,Information Technology
7,Advance Auto Parts,Consumer Discretionary
8,AES Corp,Utilities
9,Aetna Inc,Health Care


In [45]:
def StockReturns(cl_p, Date = True):
    """
    Compute simple daily returns, (p_t - p_{t-1}) / p_{t-1}, for every
    price column of a dataframe.

    Input:  cl_p - the dataframe with the prices of the stocks, one row
                   per day and one column per stock
            Date - a logical argument that equals True if the first column
                   in the dataframe represents dates, False otherwise
    Output: A dataframe with the daily returns of the stocks - same
            number of columns as the input dataframe and one row less
            than the input dataframe.
            When Date is True the first column of the result is called
            'Date' and holds the dates of rows 1..n-1 of the input
            (whatever the input column was named), and the result keeps
            the index labels of those rows. When Date is False the result
            has a fresh 0..n-2 index.
    """
    first_price_col = 1 if Date else 0
    prices = cl_p.iloc[:, first_price_col:]

    # Compute every return in one vectorised step instead of inserting the
    # columns one at a time, which fragments the frame for wide inputs.
    previous = prices.iloc[:-1].values
    current = prices.iloc[1:].values
    returns = (current - previous) / previous

    if not Date:
        return pd.DataFrame(returns, columns=prices.columns)

    d_ret = pd.DataFrame(returns, index=cl_p.index[1:], columns=prices.columns)
    # Take the dates by position so that they are carried over even when the
    # first column of the input is not literally named 'Date'.
    d_ret.insert(0, 'Date', cl_p.iloc[1:, 0])
    return d_ret
    
    
# Daily returns including the Date column
# (the first column of close_prices_small is treated as the dates)
daily_returns = StockReturns(close_prices_small) 
#print(daily_returns.iloc[:5, :5])

# Daily returns excluding the Date column
# (the first column is sliced off before the call, hence Date = False)
daily_returns_2 = StockReturns(close_prices_small.iloc[:, 1:], Date = False)
#print(daily_returns_2.iloc[:5, :5])   

In [35]:
def Correlations(d_ret):
    """
    List the pairwise correlations between the columns of a returns
    dataframe.

    Input:  A dataframe with the daily returns of the stocks - The
            first column can either indicate dates or not. Non-numeric
            columns (such as a column of date strings) are ignored.
    Output: A list of tuples, one per unordered pair of firms. Each
            tuple in the list has 3 elements:
            1. The correlation between two firms
            2 and 3. The firms for which we compute the correlation
    """
    # Drop non-numeric columns explicitly: recent pandas versions no longer
    # discard them silently in corr() and raise on a string Date column.
    cor = d_ret.select_dtypes(include=['number', 'bool']).corr()
    names = cor.columns.values
    # Read the matrix through NumPy once instead of doing one .iloc lookup
    # per pair.
    values = cor.values
    # Strict lower triangle: every pair exactly once, no self-correlations.
    return [
        (values[i, j], names[i], names[j])
        for i in range(1, len(names))
        for j in range(i)
    ]
    

# Pairwise correlations, computed from the returns frame that still carries
# the Date column
correlation_list = Correlations(daily_returns)
#print(correlation_list[:5])

In [147]:
def SortCorrs(cor_list): 
    """
    Input:  A list of tuples. Each tuple in the list has 3 elements:
            1. The correlation between two firms
            2 and 3. The firms for which we compute the correlation
    Output: A new list with the same tuples in descending order (ties
            on the correlation are broken by the firm names, also
            descending). Tuples whose correlation is NaN are placed at
            the end, in their original relative order. The input list is
            not modified.
    """
    # NaN is the only value that is not equal to itself. It must be kept
    # out of the sort because every comparison involving NaN is False,
    # which would leave the order of the whole list undefined.
    comparable = [entry for entry in cor_list if entry[0] == entry[0]]
    undefined = [entry for entry in cor_list if entry[0] != entry[0]]
    return sorted(comparable, reverse = True) + undefined
    

# Pairs ordered from the highest correlation downwards; this is the list the
# clustering cells below read from
ordered_list = SortCorrs(correlation_list)
#print(ordered_list[:5])


In [148]:
def clusteringAlg(ord_list, k = 0):
    """
    Greedy agglomerative clustering driven by an ordered list of pairs.

    Input:  
    ord_list: The ordered list of tuples which include the
              correlations between firms and the firms themselves
              (correlation, firm, firm). Only the two firms of each
              tuple are used here; the order of the list decides which
              pairs are merged first.
    k: The number of iterations for the clustering algorithm, i.e. how
       many tuples from the start of ord_list are processed. Values
       larger than len(ord_list) are capped; k <= 0 performs no merge.
    Output: 
    A list of sets where each set represents an individual 
    cluster. Clusters appear in order of first appearance of their
    firms in ord_list; a merged cluster takes the position of the
    cluster that contained the first firm of the merging pair.
    """
    # Initialize the list of sets. Each set represents a cluster
    # which initially includes only one firm. A separate `seen` set keeps
    # this pass linear instead of scanning the list of clusters per firm.
    sets = []
    seen = set()
    for pair in ord_list:
        for firm in (pair[1], pair[2]):
            if firm not in seen:
                seen.add(firm)
                sets.append({firm})
    # Repeat the algorithm k times
    # In iteration j we look at the j-th tuple of the ordered list and
    # check whether its 2 firms are already in the same set. If they are,
    # we move on to the next tuple, otherwise we merge the two sets.
    for j in range(min(k, len(ord_list))):
        nd1, nd2 = ord_list[j][1], ord_list[j][2]
        idx1 = idx2 = None
        for i, cluster in enumerate(sets):
            if idx1 is None and nd1 in cluster:
                idx1 = i
            if idx2 is None and nd2 in cluster:
                idx2 = i
            if idx1 is not None and idx2 is not None:
                break  # both firms located, no need to scan further
        if idx1 != idx2:
            sets[idx1] = sets[idx1] | sets[idx2]
            # Delete by position: the absorbed cluster's index is known.
            del sets[idx2]
    return sets

In [161]:
def applyClusteringAlg(x):
    """
    Print the clusters obtained for each iteration count in x.

    Input:  x - an iterable of values of k; for each one the clustering
                algorithm is run on the global ordered_list
    Output: None - the list of clusters is printed once per value of k
    """
    for iterations in x:
        clusters = clusteringAlg(ordered_list, iterations)
        print(clusters)

# Iteration counts (k) to pass to applyClusteringAlg; the call itself is
# left commented out
B = [5,50,100,500,1000]

#applyClusteringAlg(B)

In [149]:
def readNamesIntoDict(path = "SP_500_firms.csv"):
    """
    Build a lookup from ticker symbol to firm details.

    Input:  path - the CSV file to read; it must have a header row with
                   'Symbol', 'Name' and 'Sector' columns. Defaults to
                   "SP_500_firms.csv" in the working directory.
    Output: A dictionary mapping each symbol to the list [name, sector].
            If a symbol occurs more than once, the last row wins.
    """
    # The with-block closes the file once it has been read; newline='' lets
    # the csv module handle line endings inside quoted fields itself.
    with open(path, newline='') as firms_file:
        return {
            row['Symbol']: [row['Name'], row['Sector']]
            for row in csv.DictReader(firms_file)
        }

# Symbol -> [name, sector] lookup; fullNames() below reads this global
namesDict = readNamesIntoDict()
# Spot check of one entry, displayed as the cell output
namesDict["AAPL"]

#compNames = namesDict.keys()
#compNames

['Apple Inc.', 'Information Technology']

In [160]:
def tracker(sets,x):
    """
    Find the cluster that a given firm belongs to.

    Input:  sets - a list of sets, each set being one cluster
            x    - the firm to look for
    Output: The first set in the list that contains x (the same object,
            not a copy), or None if no set contains it
    """
    return next((cluster for cluster in sets if x in cluster), None)
            
# Iteration counts (k) handed to trackerwithk() further down in this cell
A = [10, 50, 100, 500, 1000]

def trackerwithk(A):
    """
    Print the cluster containing "GS" for each iteration count in A.

    Input:  A - an iterable of values of k; for each one the clustering
                algorithm is re-run on the global ordered_list
    Output: None - one cluster (or None if "GS" is in no cluster) is
            printed per value of k
    """
    for iterations in A:
        clusters = clusteringAlg(ordered_list, iterations)
        print(tracker(clusters, "GS"))

# Print the cluster containing "GS" once for every k in A
trackerwithk(A)

def fullNames(cluster):
    """
    Print the details stored for every firm of a cluster.

    Input:  cluster - an iterable of symbols, each of which must be a
                      key of the global namesDict
    Output: None - the namesDict entry of each symbol is printed on its
            own line
    """
    for symbol in cluster:
        details = namesDict[symbol]
        print(details)

{'GS'}
{'GS', 'MS'}
{'C', 'WFC', 'STI', 'JPM', 'KEY', 'BAC', 'RF', 'GS', 'CMA', 'FITB', 'MS', 'BBT', 'PNC', 'HBAN', 'USB', 'ZION'}
{'LNC', 'SCHW', 'TRV', 'CB', 'KEY', 'BAC', 'AFL', 'GS', 'PBCT', 'CMA', 'PFG', 'FITB', 'CINF', 'TMK', 'MS', 'MTB', 'STT', 'PNC', 'HBAN', 'MET', 'PRU', 'C', 'MMC', 'WFC', 'ETFC', 'STI', 'BRK-B', 'JPM', 'BK', 'UNM', 'NTRS', 'RF', 'AMP', 'BBT', 'USB', 'AON', 'ZION'}
{'LNC', 'SCHW', 'V', 'CB', 'MMM', 'KEY', 'BAC', 'AFL', 'XL', 'NOC', 'PBCT', 'CMA', 'AMG', 'FITB', 'TMK', 'MS', 'MTB', 'PNC', 'HBAN', 'PRU', 'PH', 'HON', 'FISV', 'WFC', 'ETFC', 'STI', 'ADP', 'LMT', 'LM', 'BK', 'TROW', 'NTRS', 'RF', 'DHR', 'USB', 'IVZ', 'L', 'TRV', 'GS', 'PFG', 'CINF', 'BEN', 'SWK', 'STT', 'AJG', 'AIG', 'MET', 'PAYX', 'MA', 'ROP', 'C', 'MMC', 'JNJ', 'BRK-B', 'JPM', 'AME', 'PGR', 'UNM', 'AMP', 'HIG', 'ITW', 'BLK', 'BBT', 'SNA', 'ETN', 'AON', 'ZION'}


The greedy algorithm used is single-linkage clustering, one of the many methods of hierarchical clustering. Single-linkage clustering is agglomerative: it starts with each firm in a cluster of its own and then combines two clusters at each iteration of the algorithm. At each step it merges the two clusters that are 'closest' to each other, i.e. the two clusters that contain the most highly correlated pair of firms not yet in the same cluster.

This method is also known as nearest-neighbour clustering. The same greedy 'nearest neighbour' idea is used as a heuristic for the travelling salesman problem (TSP). In TSP, a salesman wants to travel to N cities that are connected to each other. The connections between cities have weights (e.g. costs or distances) attached to them and the salesman wants to minimize the total cost incurred or total distance travelled, i.e. the salesman is also looking for cities that are 'close to' each other.


I ran the algorithm for different values of k to see what results we get.

K = 5
{'JCI', 'TYC'} //
 {'GOOG', 'GOOGL'} //
 {'FOX', 'FOXA'} //
 {'NWS', 'NWSA'} //
 {'DISCA', 'DISCK'} //

K = 10
{'JCI', 'TYC'} //
 {'GOOG', 'GOOGL'} //
 {'FOX', 'FOXA'} //
 {'NWS', 'NWSA'} //
 {'DISCA', 'DISCK'} //
 {'BBT', 'HBAN', 'PNC', 'STI'} //
 {'CMS', 'DTE'} //
 {'LNC', 'MET'} //

K = 30
{'JCI', 'TYC'} - Tyco International (Industrials), Johnson Controls (Consumer Discretionary) //
 {'GOOG', 'GOOGL'} //
 {'FOX', 'FOXA'} //
 {'NWS', 'NWSA'} //
 {'DISCA', 'DISCK'} //
 {'BAC', 'C', 'JPM'} //
 {'LNC', 'MET', 'PRU'} //
 {'CMS', 'DTE', 'XEL'} //
 {'BHI', 'HAL'} //
 {'AVB', 'EQR', 'ESS', 'UDR'} //
 {'BBT', 'HBAN', 'KEY', 'PNC', 'STI', 'USB', 'WFC'} //
 {'CMA', 'ZION'} //

K = 50
{'JCI', 'TYC'} - Tyco International (Industrials), Johnson Controls (Consumer Discretionary) //
 {'GOOG', 'GOOGL'} //
 {'FOX', 'FOXA'} //
 {'NWS', 'NWSA'} //
 {'DISCA', 'DISCK'} //
 {'LNC', 'MET', 'PRU'} //
 {'BHI', 'HAL'} //
 {'AVB', 'EQR', 'ESS', 'UDR'} //
 {'CMS', 'DTE', 'ES', 'PNW', 'WEC', 'XEL'} //
 {'BAC', 'BBT', 'C', 'CMA', 'HBAN', 'JPM', 'KEY', 'PNC', 'RF', 'STI', 'USB', 'WFC', 'ZION'} //
 {'AEE', 'LNT'} //
 {'GS', 'MS'} //
 {'SLG', 'VNO'} //

K = 100
{'JCI', 'TYC'} - Tyco International (Industrials), Johnson Controls (Consumer Discretionary) //
 {'GOOG', 'GOOGL'} //
 {'FOX', 'FOXA'} //
 {'NWS', 'NWSA'} //
 {'DISCA', 'DISCK'} //
 {'BHI', 'HAL'} - Baker Hughes (Energy), Halliburton (Energy) //
 {'AVB', 'EQR', 'ESS', 'UDR'} - AvalonBay Communities, Equity Residential, Essex Property Trust, United Dominion Realty Trust //
 {'BXP', 'FRT', 'KIM', 'SLG', 'VNO'} - Boston Properties, Federal Realty Investment Trust, Kimco Realty, SL Green Realty, Vornado Realty Trust {AIV, AVB, EQR, ESS, GGP, SPG, UDR} – Apartment Investment & Mgmt, AvalonBay Communities, Equity Residential, Essex Property Trust, General Growth Properties Inc, Simon Property Group Inc, United Dominion Realty Trust Inc //
 {'AEE', 'AEP', 'CMS', 'D', 'DTE', 'DUK', 'ED', 'ES', 'LNT', 'PNW', 'SCG', 'SO', 'WEC', 'XEL'} – Ameren Corp, American Electric Power, CMS Energy, Dominion Resources, DTE Energy Co., Duke Energy, Consolidated Edison, Eversource Energy, Alliant Energy Corp, Pinnacle West Capital, SCANA Corp, Southern Co., Wisconsin Energy Corporation, Xcel Energy Inc {EIX, PCG} – Edison Intl, PG&E Corp //
 {'BK', 'NTRS', 'STT'} - The Bank of New York Mellon Corp (banking and financial services corporation), Northern Trust Corp (wealth and asset management), State Street Corp (investment management) //
 {'LNC', 'MET', 'PRU', 'UNM'} – Lincoln National (insurance and investment management), MetLife Inc, Prudential Financial, Unum Group //
 {'BAC', 'BBT', 'C', 'CMA', 'FITB', 'GS', 'HBAN', 'JPM', 'KEY', 'MS', 'PNC', 'RF', 'STI', 'USB', 'WFC', 'ZION'} - Bank of America Corp, BB&T Corporation (Financial Services holding company), Citigroup Inc, Comerica Inc (financial services company), Fifth Third Bancorp, Goldman Sachs Group, Huntington Bancshares (American bank holding company), JPMorgan Chase & Co., KeyCorp (American regional bank), Morgan Stanley, PNC Financial Services, Regions Financial Corp., SunTrust Banks, U.S. Bancorp, Wells Fargo (American international banking and financial services holding company), Zions Bancorp {BK, LNC, MET, MTB, NTRS, PBCT, PRU, STT, UNM} – The Bank of New York Mellon Corp., Lincoln National, MetLife Inc, M&T Bank Corp, Northern Trust Corp, People’s United Financial, Prudential Financial, State Street Corp., Unum Group //
 {'DHI', 'LEN'}  - D.R. Horton (home construction company), Lennar Corp (homebuilder) //

K = 200
{'JCI', 'TYC'} //
 {'GOOG', 'GOOGL'} //
 {'FOX', 'FOXA'} //
 {'NWS', 'NWSA'} //
 {'DISCA', 'DISCK'} //
 {'BHI', 'HAL'} //
 {'BAC', 'BBT', 'BK', 'C', 'CMA', 'FITB', 'GS', 'HBAN', 'JPM', 'KEY', 'LNC', 'MET', 'MS', 'MTB', 'NTRS', 'PBCT', 'PNC', 'PRU', 'RF', 'STI', 'STT', 'UNM', 'USB', 'WFC', 'ZION'} //
 {'AON', 'MMC'} - Aon plc, Marsh & McLennan //
 {'CB', 'TRV'} - Chubb Limited (publicly traded property and casualty insurer), The Travelers Companies Inc (second largest writer of U.S. commercial property casualty insurance) //
 {'AIV', 'AVB', 'BXP', 'EQR', 'ESS','FRT','GGP', 'KIM', 'SLG', 'SPG',  'UDR',  'VNO'} //
 {'DHI', 'LEN', 'PHM'} - D.R. Horton (home construction company), Lennar Corp (homebuilder), Pulte Homes Inc. //
 {'HCN', 'HCP'} - Welltower Inc (real estate investment trust), HCP Inc (Real estate investment trust) //
 {'COP', 'CVX', 'XOM'} - ConocoPhillips (Energy), Chevron Corp, Exxon Mobil Corp //
 {'BEN', 'TROW'} - Franklin Resources (American holding company/global investment firm), T. Rowe Price Group (American publicly owned investment firm) //
 {'AEE', 'AEP', 'CMS', 'D', 'DTE', 'DUK',  'ED', 'EIX',  'ES',  'LNT',  'PCG',  'PNW', 'SCG',  'SO',  'WEC',  'XEL'} //


With an increase in k, we see more elements being added to existing clusters and more new clusters being formed. After a certain value of k, we also see clusters becoming more ‘broad’ – for example, what was initially two clusters – insurance and banking – becomes a general ‘finance’ one. Firms within a certain sub-industry, like insurance, are highly likely to have strong correlations with others engaging in similar activities, and slightly lower correlations with firms in a different sub-industry.