In [168]:
import pandas as pd # for dataframes manipulation
import numpy as np
import matplotlib.pyplot as plt
import csv

# Import the data
firms = pd.read_csv('SP_500_firms.csv', index_col = 0)
close_prices = pd.read_csv('SP_500_close_2015.csv')

firms_small = firms.iloc[:504, :]
close_prices_small = close_prices.iloc[:, :497]

#print(firms_small[:5])
#print(close_prices_small[:5])

In [45]:
def StockReturns(cl_p, Date = True):
    """
    Input:  The dataframe with the prices of the stocks
            A logical argument that equals True if the first column
            in the dataframe represents dates, False otherwise
    Output: A dataframe with the daily returns of the stocks - same 
            number of columns as the input vector and one row less 
            than the input vector
    """    
    if Date:
        d_ret = pd.DataFrame(cl_p.iloc[1:, 0], columns=['Date'])
        j = 1
    else:
        d_ret = pd.DataFrame()
        j = 0
    for i in range(j, cl_p.shape[1]):
        d_ret[cl_p.columns[i]] = (cl_p.iloc[1:,i].values - cl_p.iloc[:-1,i].values) / cl_p.iloc[:-1,i].values
    return d_ret
    
    
# Daily returns including the Date column    
daily_returns = StockReturns(close_prices_small) 
#print(daily_returns.iloc[:5, :5])

# Daily returns excluding the Date column
daily_returns_2 = StockReturns(close_prices_small.iloc[:, 1:], Date = False)
#print(daily_returns_2.iloc[:5, :5])   

In [35]:
def Correlations(d_ret):
    """
    Input:  A dataframe with the daily returns of the stocks - The 
            first column can either indicate dates or not
    Output: A list of tuples. Each tuple in the list have 3 elements:
            1. The correlation between two firms
            2 and 3. The firms for which we compute the correlation
    """
    cor = d_ret.corr()
    n = int(cor.shape[0])
    cor_list = []
    for i in range(1, n):
        for j in range(0, i):
            cor_list.append((cor.iloc[i, j], cor.columns.values[i], cor.columns.values[j]))
    return cor_list
    

correlation_list = Correlations(daily_returns)
#print(correlation_list[:5])

In [147]:
def SortCorrs(cor_list): 
    """
    Input:  A list of tuples. Each tuple in the list have 3 elements:
            1. The correlation between two firms
            2 and 3. The firms for which we compute the correlation
    Output: Returns the same list of tuples ordered based on the
            first element of the tuples, e.g. the correlation
    """
    return sorted(cor_list, reverse = True)
    

ordered_list = SortCorrs(correlation_list)
#print(ordered_list[:5])


In [148]:
def clusteringAlg(ord_list, k = 0):
    """
    Input:  
    ord_list: The ordered list of tuples which include the
              correlations between firms and the firms themselves
    k: The number of iterations for the clustering algorithm
    Output: 
    A list of sets where each set represents an individual 
    cluster
    """
    # Initialize the list of sets. Each set represents a cluster
    # which initialy includes only one firm
    sets = []
    for i in range(len(ord_list)):
        if not({ord_list[i][1]} in sets):
            sets.append({ord_list[i][1]})
        if not({ord_list[i][2]} in sets):
            sets.append({ord_list[i][2]})     
    # Repeat the algorithm k times
    # In each iteration we check the k-th tuple of correlations list
    # and whether the 2 firms in that tuple are already in the same
    # set. If they do, we move on to the next tuple, otherwise we merge
    for j in range(min(k, len(ord_list))):
        nd1 = ord_list[j][1]
        nd2 = ord_list[j][2]
        fl1, fl2 = False, False    
        for i in range(len(sets)):
            if (nd1 in sets[i]) and fl1 == False:
                idx1 = i
                fl1 = True
            if (nd2 in sets[i]) and fl2 == False:
                idx2 = i
                fl2 = True
        if idx1 != idx2:
            sets[idx1] = sets[idx1].union(sets[idx2])
            sets.remove(sets[idx2])
    return sets

The greedy algorithm used is Single-linkage Clustering, one of the many methods under Hierarchical Clustering. Single-linkage clustering is based on agglomerative clustering. It involves starting with each firm in a cluster of its own and then combining two clusters at each iteration of the algorithm. It chooses the two clusters that are 'close to', i.e. highly correlated with each other. 

This method is also known as Nearest Neighbour Clustering which is used in the traveling salesman problem. In TSP, a salesman wants to travel to N number of cities that are connected to each other. The connections between cities have weights (e.g. costs/distance) attached to them and the salesman wants to minimize the total cost incurred or total distance travelled, i.e. the salesman is also looking for cities that are 'close to' each other.

In [187]:
def getStockDetails(ticker, clusterSets, firmDF):
    # if the ticker given is in a single set cluster
    if {ticker} in clusterSets:
        return firmDF.loc[ticker, :]
    else:
        for i in range(len(clusterSets)):
            if ticker in clusterSets[i]:
                return firmDF.loc[clusterSets[i], :].sort_values('Sector')

In [196]:
getStockDetails("BAC", clusteringAlg(ordered_list,30), firms)

Unnamed: 0_level_0,Name,Sector
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
JPM,JPMorgan Chase & Co.,Financials
BAC,Bank of America Corp,Financials
C,Citigroup Inc.,Financials


In [193]:
getStockDetails("BAC", clusteringAlg(ordered_list,50), firms)

Unnamed: 0_level_0,Name,Sector
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
RF,Regions Financial Corp.,Financials
CMA,Comerica Inc.,Financials
C,Citigroup Inc.,Financials
WFC,Wells Fargo,Financials
STI,SunTrust Banks,Financials
JPM,JPMorgan Chase & Co.,Financials
BBT,BB&T Corporation,Financials
KEY,KeyCorp,Financials
HBAN,Huntington Bancshares,Financials
USB,U.S. Bancorp,Financials


In [194]:
getStockDetails("BAC", clusteringAlg(ordered_list,100), firms)

Unnamed: 0_level_0,Name,Sector
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
C,Citigroup Inc.,Financials
WFC,Wells Fargo,Financials
STI,SunTrust Banks,Financials
JPM,JPMorgan Chase & Co.,Financials
KEY,KeyCorp,Financials
BAC,Bank of America Corp,Financials
RF,Regions Financial Corp.,Financials
GS,Goldman Sachs Group,Financials
CMA,Comerica Inc.,Financials
FITB,Fifth Third Bancorp,Financials


In [197]:
getStockDetails("BAC", clusteringAlg(ordered_list,500), firms)

Unnamed: 0_level_0,Name,Sector
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
LNC,Lincoln National,Financials
PRU,Prudential Financial,Financials
C,Citigroup Inc.,Financials
MMC,Marsh & McLennan,Financials
WFC,Wells Fargo,Financials
ETFC,E*Trade,Financials
STI,SunTrust Banks,Financials
BRK-B,Berkshire Hathaway,Financials
JPM,JPMorgan Chase & Co.,Financials
BK,The Bank of New York Mellon Corp.,Financials


In [198]:
getStockDetails("BAC", clusteringAlg(ordered_list,1000), firms)

Unnamed: 0_level_0,Name,Sector
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
SNA,Snap-On Inc.,Consumer Discretionary
SWK,Stanley Black & Decker,Consumer Discretionary
LNC,Lincoln National,Financials
RF,Regions Financial Corp.,Financials
AON,Aon plc,Financials
USB,U.S. Bancorp,Financials
IVZ,Invesco Ltd.,Financials
L,Loews Corp.,Financials
TRV,The Travelers Companies Inc.,Financials
GS,Goldman Sachs Group,Financials


After running the algorithm for increasing values of k, it is observed that, with an increase in k, more elements are added to existing clusters and more new clusters are formed. Clusters are formed with stocks belonging to firms within the same indsutry. However, after a certain value of k, we see clusters merging and becoming more 'broad' – for example, what was initially two clusters – insurance and banking – becomes a general ‘finance’ cluster. 

In the above tables, Bank of America Merrill Lynch was tracked for different values of k to see how the composition of its cluster changes with an increase in k. Until k = 500, we can see only companies from the Financial sector within its cluster. For k = 1000, firms from the Consumer Discretionary, Industrial and Information Technology sector join the sector. This may happen because one or two stocks from these indsutries may have a lower correlation with finance companies.