In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Display the output of every top-level expression in a cell, not only the
# last one. This is a session-wide IPython setting, so it applies to every
# cell executed after this one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the raw inputs: the company fundamentals table and the securities list.
nyse = pd.read_csv('../input/fundamentals-df/fundamentals.csv')
securities = pd.read_csv('../input/nyse-clustering/securities.csv')

In [None]:
nyse.head(5)

In [None]:
nyse = nyse.drop('Unnamed: 0', axis = 1)

In [None]:
nyse.describe().T

In [None]:
pd.options.display.max_rows = 20

In [None]:
nyse.isna().sum()

In [None]:
nyse[nyse.columns[nyse.isna().any()]].isna().sum()

In [None]:
# Normalise the column names by stripping every run of non-alphanumeric
# characters (spaces, '&', '.', ...), so 'A B & C' would become 'ABC'.
import re
non_alnum = re.compile('[^A-Za-z0-9]+')
nyse_re = {name: non_alnum.sub('', name) for name in nyse.columns.values}
nyse.rename(columns=nyse_re, inplace=True)

In [None]:
nyse[nyse.columns[nyse.isna().any()]].isna().sum()

In [None]:
nyse['PeriodEnding'] = pd.to_datetime(nyse['PeriodEnding'])
nyse['ForYear'] = nyse['PeriodEnding'].dt.year.astype('category')

In [None]:
# Columns to keep for the clustering analysis.
# The names must match the *cleaned* headers: the rename step strips every
# non-alphanumeric character, so no column can contain '&' at this point.
# The balance-sheet total is therefore 'TotalLiabilitiesEquity'; spelling it
# with '&' would never match and the column would silently be dropped.
fwm = [
    "TickerSymbol",
    "ForYear",
    "AccountsPayable",
    "AccountsReceivable",
    "GrossProfit",
    "Liabilities",
    "NetCashFlow",
    "OperatingIncome",
    "TotalAssets",
    "TotalEquity",
    "TotalLiabilities",
    "TotalLiabilitiesEquity",
    "TotalRevenue",
    "EarningsPerShare",
]
# Every column that is not on the keep-list above.
to_drop = [x for x in nyse.columns.values if x not in fwm]
to_drop

In [None]:
nyse.drop(columns = to_drop,inplace = True)

In [None]:
# Reducing the data set to some of the important variables
nyse.head(5)

In [None]:
nyse.shape

In [None]:
len(nyse['TickerSymbol'].unique())

In [None]:
nyse_grp = nyse.groupby(['TickerSymbol'])
nyse_grp

In [None]:
nyse_agg = nyse_grp.agg(np.nanmean)
nyse_agg.head()

In [None]:
nyse_fn = nyse_agg.copy()

In [None]:
nyse_fn['TickerSymbol'] = nyse_fn.index
nyse_fn.head()

In [None]:
# Scatter plots for visualising the per-ticker averages, one panel per metric.

fig, ax = plt.subplots(1, 6, figsize=(16, 6))
metrics = [
    'AccountsPayable',
    'AccountsReceivable',
    'GrossProfit',
    'TotalRevenue',
    'EarningsPerShare',
    'OperatingIncome',
]
for axis, metric in zip(ax, metrics):
    _ = sns.scatterplot(data=nyse_fn, x='TickerSymbol', y=metric, ax=axis)

In [None]:
import plotly.express as px

In [None]:
fig = px.density_heatmap(
                         nyse_fn,
                         x="TickerSymbol",
                         y="EarningsPerShare"
                         )
fig.show()

In [None]:
fig = px.density_heatmap(
                         nyse_fn,
                         x="TickerSymbol",
                         y="TotalRevenue",
                         )
fig.show()

In [None]:
fig = px.density_heatmap(
                         nyse_fn,
                         x="TickerSymbol",
                         y="OperatingIncome",
                         )
fig.show()

In [None]:
fig = px.density_heatmap(
                         nyse_fn,
                         x="TickerSymbol",
                         y="TotalRevenue",
                         )
fig.show()

Identifying the number of clusters
Here we use k-means clustering, for which the number of clusters must be chosen in advance. To choose it, we can draw a scree (elbow) plot or inspect the silhouette scores.

In [None]:
# Build the standardised feature matrix used to pick the number of clusters.

# The ticker is an identifier, not a feature (it remains available as the
# index). errors='ignore' keeps the cell re-runnable once the column is gone.
nyse_fn = nyse_fn.drop(columns=['TickerSymbol'], errors='ignore')

# Columns containing at least one missing value are treated as targets (y);
# rows are split into incomplete ones (fn_pred) and complete ones (fn_prem).
y_col = nyse_fn.columns[nyse_fn.isna().any(axis=0)]
has_missing = nyse_fn.isna().any(axis=1)
fn_pred = nyse_fn[has_missing]
fn_prem = nyse_fn[~has_missing]
y = fn_prem[y_col]

# DataFrame.drop returns a new frame, so the result has to be kept;
# otherwise the target columns would stay in the feature matrix.
X_features = fn_prem.drop(columns=y_col)

ss = StandardScaler()
X = ss.fit_transform(X_features)

# Fixed seed so the train/test partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit k-means for k = 1..10 and record, for each k, the inertia (within-cluster
# sum of squares) and -- for k >= 2 -- the average silhouette score.
dist = []
sil_sc = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, random_state=42)  # seeded for reproducible curves
    km.fit(X_train)
    dist.append(km.inertia_)
    if i > 1:  # the silhouette score needs at least two clusters
        sil_sc.append(silhouette_score(X_train, km.labels_))

# Scree (elbow) plot: inertia against the number of clusters.
plt.title('Scree Plot')
plt.plot(range(1, 11), dist, marker='*')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

for i in range(0, 9):
    print('\nFor ', (i + 2), ' clusters avg silhoutte score is #', sil_sc[i])

# Silhouette plot: sil_sc[0] belongs to k = 2, so the x axis runs from 2 to 10.
plt.plot(range(2, 11), sil_sc, marker='*')
plt.title('Silhouette Plot')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

The number of clusters we choose is 4.

In [None]:
# Fit k-means with the chosen number of clusters and plot the members and the
# centroid of each cluster in the plane of the first two standardised features.
n_clusters = 4
colors = ['red', 'blue', 'orange', 'black']  # one colour per cluster
km = KMeans(n_clusters=n_clusters, random_state=42)  # seeded for reproducibility
clusters = km.fit(X_train)
labels = clusters.labels_
ctr = clusters.cluster_centers_

fig = plt.figure(figsize=(10, 10))
fig.suptitle('K-mean clusters formed with n_cluster = 4', fontsize=16)
# Iterate over cluster ids (not over the number of samples).
for k, col in zip(range(n_clusters), colors):
    my_members = (labels == k)
    cluster_center = ctr[k]
    # Markers only: a connecting line between samples carries no meaning and
    # would be drawn over the markers of previously plotted clusters.
    plt.plot(X_train[my_members, 0], X_train[my_members, 1], color='w',
             linestyle='none', markerfacecolor=col, marker='.')
    plt.plot(cluster_center[0], cluster_center[1], '*', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.xlabel('Standardised feature 0')
plt.ylabel('Standardised feature 1')
plt.show()

In [None]:
nyse_fn.isna().sum()

In [None]:
nyse_fn = nyse_fn.dropna(axis=1)

In [None]:
# Cluster the companies with k-means and store the label of each company in
# the 'cluster' column.
# Label columns written by this notebook ('cluster', 'cid') are excluded from
# the features so that re-running the cell does not feed earlier labels back
# into the model.
# NOTE(review): this uses 5 clusters on unscaled values, whereas the text above
# settles on 4 clusters chosen on standardised data -- confirm this is intended.
kmeans_features = nyse_fn.drop(columns=['cluster', 'cid'], errors='ignore')
km = KMeans(n_clusters=5, random_state=42)  # seeded so labels are reproducible
clusters = km.fit_predict(kmeans_features)
nyse_fn['cluster'] = clusters
nyse_fn.head(2)

In [None]:
nyse_fn['cluster'].value_counts()

In [None]:
nyse_fn.groupby('cluster')['TotalRevenue'].mean().plot(kind = 'barh')

In [None]:
nyse_fn.groupby('cluster')['OperatingIncome'].mean().plot(kind = 'barh')

In [None]:
nyse_fn.groupby('cluster')['GrossProfit'].mean().plot(kind = 'barh')

## Fitting a Gaussian mixture model

In [None]:
from sklearn.mixture import GaussianMixture

gm4 = GaussianMixture(n_components=4)

In [None]:
gm4 = GaussianMixture(
                     n_components = 4,
                     n_init = 200,  # The number of initializations 
                                   # to perform. The best results are kept.
                     max_iter = 400,
                     covariance_type= 'tied'
                     )  # 'tied', 'diag', 'full', 


In [None]:
gm4.fit(nyse_fn)

In [None]:
print("\n-----cluster means----\n")
gm4.means_
print("\n-----Did it converge?----\n")
gm4.converged_
print("\n-----How many iteratons?----\n")
gm4.n_iter_

In [None]:
silhouette_avg = silhouette_score(
                                  X = nyse_fn,
                                  metric = 'mahalanobis',
                                  labels =gm4.predict(nyse_fn)
                                  )
# 21.1
print("Silhoutte score as per Mahalanobis distance: ",silhouette_avg)

In [None]:
cid = gm4.predict(nyse_fn)

In [None]:
nyse_fn['cid'] = cid
nyse_fn.head()
nyse_fn['cid'].value_counts()

In [None]:
# Distribution of three key metrics within each GMM component, one panel each.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))
for axis, metric in zip(ax, ('TotalRevenue', 'GrossProfit', 'OperatingIncome')):
    sns.boxplot(data=nyse_fn, x='cid', y=metric, ax=axis)

In [None]:
# Component means, shape (n_cluster_components, n_features).
gm4.means_
print()
# Transposed view, shape (n_features, n_cluster_components).
gm4.means_.transpose()
print('\n')
# Correlation between features, computed across the component means
# (each column of means_ is treated as one variable).
np.corrcoef(gm4.means_, rowvar=False)

## Comparing clusters from KMeans and GMM

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Same scatter plot twice, coloured by a different labelling in each panel:
# left = GMM component ('cid'), right = k-means cluster ('cluster').
for axis, label_col in zip(ax, ('cid', 'cluster')):
    sns.scatterplot(
        data=nyse_fn,
        x='TickerSymbol',
        y='TotalRevenue',
        hue=label_col,
        palette="deep",
        ax=axis,
    )