In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide display setting: with "all", IPython echoes the value of every
# bare expression statement in a cell rather than only the last one. Several
# later cells rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read the raw fundamentals table and list its original column names.
fundamentals_csv_path = '../input/fundamentals-df/fundamentals.csv'
fundamentals_df = pd.read_csv(fundamentals_csv_path)
fundamentals_df.columns

In [None]:
# Normalise the column names so they are valid, predictable identifiers:
#   1. strip quotes, slashes, dots and commas;
#   2. replace spaces with underscores.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would leave the punctuation in place.
fundamentals_df.columns = (
    fundamentals_df.columns
    .str.replace(r"[\"\/\.\',]", '', regex=True)
    .str.replace(" ", "_", regex=False)
)

# The ticker is a label, not a numeric feature; store it as a categorical.
fundamentals_df['Ticker_Symbol'] = fundamentals_df['Ticker_Symbol'].astype("category")

# Drop the 'Unnamed:_0' and 'For_Year' columns, which are not used in the
# rest of the analysis.
fundamentals_df.drop(columns=['Unnamed:_0', 'For_Year'], inplace=True)

In [None]:
# Average every numeric fundamental per ticker.
# `numeric_only=True` is needed because the frame still holds the string
# column Period_Ending at this point; pandas >= 2.0 raises instead of silently
# skipping non-numeric columns. `observed=True` keeps only tickers that
# actually occur in the data.
ticker_group = (
    fundamentals_df
    .groupby("Ticker_Symbol", observed=True)
    .mean(numeric_only=True)
    .reset_index()
)

# Pairwise correlation between the averaged numeric fundamentals. The
# categorical Ticker_Symbol column is excluded explicitly so that `corr`
# behaves the same on old and new pandas versions.
x = ticker_group.select_dtypes(include="number").corr()


In [None]:
# Joint plot of the per-ticker mean Profit_Margin against Ticker_Symbol.
sns.jointplot(data=ticker_group, x='Ticker_Symbol', y='Profit_Margin')

# Removing NaNs from the data

In [None]:
# Collect every row that has at least one missing value so the gaps can be
# inspected. Nothing is removed from fundamentals_df here.
is_NaN = fundamentals_df.isna()
row_has_NaN = is_NaN.any(axis="columns")
rows_with_NaN = fundamentals_df.loc[row_has_NaN]
rows_with_NaN


In [None]:
# For each column, print whether any of the incomplete rows is missing it.
for column_name, has_missing in rows_with_NaN.isna().any().items():
    print(column_name, has_missing)

In [None]:
# Earnings per share, estimated shares outstanding and the cash/quick/current
# ratios are missing for many rows, so these columns are removed rather than
# imputed.
sparse_columns = [
    'Earnings_Per_Share',
    'Estimated_Shares_Outstanding',
    'Cash_Ratio',
    'Quick_Ratio',
    'Current_Ratio',
]
fundamentals_df.drop(columns=sparse_columns, inplace=True)

# Standardizing the data and applying KMeans clustering

In [None]:
# Replace Period_Ending with Year_Ending: keep only the text before the first
# '-' (presumably the year -- confirm against the raw date format).
# `pop` removes the original column from the frame and returns it.
period_ending = fundamentals_df.pop('Period_Ending')
fundamentals_df['Year_Ending'] = period_ending.str.split('-', n=1).str[0]

In [None]:
# Separate the ticker label (Y) from the feature columns (x).
Y = fundamentals_df['Ticker_Symbol']
x = fundamentals_df.loc[:, fundamentals_df.columns != 'Ticker_Symbol']

# Standardise every feature, then wrap the resulting array back into a
# DataFrame that carries the original feature names.
scaler = StandardScaler().fit(x)
x_scale = scaler.transform(x)
X = pd.DataFrame(x_scale, columns=x.columns)
X

In [None]:
# Fit KMeans for k = 2..9 on the standardised features and record, for each k,
# the inertia (within-cluster sum of squares) and the silhouette score.
inertia = []
s_score = []

for n_clusters in range(2, 10):
    # Fixed random_state makes the sweep reproducible across runs.
    # n_init is set explicitly because its default differs between
    # scikit-learn versions.
    model = KMeans(n_clusters=n_clusters, n_init=10, max_iter=300, random_state=42)
    model.fit(X)
    inertia.append(model.inertia_)

    # sample_size caps how many rows are used for the silhouette computation.
    # The subsample is drawn at random, so it is seeded as well.
    t = silhouette_score(X.values, model.labels_, sample_size=10000, random_state=42)
    s_score.append(t)

    # Report only this iteration's score instead of the whole growing list.
    print(f"k={n_clusters}: silhouette={t:.4f}")
  

In [None]:
import matplotlib.pyplot as plt

inertia

# Elbow plot. The KMeans sweep starts at k=2, so the x-axis is built from that
# offset; plotting the bare list would label the points 0..7 instead of the
# real cluster counts.
k_values = range(2, 2 + len(inertia))
fig, ax = plt.subplots()
ax.plot(k_values, inertia, marker="o")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia")
ax.set_title("KMeans inertia by cluster count")
plt.show()

In [None]:
s_score

# Silhouette score per cluster count. The KMeans sweep starts at k=2, so the
# x-axis is built from that offset rather than from the list index.
k_values = range(2, 2 + len(s_score))
fig, ax = plt.subplots()
ax.plot(k_values, s_score, marker="o")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Silhouette score")
ax.set_title("KMeans silhouette score by cluster count")
plt.show()

# Performing Gaussian Mixture Modelling on the data

In [None]:
from sklearn.mixture import GaussianMixture

# Fit a 4-component Gaussian mixture on the standardised features.
# n_init=30 restarts the fit from 30 initialisations and keeps the best one.
# random_state fixes those initialisations so that the fitted model, and the
# cluster ids derived from it in later cells, are reproducible.
gm = GaussianMixture(
                     n_components = 4,
                     n_init = 30,
                     max_iter = 200,
                     random_state = 42)


gm.fit(X)

# Cluster centres: one row of feature means per mixture component.
print("\n-----cluster means----\n")
gm.means_

# Whether the best initialisation converged.
print("\n-----Did it converge?----\n")
gm.converged_

# Number of steps taken by the best initialisation.
print("\n-----How many iteratons?----\n")
gm.n_iter_

In [None]:
# Silhouette score of the Gaussian-mixture assignment over all rows of X.
gm_labels = gm.predict(X)
silhouette_avg = silhouette_score(X, gm_labels)
silhouette_avg

In [None]:
# Attach each row's predicted mixture component to the fundamentals table and
# show it next to the ticker.
cid = gm.predict(X)
fundamentals_df['cid'] = cid
fundamentals_df.loc[:, ['Ticker_Symbol', 'cid']]

In [None]:
# Count how many rows fall into each (Ticker_Symbol, cid) combination.
ticker_cluster_pairs = fundamentals_df.loc[:, ['Ticker_Symbol', 'cid']]
count_df = ticker_cluster_pairs.value_counts()


In [None]:
# Display the (Ticker_Symbol, cid) counts held in count_df.
count_df