Importing libraries and reading CSV files with pandas

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from plotly.offline import init_notebook_mode,iplot
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.express as px
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS
# Notebook-wide setup: plotly offline rendering plus pandas display/warning options.
init_notebook_mode(connected=True)
pd.options.display.max_columns = None            # show every column of wide frames
pd.set_option('mode.chained_assignment', None)   # silence SettingWithCopyWarning

# Load the two Morningstar extracts from the ../input directory.
etfs_df = pd.read_csv('../input/european-funds-dataset-from-morningstar/Morningstar - European ETFs.csv')
mutual_funds_df = pd.read_csv('../input/european-funds-dataset-from-morningstar/Morningstar - European Mutual Funds.csv')

In [None]:
!pip install seaborn --upgrade
sns.__version__
# need to install seaborn 0.11 to display a density plot that compares mutual funds and ETFs in the same graph

In [None]:
# Preview the first five mutual funds (head() defaults to n=5).
mutual_funds_df.head()

In [None]:
# Preview the first five ETFs (head() defaults to n=5).
etfs_df.head()

Histogram of funds by year of inception

In [None]:
# Keep only funds with a known inception date, parse it, and derive the inception year.
# Each frame is rebuilt with assign() so no column is written onto a filtered slice,
# and the year comes from the vectorized .dt accessor instead of a row-wise apply.
mutual_funds_df = (
    mutual_funds_df[mutual_funds_df['inception_date'].notnull()]
    .assign(inception_date=lambda funds: pd.to_datetime(funds['inception_date']))
    .assign(inception_year=lambda funds: funds['inception_date'].dt.year)
)

etfs_df = (
    etfs_df[etfs_df['inception_date'].notnull()]
    .assign(inception_date=lambda funds: pd.to_datetime(funds['inception_date']))
    .assign(inception_year=lambda funds: funds['inception_date'].dt.year)
)

In [None]:
# Overlay the inception-year distributions of both fund types in one figure.
fig = go.Figure()
for trace_name, funds in (('Mutual Funds', mutual_funds_df), ('ETFs', etfs_df)):
    fig.add_trace(go.Histogram(name=trace_name, x=funds['inception_year']))
# 2020.5 to leave enough room for the ETFs count in 2020
fig.update_xaxes(range=[1980, 2020.5])
fig.update_layout(
    title='Mutual Funds vs ETFs - Year of Inception',
    title_x=0.5,
    xaxis_title_text='Year of Inception',
    yaxis_title_text='Count of Funds',
)
fig.show()

Histogram of funds by management fees

In [None]:
# Management fee distributions, binned identically for both fund types.
fig = go.Figure()
for trace_name, funds in (('Mutual Funds', mutual_funds_df), ('ETFs', etfs_df)):
    fig.add_trace(go.Histogram(name=trace_name,
                               x=funds['management_fees'],
                               xbins=dict(start=0.1, end=2.1, size=0.05)))
fig.update_layout(
    title='Mutual Funds vs ETFs - Management Fees',
    title_x=0.5,
    xaxis_title_text='Management Fees',
    yaxis_title_text='Count of Funds',
)
fig.show()

Histogram of funds by sustainability score

In [None]:
# Sustainability score distributions, binned identically for both fund types.
fig = go.Figure()
for trace_name, funds in (('Mutual Funds', mutual_funds_df), ('ETFs', etfs_df)):
    fig.add_trace(go.Histogram(name=trace_name,
                               x=funds['sustainability_score'],
                               xbins=dict(start=10, end=61, size=1)))
fig.update_layout(
    title='Mutual Funds vs ETFs - Sustainability Score',
    title_x=0.5,
    xaxis_title_text='Sustainability Score',
    yaxis_title_text='Count of Funds',
)
fig.show()

Histogram of funds by YTD (Year-To-Date) return

In [None]:
# YTD return distributions, restricted to the [-25, 25] range in unit-wide bins.
fig = go.Figure()
for trace_name, funds in (('Mutual Funds', mutual_funds_df), ('ETFs', etfs_df)):
    fig.add_trace(go.Histogram(name=trace_name,
                               x=funds['fund_trailing_return_ytd'],
                               xbins=dict(start=-25, end=25, size=1)))
fig.update_layout(
    title='Mutual Funds vs ETFs - YTD Return',
    title_x=0.5,
    xaxis_title_text='YTD Return',
    yaxis_title_text='Count of Funds',
)
fig.show()

Histogram of funds by 10-year trailing return

In [None]:
# 10-year trailing return distributions, restricted to [-25, 25] in unit-wide bins.
fig = go.Figure()
for trace_name, funds in (('Mutual Funds', mutual_funds_df), ('ETFs', etfs_df)):
    fig.add_trace(go.Histogram(name=trace_name,
                               x=funds['fund_trailing_return_10years'],
                               xbins=dict(start=-25, end=25, size=1)))
fig.update_layout(
    title='Mutual Funds vs ETFs - Trailing Return 10years',
    title_x=0.5,
    xaxis_title_text='Trailing Return 10years',
    yaxis_title_text='Count of Funds',
)
fig.show()

Density plot of YTD (Year-To-Date) return based on year of inception - some plot features not available in the old seaborn v0.10

In [None]:
def returns_df(input_df, fund_category_name):
    """Return the return/sector columns of the funds that pass basic sanity filters.

    A row is kept when its inception date, YTD return and the three sector
    weights (financial services, healthcare, technology) are all present, the
    YTD return is below 100 and every sector weight lies within [0, 100].

    Parameters
    ----------
    input_df : pandas.DataFrame
        Fund data holding at least the columns selected below.
    fund_category_name : str
        Value written to the constant ``fund_type`` column of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns plus ``fund_type``; ``input_df``
        itself is not modified.
    """
    sector_cols = ['sector_financial_services', 'sector_healthcare', 'sector_technology']
    selected_cols = ['isin', 'inception_date', 'fund_trailing_return_ytd',
                     'fund_trailing_return_10years'] + sector_cols

    # Every field used by the filters must be present ...
    required_cols = ['inception_date', 'fund_trailing_return_ytd'] + sector_cols
    keep = input_df[required_cols].notnull().all(axis=1)
    # ... and hold a plausible value.
    keep = keep & (input_df['fund_trailing_return_ytd'] < 100)
    for sector_col in sector_cols:
        keep = keep & input_df[sector_col].between(0, 100)

    return input_df.loc[keep, selected_cols].assign(fund_type=fund_category_name)

# Label each subset so both fund types can be told apart after concatenation.
# The label ends up in the legend of the hue-based density plot, so mutual funds
# get a meaningful name (they were previously labelled 'Fund Name').
mutual_funds_returns_df = returns_df(mutual_funds_df, 'Mutual Fund')
etfs_returns_df = returns_df(etfs_df, 'ETF')

# Stack both subsets and index the result by ISIN.
fund_returns_df = pd.concat([mutual_funds_returns_df, etfs_returns_df], axis=0).set_index('isin')

# Make sure the date is a datetime and derive the inception year from it.
fund_returns_df['inception_date'] = pd.to_datetime(fund_returns_df['inception_date'])
fund_returns_df['inception_year'] = fund_returns_df['inception_date'].dt.year

# Coerce every column used on a plot axis to a numeric dtype.
for numeric_col in ['fund_trailing_return_ytd', 'fund_trailing_return_10years',
                    'sector_financial_services', 'sector_healthcare', 'sector_technology']:
    fund_returns_df[numeric_col] = pd.to_numeric(fund_returns_df[numeric_col])

In [None]:
# Joint KDE of inception year vs. YTD return, one density per fund type.
# seaborn v0.10 does not recognize the 'hue' parameter
kde_options = dict(kind='kde', hue='fund_type', height=12,
                   xlim=(1990, 2020), ylim=(-25, 25))
jplot = sns.jointplot(data=fund_returns_df,
                      x='inception_year',
                      y='fund_trailing_return_ytd',
                      **kde_options)
jplot.set_axis_labels('Inception date', 'YTD return')
plt.tight_layout()

Regression plot of YTD (Year-To-Date) return based on portfolio percentage in the sectors Financial Services, Healthcare, and Technology

In [None]:
# One regression panel per sector: portfolio percentage (x) against YTD return (y).
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 12))
sector_panels = [
    (ax[0], 'sector_financial_services', 'Financial Services'),
    (ax[1], 'sector_healthcare', 'Healthcare'),
    (ax[2], 'sector_technology', 'Technology'),
]
for panel, sector_col, sector_label in sector_panels:
    sns.regplot(data=fund_returns_df, x=sector_col, y='fund_trailing_return_ytd',
                truncate=True, ax=panel)
    panel.set(xlabel=sector_label, ylabel='YTD Return')

Regression plot of 10years trailing return based on portfolio percentage in the sectors Financial Services, Healthcare, and Technology

In [None]:
# One regression panel per sector: portfolio percentage (x) against 10-year trailing return (y).
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 12))
sector_panels = [
    (ax[0], 'sector_financial_services', 'Financial Services'),
    (ax[1], 'sector_healthcare', 'Healthcare'),
    (ax[2], 'sector_technology', 'Technology'),
]
for panel, sector_col, sector_label in sector_panels:
    sns.regplot(data=fund_returns_df, x=sector_col, y='fund_trailing_return_10years',
                truncate=True, ax=panel)
    panel.set(xlabel=sector_label, ylabel='10years Trailing Return')

Horizontal bar charts comparing the ten most common categories of Mutual Funds and ETFs

In [None]:
# Ten most frequent categories for each fund type (value_counts sorts descending).
mutual_funds_count = mutual_funds_df['category'].value_counts()[:10]
etfs_count = etfs_df['category'].value_counts()[:10]

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
# x and y must be passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally and raises a TypeError for positional x/y vectors.
sns.barplot(x=mutual_funds_count.values, y=mutual_funds_count.index, ax=ax[0])
sns.barplot(x=etfs_count.values, y=etfs_count.index, ax=ax[1])
fig.tight_layout(pad=3.0)
ax[0].set(xlabel='Count of Mutual Funds')
ax[0].set_title('Most common Mutual Fund categories', size=16)
ax[1].set(xlabel='Count of ETFs')
ax[1].set_title('Most common ETF categories', size=16)

Analysis of Morningstar ratings

In [None]:
# Numeric columns averaged per Morningstar rating in the cells below.
relevant_numeric_cols = (
    # ratings, fees and style scores
    ['rating', 'risk_rating', 'performance_rating', 'management_fees',
     'equity_style_score', 'equity_size_score']
    # asset allocation
    + ['asset_stock', 'asset_bond', 'asset_cash', 'asset_other']
    # sector weights
    + ['sector_basic_materials', 'sector_consumer_cyclical', 'sector_financial_services',
       'sector_real_estate', 'sector_consumer_defensive', 'sector_healthcare',
       'sector_utilities', 'sector_communication_services', 'sector_energy',
       'sector_industrials', 'sector_technology']
    # market capitalisation split and holdings counts
    + ['market_cap_giant', 'market_cap_large', 'market_cap_medium', 'market_cap_small',
       'market_cap_micro', 'holdings_n_stock', 'holdings_n_bonds']
    # sustainability metrics
    + ['sustainability_rank', 'environmental_score', 'social_score', 'governance_score',
       'sustainability_score']
    # trailing returns
    + ['fund_trailing_return_ytd', 'fund_trailing_return_3years',
       'fund_trailing_return_5years', 'fund_trailing_return_10years']
    # yearly returns, 2019 down to 2015
    + [f'fund_return_{year}' for year in range(2019, 2014, -1)]
    # quarterly returns, 2020 Q3 down to 2015 Q1 (there is no 2020 Q4 column)
    + [f'fund_return_{year}_q{quarter}'
       for year in range(2020, 2014, -1)
       for quarter in (4, 3, 2, 1)
       if (year, quarter) != (2020, 4)]
)

In [None]:
# Average every relevant numeric column per Morningstar rating ...
mutual_funds_ratings_df = mutual_funds_df.groupby('rating')[relevant_numeric_cols].mean()
# ... and put the number of funds per rating in front as the first column.
mutual_funds_ratings_df.insert(0, 'count', mutual_funds_df.groupby('rating').size())
mutual_funds_ratings_df.head()

In [None]:
# Average every relevant numeric column per Morningstar rating ...
etfs_ratings_df = etfs_df.groupby('rating')[relevant_numeric_cols].mean()
# ... and put the number of ETFs per rating in front as the first column.
etfs_ratings_df.insert(0, 'count', etfs_df.groupby('rating').size())
etfs_ratings_df.head()

In [None]:
# NOTE: the Plotly version below does not work on Kaggle; it is kept commented out for reference only.

# fig = plotly.subplots.make_subplots(rows=1, cols=3, subplot_titles=('Proportion', 'Management Fees', '10years Trailing Return'),
#                                    specs=[[{'type': 'pie'}, {'type': 'scatter'}, {'type': 'scatter'}]])
# fig.add_trace(go.Pie(labels=mutual_funds_ratings_df['rating'], values=mutual_funds_ratings_df['count'],
#                      name='Proportions', sort=False, showlegend=True,
#                      hovertemplate='%{label}:<br>Popularity: %{percent}'), row=1, col=1)
# fig.add_trace(go.Scatter(x=mutual_funds_ratings_df['rating'], y=mutual_funds_ratings_df['management_fees'],
#                          marker=dict(size=mutual_funds_ratings_df['count']/15, color=mutual_funds_ratings_df['rating']),
#                          mode="markers", showlegend=False, hovertemplate='%{x}:<br>%{y:$.3f}'), row=1, col=2)
# fig.add_trace(go.Scatter(x=mutual_funds_ratings_df['rating'], y=mutual_funds_ratings_df['fund_trailing_return_10years'],
#                          marker=dict(size=mutual_funds_ratings_df['count']/15, color=mutual_funds_ratings_df['rating']),
#                          mode="markers", showlegend=False, hovertemplate='%{x}:<br>%{y:$.3f}'), row=1, col=3)
# fig.update_layout(title='Morningstar ratings', title_x=0.5,
#                  legend=dict(yanchor='bottom', y=-0.15, xanchor='left', x=0,
#                              font=dict(size=8), orientation='h'))
# fig['layout']['xaxis']['title'] = 'Morningstar rating'
# fig['layout']['xaxis2']['title'] = 'Morningstar rating'
# fig.show()

In [None]:
# Three panels: share of funds per rating, then average fees and average
# 10-year trailing return per rating (marker size scales with the plotted value).
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 8))
fig.suptitle('Mutual Funds - Comparison of Fees and Returns based on Morningstar Ratings', fontsize=20)
pie_ax, fees_ax, returns_ax = ax
rating_values = mutual_funds_ratings_df['rating']

pie_ax.pie(mutual_funds_ratings_df['count'], labels=rating_values)
pie_ax.axis('equal')
pie_ax.set_title("Morningstar Ratings", fontsize=14)
pie_ax.legend(frameon=False, bbox_to_anchor=(0.1, 0.2))

scatter_panels = [
    (fees_ax, 'management_fees', 1e3,
     'Management Fees', "Management Fees based on Morningstar rating"),
    (returns_ax, 'fund_trailing_return_10years', 1e2,
     '10years Trailing Return', "10years Trailing Return based on Morningstar rating"),
]
for panel, value_col, size_scale, y_label, panel_title in scatter_panels:
    panel_values = mutual_funds_ratings_df[value_col]
    panel.scatter(rating_values, panel_values, s=size_scale*panel_values, c=rating_values)
    panel.set_xticks(rating_values)
    panel.margins(x=0.25)
    panel.set_xlabel('Morningstar Rating')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, fontsize=14)

In [None]:
# Three panels: share of ETFs per rating, then average fees and average
# 10-year trailing return per rating (marker size scales with the plotted value).
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 8))
fig.suptitle('ETFs - Comparison of Fees and Returns based on Morningstar Ratings', fontsize=20)
pie_ax, fees_ax, returns_ax = ax
rating_values = etfs_ratings_df['rating']

pie_ax.pie(etfs_ratings_df['count'], labels=rating_values)
pie_ax.axis('equal')
pie_ax.set_title("Morningstar Ratings", fontsize=14)
pie_ax.legend(frameon=False, bbox_to_anchor=(0.1, 0.2))

scatter_panels = [
    (fees_ax, 'management_fees', 1e3,
     'Management Fees', "Management Fees based on Morningstar rating"),
    (returns_ax, 'fund_trailing_return_10years', 1e2,
     '10years Trailing Return', "10years Trailing Return based on Morningstar rating"),
]
for panel, value_col, size_scale, y_label, panel_title in scatter_panels:
    panel_values = etfs_ratings_df[value_col]
    panel.scatter(rating_values, panel_values, s=size_scale*panel_values, c=rating_values)
    panel.set_xticks(rating_values)
    panel.margins(x=0.25)
    panel.set_xlabel('Morningstar Rating')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, fontsize=14)

Quarterly return trend by Morningstar rating

In [None]:
def create_date_field(y, q):
    """Build the 'YYYY-MM-DD' string of the last day of quarter ``q`` in year ``y``.

    ``y`` is truncated with ``int()`` before formatting, so float years are
    accepted. Any ``q`` other than 1, 2, 3 or 4 yields an empty string.
    """
    quarter_ends = ((4, '-12-31'), (3, '-09-30'), (2, '-06-30'), (1, '-03-31'))
    for quarter, month_day in quarter_ends:
        if q == quarter:
            return str(int(y)) + month_day
    # Unknown quarter: the year is deliberately left untouched.
    return ''

In [None]:
# Quarterly return columns from 2020 Q3 back to 2015 Q1 (there is no 2020 Q4 column).
quarterly_return_cols = [f'fund_return_{year}_q{quarter}'
                         for year in range(2020, 2014, -1)
                         for quarter in (4, 3, 2, 1)
                         if (year, quarter) != (2020, 4)]

# Long format: one row per (rating, quarter) with the average return as value.
mutual_funds_quarterly_returns = mutual_funds_ratings_df.melt(id_vars=['rating'],
                                                              value_vars=quarterly_return_cols,
                                                              var_name='return_quarter',
                                                              value_name='return_value')

# Column names look like 'fund_return_2020_q3': year at [-7:-3], quarter as last character.
quarter_labels = mutual_funds_quarterly_returns['return_quarter']
mutual_funds_quarterly_returns['return_year'] = quarter_labels.map(lambda label: int(label[-7:-3]))
mutual_funds_quarterly_returns['return_quarter'] = quarter_labels.map(lambda label: int(label[-1:]))

# Turn (year, quarter) into the quarter-end date.
quarter_end_dates = mutual_funds_quarterly_returns.apply(
    lambda row: create_date_field(row['return_year'], row['return_quarter']), axis=1)
mutual_funds_quarterly_returns['return_date'] = pd.to_datetime(quarter_end_dates)

mutual_funds_quarterly_returns = mutual_funds_quarterly_returns[['rating', 'return_date', 'return_value']]
mutual_funds_quarterly_returns.head()

In [None]:
# One line per Morningstar rating, following the quarterly return over time.
fig = px.line(data_frame=mutual_funds_quarterly_returns,
              x='return_date',
              y='return_value',
              color='rating')
fig.update_layout(
    title='Mutual Funds - Line chart for quarterly returns based on the Morningstar rating',
    title_x=0.5,
    xaxis_title='Date',
    yaxis_title='Quarterly return',
)
fig.show()

In [None]:
# Quarterly return columns from 2020 Q3 back to 2015 Q1 (there is no 2020 Q4 column).
quarterly_return_cols = [f'fund_return_{year}_q{quarter}'
                         for year in range(2020, 2014, -1)
                         for quarter in (4, 3, 2, 1)
                         if (year, quarter) != (2020, 4)]

# Long format: one row per (rating, quarter) with the average return as value.
etfs_quarterly_returns = etfs_ratings_df.melt(id_vars=['rating'],
                                              value_vars=quarterly_return_cols,
                                              var_name='return_quarter',
                                              value_name='return_value')

# Column names look like 'fund_return_2020_q3': year at [-7:-3], quarter as last character.
quarter_labels = etfs_quarterly_returns['return_quarter']
etfs_quarterly_returns['return_year'] = quarter_labels.map(lambda label: int(label[-7:-3]))
etfs_quarterly_returns['return_quarter'] = quarter_labels.map(lambda label: int(label[-1:]))

# Turn (year, quarter) into the quarter-end date.
quarter_end_dates = etfs_quarterly_returns.apply(
    lambda row: create_date_field(row['return_year'], row['return_quarter']), axis=1)
etfs_quarterly_returns['return_date'] = pd.to_datetime(quarter_end_dates)

etfs_quarterly_returns = etfs_quarterly_returns[['rating', 'return_date', 'return_value']]
etfs_quarterly_returns.head()

In [None]:
# One line per Morningstar rating, following the quarterly return over time.
fig = px.line(data_frame=etfs_quarterly_returns,
              x='return_date',
              y='return_value',
              color='rating')
fig.update_layout(
    title='ETFs - Line chart for quarterly returns based on the Morningstar rating',
    title_x=0.5,
    xaxis_title='Date',
    yaxis_title='Quarterly return',
)
fig.show()

Boxplots of fund returns grouped by category

In [None]:
# Categories compared side by side in the boxplots below.
common_fund_categories = [
    'Other Equity',
    'Other Bond',
    'Global Emerging Markets Equity',
    'Global Large-Cap Blend Equity',
]

# Restrict each dataset to those categories, sorted by category name.
mutual_fund_categories_df = (
    mutual_funds_df
    .loc[mutual_funds_df['category'].isin(common_fund_categories)]
    .sort_values(by='category')
)
etf_categories_df = (
    etfs_df
    .loc[etfs_df['category'].isin(common_fund_categories)]
    .sort_values(by='category')
)

In [None]:
# Management fees per category: mutual funds on the left, ETFs on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
box_panels = [
    (ax[0], mutual_fund_categories_df, 'Mutual Fund Category', 'Management Fees by Mutual Fund Category'),
    (ax[1], etf_categories_df, 'ETF Category', 'Management Fees by ETF Category'),
]
for panel, funds, _, _ in box_panels:
    sns.boxplot(data=funds, x='category', y='management_fees', ax=panel)
fig.tight_layout(pad=3.0)
for panel, _, x_label, panel_title in box_panels:
    panel.set(xlabel=x_label, ylabel='Management Fees')
    panel.set_title(panel_title, size=16)
    panel.set_ylim([0, 3])

In [None]:
# YTD return per category: mutual funds on the left, ETFs on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
box_panels = [
    (ax[0], mutual_fund_categories_df, 'Mutual Fund Category', 'YTD Return by Mutual Fund Category'),
    (ax[1], etf_categories_df, 'ETF Category', 'YTD Return by ETF Category'),
]
for panel, funds, _, _ in box_panels:
    sns.boxplot(data=funds, x='category', y='fund_trailing_return_ytd', ax=panel)
fig.tight_layout(pad=3.0)
for panel, _, x_label, panel_title in box_panels:
    panel.set(xlabel=x_label, ylabel='YTD Return')
    panel.set_title(panel_title, size=16)
    panel.set_ylim([-50, 50])

In [None]:
# 3-year trailing return per category: mutual funds on the left, ETFs on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
box_panels = [
    (ax[0], mutual_fund_categories_df, 'Mutual Fund Category', '3years Return by Mutual Fund Category'),
    (ax[1], etf_categories_df, 'ETF Category', '3years Return by ETF Category'),
]
for panel, funds, _, _ in box_panels:
    sns.boxplot(data=funds, x='category', y='fund_trailing_return_3years', ax=panel)
fig.tight_layout(pad=3.0)
for panel, _, x_label, panel_title in box_panels:
    panel.set(xlabel=x_label, ylabel='3years Return')
    panel.set_title(panel_title, size=16)
    panel.set_ylim([-25, 25])

In [None]:
# 10-year trailing return per category: mutual funds on the left, ETFs on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
box_panels = [
    (ax[0], mutual_fund_categories_df, 'Mutual Fund Category', '10years Return by Mutual Fund Category'),
    (ax[1], etf_categories_df, 'ETF Category', '10years Return by ETF Category'),
]
for panel, funds, _, _ in box_panels:
    sns.boxplot(data=funds, x='category', y='fund_trailing_return_10years', ax=panel)
fig.tight_layout(pad=3.0)
for panel, _, x_label, panel_title in box_panels:
    panel.set(xlabel=x_label, ylabel='10years Return')
    panel.set_title(panel_title, size=16)
    panel.set_ylim([-20, 20])

Crosstab of fund categories and Morningstar ratings

In [None]:
# Global large-cap categories cross-tabulated against the Morningstar rating below.
large_cap_categories = [
    'Global Large-Cap Blend Equity',
    'Global Large-Cap Growth Equity',
    'Global Large-Cap Value Equity',
]

# Restrict each dataset to those categories, sorted by category name.
mutual_fund_large_cap_df = (
    mutual_funds_df
    .loc[mutual_funds_df['category'].isin(large_cap_categories)]
    .sort_values(by='category')
)
etf_large_cap_df = (
    etfs_df
    .loc[etfs_df['category'].isin(large_cap_categories)]
    .sort_values(by='category')
)

In [None]:
# Fund counts per (category, rating), with row/column totals, shaded by magnitude.
mutual_fund_morningstar_ratings = pd.crosstab(index=mutual_fund_large_cap_df['category'],
                                              columns=mutual_fund_large_cap_df['rating'],
                                              margins=True)
mutual_fund_morningstar_ratings.style.background_gradient(cmap='summer_r')

In [None]:
# ETF counts per (category, rating), with row/column totals, shaded by magnitude.
etf_category_morningstar_ratings = pd.crosstab(index=etf_large_cap_df['category'],
                                               columns=etf_large_cap_df['rating'],
                                               margins=True)
etf_category_morningstar_ratings.style.background_gradient(cmap='summer_r')

Word clouds of the investment strategies of Mutual Funds and ETFs (first for funds with a high stock allocation, then for funds with a high bond allocation)

In [None]:
# Extra stop words on top of the wordcloud defaults.
stopwords = set(['Fund', 'fund'] + list(STOPWORDS))

def topicWordCloud(subset):
    """Generate a word cloud from the text contained in ``subset``.

    Parameters
    ----------
    subset : pandas.Series or any object
        For a Series, all non-null entries are converted to strings and joined
        with spaces, so every row contributes to the cloud. Any other object is
        converted with ``str()``.

    Returns
    -------
    WordCloud
        The generated cloud (1800x1200, white background, at most 200 words).
    """
    if isinstance(subset, pd.Series):
        # str(Series) is the truncated display repr (head/tail rows, clipped
        # strings, index numbers and a Name/dtype footer), so join the values.
        text = ' '.join(subset.dropna().astype(str))
    else:
        text = str(subset)

    wordcloud = WordCloud(width=1800, height=1200,
                          background_color='white',
                          stopwords=stopwords,
                          max_words=200,
                          min_font_size=20,
                          random_state=42).generate(text)
    return wordcloud

In [None]:
# Investment strategy texts of the funds holding at least 75% in stocks.
mutual_funds_stocks_df = mutual_funds_df.loc[mutual_funds_df['asset_stock'] >= 75, 'investment_strategy']
etfs_stocks_df = etfs_df.loc[etfs_df['asset_stock'] >= 75, 'investment_strategy']

# One word cloud per fund type, side by side.
fig = plt.figure(figsize=(18, 12), facecolor=None)
cloud_panels = [('Mutual Funds - Stocks', mutual_funds_stocks_df),
                ('ETFs - Stocks', etfs_stocks_df)]
for position, (plot_title, subset) in enumerate(cloud_panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    wordcloud = topicWordCloud(subset)
    plt.title(plot_title)  # applies to the subplot that was just added
    ax.imshow(wordcloud)
    ax.axis('off')

In [None]:
# Investment strategy texts of the funds holding at least 75% in bonds.
mutual_funds_bonds_df = mutual_funds_df.loc[mutual_funds_df['asset_bond'] >= 75, 'investment_strategy']
etfs_bonds_df = etfs_df.loc[etfs_df['asset_bond'] >= 75, 'investment_strategy']

# One word cloud per fund type, side by side.
fig = plt.figure(figsize=(18, 12), facecolor=None)
cloud_panels = [('Mutual Funds - Bonds', mutual_funds_bonds_df),
                ('ETFs - Bonds', etfs_bonds_df)]
for position, (plot_title, subset) in enumerate(cloud_panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    wordcloud = topicWordCloud(subset)
    plt.title(plot_title)  # applies to the subplot that was just added
    ax.imshow(wordcloud)
    ax.axis('off')