In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Raise pandas' display limits (rows, columns, per-cell text width) for notebook output.
for option_name, option_value in (
    ('display.max_rows', 200),
    ('display.max_colwidth', 1000),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)
import matplotlib.pyplot as plt

In [None]:
# Load the mutual-funds table; every later cell works on this frame.
mutual_funds_csv = '/kaggle/input/mutual-funds-and-etfs/Mutual Funds.csv'
df = pd.read_csv(mutual_funds_csv)

In [None]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Definition: a fund is a "quant fund" when the description of its investment
# strategy contains the word 'quantitative' (case-sensitive substring match).
# The result is stored as a 0/1 flag in df['quant'], which later cells filter on.
print('number of funds:', df['fund_symbol'].nunique())
# na=False: a fund with no strategy text is classified as non-quant. Without it,
# str.contains yields NaN for those rows and the integer cast below fails.
df['quant'] = df['investment_strategy'].str.contains('quantitative', na=False).astype(int)
print('number of quant funds:', df[df.quant==1]['fund_symbol'].nunique())

In [None]:
# Fund families overall vs. families that run at least one quant fund.
quant_families = df.loc[df['quant'] == 1, 'fund_family']

print('number of fund companies: ', df['fund_family'].nunique())
print('number of fund companies running at least one quant fund:',
      quant_families.nunique())
# value_counts() gives the number of quant-fund rows per family, largest first.
print('the fund companies running at least one quant fund are listed below:\n',
      quant_families.value_counts())

In [None]:
# Author's observation: the number of new quant funds per year does not grow as
# steadily as the number of new funds overall.
# Adds df['inception_year'] (and parses df['inception_date'] to datetime).
df['inception_date'] = pd.to_datetime(df.inception_date)
df['inception_year'] = df['inception_date'].dt.year

# Funds launched per inception year: quant funds only, and all funds.
new_quant_per_year = df[df.quant==1].groupby('inception_year')['fund_symbol'].count()
new_funds_per_year = df.groupby('inception_year')['fund_symbol'].count()

# The two series are on very different scales, so each gets its own y-axis.
fig, ax = plt.subplots()
ax1 = ax.twinx()
# Explicit colours: each axes has its own colour cycle, so by default both
# lines would be drawn in the same colour.
quant_line, = ax.plot(new_quant_per_year.index, new_quant_per_year.values,
                      color='tab:blue', label='quant fund')
fund_line, = ax1.plot(new_funds_per_year.index, new_funds_per_year.values,
                      color='tab:orange', label='fund')
ax.set_xlabel('inception_year')
ax.set_ylabel('number of new quant fund')
ax1.set_ylabel('fund')
# Pass both handles explicitly: a plain legend call only sees the lines of one
# axes and would therefore show a single, mislabelled entry.
ax1.legend(handles=[quant_line, fund_line])
plt.show()

In [None]:
# Observation: compared with all mutual funds, quant funds lean more toward the blend investment type and the large size type.

In [None]:
# Pie charts of the investment-type mix: figure 0 covers all funds,
# figure 1 covers only the funds flagged as quant.
investment_pies = (
    (0, df, 'Investment types of all mutual funds'),
    (1, df[df.quant==1], 'Investment types of quant funds'),
)
for figure_number, funds, pie_title in investment_pies:
    plt.figure(figure_number)
    # Number of non-missing fund symbols per investment type.
    type_counts = funds.groupby('investment_type')['fund_symbol'].count()
    plt.pie(type_counts, labels=type_counts.index, autopct='%.2f%%')
    plt.title(pie_title)
plt.show()

In [None]:
# Pie charts of the size-type mix: figure 0 covers all funds,
# figure 1 covers only the funds flagged as quant.
size_pies = (
    (0, df, 'Size types of all mutual funds'),
    (1, df[df.quant==1], 'Size types of quant funds'),
)
for figure_number, funds, pie_title in size_pies:
    plt.figure(figure_number)
    # Number of non-missing fund symbols per size type.
    size_counts = funds.groupby('size_type')['fund_symbol'].count()
    plt.pie(size_counts, labels=size_counts.index, autopct='%.2f%%')
    plt.title(pie_title)
plt.show()

In [None]:
# Author's observation: the fraction of quant funds performing better than their
# category is lower than the fraction of all mutual funds performing better than
# their category.
# NOTE(review): re-check this observation; fractions are now computed only over
# funds that actually have both return figures for the period.
pct_fund_better_than_category = []
pct_quant_fund_better_than_category = []
return_var = ['ytd','1month','3months','1year','3years','5years','10years',
            '2019','2018','2017','2016','2015','2014','2013','2012','2011','2010']
is_quant = df['quant'] == 1
for var in return_var:
    fund_ret = df['fund_return_'+var]
    category_ret = df['category_return_'+var]
    # A comparison involving NaN evaluates to False, which would silently count
    # funds with no data for this period as "not better than category".
    # Keep those rows as NaN instead so .mean() leaves them out.
    comparable = fund_ret.notna() & category_ret.notna()
    beats_category = (fund_ret > category_ret).astype(float).where(comparable)
    # Stored per fund as 1.0 / 0.0, or NaN when the comparison is not possible.
    df['fund return '+ var + ' > category'] = beats_category

    pct_fund_better_than_category.append(beats_category.mean())
    pct_quant_fund_better_than_category.append(beats_category[is_quant].mean())


# Grouped bar chart: one pair of bars (all funds, quant funds) per return period.
fig,ax = plt.subplots(figsize=(15,9))
index = np.arange(len(return_var))

bar_width = 0.4
ax.bar(index,pct_fund_better_than_category,width=bar_width,
       label='all mutual funds')
ax.bar(index+bar_width,pct_quant_fund_better_than_category,
       width=bar_width,label='quant funds')

ax.set_xlabel('return')
ax.set_ylabel('fraction')
# Centre each tick label under its pair of bars.
ax.set_xticks(index+bar_width/2)
ax.set_xticklabels(return_var)
ax.set_title('Fraction of funds performing better than category average')
ax.legend()
plt.show()

In [None]:
# Author's observation: on average, quant funds have a lower
# (fund return - category return).
# NOTE(review): re-check this observation against the chart below, which now
# plots the mean differences computed in this cell.
mean_fund_ret_minus_category = []
mean_quant_fund_ret_minus_category = []
return_var = ['ytd','1month','3months','1year','3years','5years','10years',
            '2019','2018','2017','2016','2015','2014','2013','2012','2011','2010']
for var in return_var:
    # Per-fund return in excess of its category for this period. The result is
    # NaN when either figure is missing, and .mean() below skips NaN.
    df['fund return '+ var + ' minus category'] = \
    df['fund_return_'+var] - df['category_return_'+var]

    mean_fund_ret_minus_category.append(
        df['fund return '+ var + ' minus category'].mean())
    mean_quant_fund_ret_minus_category.append(
        df[df.quant==1]['fund return '+ var + ' minus category'].mean())


# Grouped bar chart: one pair of bars (all funds, quant funds) per return period.
fig,ax = plt.subplots(figsize=(15,9))
index = np.arange(len(return_var))

bar_width = 0.4
# Plot the mean excess returns computed in this cell, not the pct_* fractions
# that belong to the "better than category" chart.
ax.bar(index,mean_fund_ret_minus_category,width=bar_width,
       label='all mutual funds')
ax.bar(index+bar_width,mean_quant_fund_ret_minus_category,
       width=bar_width,label='quant funds')

ax.set_xlabel('return')
ax.set_ylabel('mean return')
# Centre each tick label under its pair of bars.
ax.set_xticks(index+bar_width/2)
ax.set_xticklabels(return_var)
ax.set_title('Averages of fund returns over category return')
ax.legend()
plt.show()

In [None]:
# Fraction of funds whose alpha exceeds the alpha of their category, for all
# funds and for quant funds, over the 3-, 5- and 10-year horizons.
fraction_fund_alpha_larger_than_category = []
fraction_quant_fund_alpha_larger_than_category = []
alpha_var = ['3years','5years','10years']
is_quant = df['quant'] == 1
for var in alpha_var:
    fund_alpha = df['fund_alpha_'+var]
    category_alpha = df['category_alpha_'+var]
    # A comparison involving NaN evaluates to False, which would count funds
    # without an alpha for this horizon as "not higher than category".
    # Keep those rows as NaN so .mean() leaves them out.
    comparable = fund_alpha.notna() & category_alpha.notna()
    alpha_higher = (fund_alpha > category_alpha).astype(float).where(comparable)
    # Stored per fund as 1.0 / 0.0, or NaN when the comparison is not possible.
    df['fund alpha '+ var + ' > category'] = alpha_higher

    fraction_fund_alpha_larger_than_category.append(alpha_higher.mean())
    fraction_quant_fund_alpha_larger_than_category.append(alpha_higher[is_quant].mean())


# Grouped bar chart: one pair of bars (all funds, quant funds) per horizon.
fig,ax = plt.subplots(figsize=(15,9))
index = np.arange(len(alpha_var))

bar_width = 0.4
ax.bar(index,fraction_fund_alpha_larger_than_category,width=bar_width,
       label='all mutual funds')
ax.bar(index+bar_width,fraction_quant_fund_alpha_larger_than_category,
       width=bar_width,label='quant funds')

ax.set_xlabel('alpha')
ax.set_ylabel('fraction')
# Centre each tick label under its pair of bars.
ax.set_xticks(index+bar_width/2)
ax.set_xticklabels(alpha_var)
ax.set_title('Fraction of funds with higher alpha than category')
ax.legend()
plt.show()

In [None]:
# Author's observation: quant funds have slightly lower standard deviation in
# 5years and 10years.
# NOTE(review): re-check this observation; fractions are now computed only over
# funds that have both standard-deviation figures for the horizon.
fraction_fund_std_larger_than_category = []
fraction_quant_fund_std_larger_than_category = []
std_var = ['3years','5years','10years']
is_quant = df['quant'] == 1
for var in std_var:
    fund_std = df['fund_standard_deviation_'+var]
    category_std = df['category_standard_deviation_'+var]
    # A comparison involving NaN evaluates to False, which would count funds
    # without a figure for this horizon as "not higher than category".
    # Keep those rows as NaN so .mean() leaves them out.
    comparable = fund_std.notna() & category_std.notna()
    std_higher = (fund_std > category_std).astype(float).where(comparable)
    # Stored per fund as 1.0 / 0.0, or NaN when the comparison is not possible.
    df['fund standard deviation '+ var + ' > category'] = std_higher

    fraction_fund_std_larger_than_category.append(std_higher.mean())
    fraction_quant_fund_std_larger_than_category.append(std_higher[is_quant].mean())


# Grouped bar chart: one pair of bars (all funds, quant funds) per horizon.
fig,ax = plt.subplots(figsize=(15,9))
index = np.arange(len(std_var))

bar_width = 0.4
ax.bar(index,fraction_fund_std_larger_than_category,width=bar_width,
       label='all mutual funds')
ax.bar(index+bar_width,fraction_quant_fund_std_larger_than_category,
       width=bar_width,label='quant funds')

ax.set_xlabel('standard deviation')
ax.set_ylabel('fraction')
# Centre each tick label under its pair of bars.
ax.set_xticks(index+bar_width/2)
ax.set_xticklabels(std_var)
ax.set_title('Fraction of funds with higher standard deviation than category')
ax.legend()
plt.show()

In [None]:
# Do not use the Sharpe ratio variables: the category Sharpe ratios here (many zeros) are not consistent with the category returns (non-zero and larger than the risk-free rate).
# Some Treynor ratio columns have dtype 'object', probably because of missing values.

In [None]:
# Author's observation: quant funds concentrate more on lower ratings (1 or 2),
# i.e. lower return and higher risk.
# NOTE(review): re-check this observation; the bars are now aligned with the
# rating they belong to.
rating = ['rating','return_rating','risk_rating']
# Ratings run from 1 to 5 -- assumes they are stored as numbers; TODO confirm.
rating_levels = np.arange(1, 6)
bar_width = 0.4
for var in rating:
    # value_counts() orders by frequency, not by rating, so re-align the shares
    # to the fixed 1..5 order. A level with no funds gets a share of 0, which
    # also keeps both bar series at the same length.
    all_share = df[var].value_counts(normalize=True).reindex(
        rating_levels, fill_value=0)
    quant_share = df.loc[df.quant==1, var].value_counts(normalize=True).reindex(
        rating_levels, fill_value=0)

    fig,ax = plt.subplots()
    ax.bar(rating_levels, all_share.values,
           width=bar_width, label='all mutual funds')
    ax.bar(rating_levels+bar_width, quant_share.values,
           width=bar_width, label='quant funds')

    # Centre each tick label under its pair of bars.
    ax.set_xticks(rating_levels+bar_width/2)
    ax.set_xticklabels(rating_levels)
    ax.set_xlabel('rating')
    ax.set_ylabel('fraction of funds')
    ax.set_title('Distribution of ' + var)
    ax.legend()

    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)

In [None]:
# Author's observation: on average, quant funds are more expensive.
# Per-fund gap between its net annual expense ratio and that of its category;
# the gap is also stored as a new column on df.
expense_gap = df['fund_net_annual_expense_ratio'].sub(
    df['category_net_annual_expense_ratio'])
df['fund expense minus category'] = expense_gap

quant_expense_gap = expense_gap[df['quant'] == 1]
print('average (fund expense-category expense)of all funds:',
      expense_gap.mean())
print('average (fund expense-category expense)of quant funds:',
      quant_expense_gap.mean())