In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input.
# os.walk yields (directory, subdirectories, files) for each directory in the
# tree; the subdirectory list is not needed here, hence the `_` placeholder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Relative path to the mutual funds CSV under the "../input/" data directory
path_str_mf = "../input/mutual-funds-and-etfs/Mutual Funds.csv"


This notebook takes the mutual funds dataset and normalizes it for selected groups of columns, reshaping them from wide to long format so that the data is easier to pivot.
Some of the less commonly used indicators and the historical data are left out because the resulting dataset would otherwise be too large.

In [None]:
# Load the mutual funds CSV into a DataFrame
df_mf = pd.read_csv(path_str_mf)
# Display the (rows, columns) of the loaded data
df_mf.shape


In [None]:
# Show every column name of the raw data as a plain Python list
df_mf.columns.tolist()

In [None]:
# The cells from here on split the raw data into separate frames, one per
# group of columns; the value groups are unpivoted in the cells that follow.

# Master table: the descriptive columns of each fund, kept in wide form.
df_fund = df_mf.loc[:, [
    'fund_symbol',
    'fund_extended_name',
    'fund_family',
    'category',
    'investment_strategy',
    'investment_type',
    'size_type',
]]

df_fund.head()

In [None]:
# Summary-statistic columns to unpivot
summary_stats_cols = [
    'rating',
    'return_rating',
    'risk_rating',
    'fund_net_annual_expense_ratio',
    'category_net_annual_expense_ratio',
    'net_asset_value',
    'fund_yield',
]

# Reshape wide -> long (one output row per source row and melted column),
# then label the rows with the group they belong to.
df_sum_stats = (
    df_mf
    .melt(id_vars=['fund_symbol'], value_vars=summary_stats_cols)
    .assign(data_category='summary stats')
)

df_sum_stats.head()

In [None]:
# Asset-allocation columns to unpivot
asset_cols = [
    'asset_cash',
    'asset_stocks',
    'asset_bonds',
    'asset_others',
]

# Reshape wide -> long (one output row per source row and melted column),
# then label the rows with the group they belong to.
df_asset = (
    df_mf
    .melt(id_vars=['fund_symbol'], value_vars=asset_cols)
    .assign(data_category='asset percentages')
)

df_asset.head()

In [None]:
# Sector-allocation columns to unpivot
sector_cols = [
    'sector_basic_materials',
    'sector_consumer_cyclical',
    'sector_financial_services',
    'sector_real_estate',
    'sector_consumer_defensive',
    'sector_healthcare',
    'sector_utilities',
    'sector_communication_services',
    'sector_energy',
    'sector_industrials',
    'sector_technology',
]

# Reshape wide -> long (one output row per source row and melted column),
# then label the rows with the group they belong to.
df_sectors = (
    df_mf
    .melt(id_vars=['fund_symbol'], value_vars=sector_cols)
    .assign(data_category='sector percentages')
)

df_sectors.head()

In [None]:
# Fund and category return columns to unpivot
return_cols = [
    'fund_return_1year',
    'category_return_1year',
    'fund_return_3years',
    'category_return_3years',
    'fund_return_5years',
    'category_return_5years',
    'fund_return_10years',
    'category_return_10years',
]

# Reshape wide -> long, then add the data_category label that identifies
# these rows in the consolidated table.
df_return = (
    df_mf
    .melt(id_vars=['fund_symbol'], value_vars=return_cols)
    .assign(data_category='fund return')
)

df_return.head()

In [None]:
# Fund and category beta columns to unpivot
beta_cols = [
    'fund_beta_3years',
    'category_beta_3years',
    'fund_beta_5years',
    'category_beta_5years',
    'fund_beta_10years',
    'category_beta_10years',
]

# Reshape wide -> long, then add the data_category label that identifies
# these rows in the consolidated table.
df_beta = (
    df_mf
    .melt(id_vars=['fund_symbol'], value_vars=beta_cols)
    .assign(data_category='beta')
)

df_beta.head()



In [None]:
# Melted frames to stack into a single long table
append_frames = [df_asset, df_beta, df_return, df_sectors, df_sum_stats]

# ignore_index=True gives the stacked rows a fresh 0..n-1 index. Without it
# each frame keeps its own 0-based labels, so the same label would repeat
# once per frame and label-based lookups would be ambiguous.
df_sub_frames = pd.concat(append_frames, ignore_index=True)

df_sub_frames.head()

In [None]:
# (rows, columns) of the stacked long table
df_sub_frames.shape

In [None]:
# Join the long-format values onto the master table. The left join keeps
# every row of df_fund, even one with no matching row in df_sub_frames.
df_consol = df_fund.merge(df_sub_frames, on='fund_symbol', how='left')

# Preview up to 10 random rows. The fixed seed makes the preview identical on
# every run, and capping n avoids the ValueError that sample() raises when
# asked for more rows than the frame holds.
df_consol.sample(n=min(10, len(df_consol)), random_state=42)


In [None]:
# (rows, columns) of the consolidated table
df_consol.shape

In [None]:
# Remove ./df_consol.csv from the current directory if it is there.
try:
    os.remove("./df_consol.csv")
except FileNotFoundError:
    # Nothing to delete; report it and carry on.
    print("File does not exist.")

# Export is disabled; uncomment the next line to write the consolidated table to CSV.
#df_consol.to_csv('df_consol.csv',index=False)