In [1]:
"""This exercise aims to explore investment strategies on ETFs/mutual funds and leverage
Natural Language Processing techniques to see what insights/predictive capabilities can be gained from it."""

'This exercise aims to explore investment strategies on ETFs/mutual funds and leverage\nNatural Language Processing techniques to see what insights/predictive capabilities can be gained from it.'

In [3]:
import numpy as np
import pandas as pd

df_etfs = pd.read_csv("Morningstar - European ETFs.csv")
df_mutual_funds = pd.read_csv("Morningstar - European Mutual Funds.csv")

df_etfs.head()

Unnamed: 0,ticker,isin,fund_name,inception_date,category,rating,analyst_rating,risk_rating,performance_rating,investment_strategy,...,fund_return_2016_q4,fund_return_2016_q3,fund_return_2016_q2,fund_return_2016_q1,fund_return_2015_q4,fund_return_2015_q3,fund_return_2015_q2,fund_return_2015_q1,quarters_up,quarters_down
0,0P00000C5S,CH0008899764,iShares SMI® ETF (CH),1999-10-06,Switzerland Equity,3.0,Negative,3.0,3.0,The Fund aims to achieve a return on your inve...,...,1.45,5.1,10.47,-4.09,4.18,-4.0,-3.96,10.56,15,8
1,0P00000FYY,SE0000693293,XACT OMXS30 UCITS ETF,2000-10-30,Sweden Equity,2.0,,2.0,2.0,The fund is an exchange-traded index fund and ...,...,5.21,11.32,1.66,1.43,4.99,-6.15,-7.21,10.11,16,7
2,0P00000GCE,DE000A0H08N1,iShares STOXX Europe 600 Personal & Household ...,2002-07-08,Sector Equity Consumer Goods & Services,3.0,,3.0,3.0,iShares STOXX Europe 600 Personal & Household ...,...,-1.51,5.34,8.17,6.48,4.83,4.12,-2.03,9.24,17,6
3,0P00000GF2,DE0006289465,iShares eb.rexx® Government Germany UCITS ETF ...,2003-02-04,EUR Government Bond,2.0,,2.0,2.0,iShares eb.rexx Government Germany (DE) is an ...,...,-2.14,4.21,6.04,9.67,0.05,4.96,-3.72,-5.67,13,10
4,0P00000GKU,IE0032523478,iShares € Corp Bond Large Cap UCITS ETF EUR (D...,2003-03-17,EUR Corporate Bond,3.0,,3.0,3.0,The investment objective of this Fund is to pr...,...,-2.74,5.58,6.26,10.55,1.38,3.78,-5.02,-5.84,16,7


In [4]:
df_mutual_funds.head()

Unnamed: 0,ticker,isin,fund_name,inception_date,category,rating,analyst_rating,risk_rating,performance_rating,investment_strategy,...,fund_return_2016_q4,fund_return_2016_q3,fund_return_2016_q2,fund_return_2016_q1,fund_return_2015_q4,fund_return_2015_q3,fund_return_2015_q2,fund_return_2015_q1,quarters_up,quarters_down
0,0P00000BOW,LU0011983433,Morgan Stanley Investment Funds - Global Bond ...,1989-11-01,Global Bond,5.0,,3.0,5.0,The Global Bond Fund's investment objective is...,...,-2.06,5.32,10.26,7.52,2.22,3.68,-7.67,3.27,16,7
1,0P00000ESH,LU0757425763,Threadneedle (Lux) - American Select Class AU ...,2000-07-28,US Large-Cap Growth Equity,2.0,,3.0,2.0,The American Select Portfolio seeks to achieve...,...,8.94,8.33,8.67,0.46,8.85,-3.7,-4.42,5.9,17,6
2,0P00000ESL,LU0011818076,HSBC Global Investment Funds - Economic Scale ...,1987-01-16,Japan Large-Cap Equity,2.0,,1.0,2.0,The sub-fund aims to provide long term total r...,...,6.77,13.15,7.59,-3.61,12.0,-8.66,-1.15,14.08,14,9
3,0P00000FI7,LU0152882725,Capital Group Global Equity Fund (LUX) B,2000-10-06,Global Large-Cap Growth Equity,2.0,,2.0,2.0,Long-term capital growth through investment pr...,...,4.61,9.14,6.83,0.67,8.47,-6.78,-4.57,7.7,18,5
4,0P00000FIA,LU0114999294,Capital Group Global Equity Fund (LUX) B,2000-10-06,Global Large-Cap Growth Equity,2.0,,2.0,2.0,Long-term capital growth through investment pr...,...,4.41,9.66,6.28,0.82,8.18,-6.99,-4.39,7.31,18,5


In [7]:
#Extracting the investment strategies
etfs_strategies = df_etfs[['investment_strategy']].dropna().reset_index(drop=True)
mutual_funds_strategies = df_mutual_funds[['investment_strategy']].dropna().reset_index(drop=True)

# Combine both for a holistic view
combined_strategies = pd.concat([etfs_strategies, mutual_funds_strategies], ignore_index=True)
combined_strategies.columns = ['investment_strategy']

# Display the first few rows of combined strategies
combined_strategies.head()

Unnamed: 0,investment_strategy
0,The Fund aims to achieve a return on your inve...
1,The fund is an exchange-traded index fund and ...
2,iShares STOXX Europe 600 Personal & Household ...
3,iShares eb.rexx Government Germany (DE) is an ...
4,The investment objective of this Fund is to pr...


In [9]:
#Text preprocessing
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Define preprocessing functions
def preprocess_text(text):
    # Tokenization
    tokens = nltk.word_tokenize(text.lower())
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return ' '.join(lemmatized_tokens)

# Apply preprocessing to the text data
combined_strategies['cleaned_strategy'] = combined_strategies['investment_strategy'].apply(preprocess_text)

word_counts = Counter(" ".join(combined_strategies['cleaned_strategy']).split())

common_words_df = pd.DataFrame(word_counts.most_common(20), columns=['Word', 'Frequency'])

common_words_df.head(15)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RyanMcKiernan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RyanMcKiernan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RyanMcKiernan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Word,Frequency
0,fund,98450
1,investment,86226
2,security,68825
3,market,57044
4,company,48843
5,index,48481
6,asset,47135
7,equity,46915
8,capital,37920
9,return,36720


In [10]:
#Pretty generic of a response, let's see what phrases are the most common
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english', max_features=20)
X = vectorizer.fit_transform(combined_strategies['cleaned_strategy'])

phrases = vectorizer.get_feature_names_out()
frequencies = X.sum(axis=0).A1

common_phrases_df = pd.DataFrame({'Phrase': phrases, 'Frequency': frequencies}).sort_values(by='Frequency', ascending=False)

common_phrases_df.head(15)

Unnamed: 0,Phrase,Frequency
12,investment objective,22435
2,capital growth,18647
4,emerging market,17446
19,total return,16934
16,net asset,13188
14,long term,12205
8,fund invest,12039
10,fund seek,11631
7,fund aim,11040
17,objective fund,10932


In [11]:
#Phrases with 4-6 words
vectorizer2 = CountVectorizer(ngram_range=(4, 6), stop_words='english', max_features=20)
X2 = vectorizer2.fit_transform(combined_strategies['cleaned_strategy'])

phrases2 = vectorizer2.get_feature_names_out()
frequencies2 = X2.sum(axis=0).A1

common_phrases_df2 = pd.DataFrame({'Phrase': phrases2, 'Frequency': frequencies2}).sort_values(by='Frequency', ascending=False)

common_phrases_df2.head(15)

Unnamed: 0,Phrase,Frequency
4,equity equity related security,3286
9,long term capital growth,2483
1,aim provide long term,2294
10,long term total return,2095
0,aim provide capital growth,1804
16,security money market instrument,1786
19,total return investing portfolio,1714
5,fund aim provide capital,1707
13,provide long term total,1704
14,provide long term total return,1704


In [12]:
#Applying LDA
from sklearn.decomposition import LatentDirichletAllocation

# Extracting the `investment_strategy` column and filtering categories related to commodities
etfs_with_categories = df_etfs[['category', 'investment_strategy']].dropna().reset_index(drop=True)
mutual_funds_with_categories = df_mutual_funds[['category', 'investment_strategy']].dropna().reset_index(drop=True)

# Combine datasets
combined_with_categories = pd.concat([etfs_with_categories, mutual_funds_with_categories], ignore_index=True)

# Filter for commodities-related strategies
commodities_strategies = combined_with_categories[
    combined_with_categories['category'].str.contains('Commodities', case=False, na=False)
]

# Preprocess the strategies
import re
def preprocess_text_custom(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove non-alphabetic characters
    tokens = text.split()
    stopwords = {'the', 'and', 'of', 'to', 'in', 'on', 'for', 'a', 'is', 'with', 'by', 'this', 'an', 'or', 'be', 'as', 'it'}
    filtered_tokens = [word for word in tokens if word not in stopwords]
    return ' '.join(filtered_tokens)

commodities_strategies['cleaned_strategy'] = commodities_strategies['investment_strategy'].apply(preprocess_text_custom)

# Extract commodities-related strategies and preprocess
commodities_strategies_texts = commodities_strategies['cleaned_strategy']

# Vectorize text using CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_commodities = vectorizer.fit_transform(commodities_strategies_texts)

# Apply Latent Dirichlet Allocation (LDA) for topic modeling
n_topics = 5  # Define the number of topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X_commodities)

# Get the terms for each topic
terms = vectorizer.get_feature_names_out()
topics = []
for topic_idx, topic in enumerate(lda.components_):
    topics.append({
        'Topic': f'Topic {topic_idx + 1}',
        'Top Terms': ', '.join([terms[i] for i in topic.argsort()[:-11:-1]])  # Top 10 terms
    })

# Create a DataFrame for the topics
topics_df = pd.DataFrame(topics)

topics_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commodities_strategies['cleaned_strategy'] = commodities_strategies['investment_strategy'].apply(preprocess_text_custom)


Unnamed: 0,Topic,Top Terms
0,Topic 1,"commodity, investment, instruments, portfolio,..."
1,Topic 2,"index, daily, designed, securities, exposure, ..."
2,Topic 3,"collateral, exchange, held, traded, return, tr..."
3,Topic 4,"index, fund, commodity, return, performance, i..."
4,Topic 5,"gold, physical, investment, silver, fund, perf..."


In [13]:
"""
Some insights from those topics:
- Topic 1: Broad Commodity Investment Instruments
    - Top Terms: commodity, investment, instruments, portfolio, diversification
    - This topic focuses on the use of commodities as part of diversified investment portfolios.
    - Emphasis is placed on commodities as an asset class for mitigating risks and improving returns.
    - Likely strategies involve a mix of various commodities (e.g., metals, energy, agriculture) for balanced exposure.
- Topic 2: Index-Based Commodity Exposure
    - Top Terms: index, daily, designed, securities, exposure, benchmarks
    - Highlights strategies that track commodity indices.
    - Daily adjustments and benchmarking are key features, suggesting index-tracking funds aimed at passive investors.
    - This reflects a focus on mirroring market performance rather than active commodity trading.
- Topic 3: Collateral and Exchange-Traded Commodities
    - Top Terms: collateral, exchange, held, traded, return, transparency
    - This topic centers on the mechanics of exchange-traded commodities (ETCs), emphasizing collateralization and transparency.
    - Strategies likely prioritize secure and regulated trading environments, appealing to risk-averse investors.
    - Suggests a focus on commodities like futures contracts or collateral-backed instruments.
- Topic 4: Performance and Risk Management
    - Top Terms: index, fund, commodity, return, performance, instruments
    - A recurring theme of performance tracking and risk management.
    - Indicates strategies designed to maximize returns while staying within predefined risk parameters.
    - May focus on actively managed commodity funds aiming to outperform standard indices.
- Topic 5: Precious Metals (Gold and Silver)
    - Top Terms: gold, physical, investment, silver, fund, performance
    - Precious metals, particularly gold and silver, form a distinct thematic strategy.
    - Strategies often involve physical holdings or ETFs focusing on precious metals, valued for their stability and inflation-hedging properties.
    - This is a prominent category for investors seeking safe-haven assets.
"""

'\nSome insights from those topics:\n- Topic 1: Broad Commodity Investment Instruments\n    - Top Terms: commodity, investment, instruments, portfolio, diversification\n    - This topic focuses on the use of commodities as part of diversified investment portfolios.\n    - Emphasis is placed on commodities as an asset class for mitigating risks and improving returns.\n    - Likely strategies involve a mix of various commodities (e.g., metals, energy, agriculture) for balanced exposure.\n- Topic 2: Index-Based Commodity Exposure\n    - Top Terms: index, daily, designed, securities, exposure, benchmarks\n    - Highlights strategies that track commodity indices.\n    - Daily adjustments and benchmarking are key features, suggesting index-tracking funds aimed at passive investors.\n    - This reflects a focus on mirroring market performance rather than active commodity trading.\n- Topic 3: Collateral and Exchange-Traded Commodities\n    - Top Terms: collateral, exchange, held, traded, retur

In [19]:
#Sentiment/Tone Analysis
sampled_strategies = commodities_strategies[['investment_strategy', 'cleaned_strategy']].sample(10, random_state=42)
sampled_strategies.head()

Unnamed: 0,investment_strategy,cleaned_strategy
8006,The investment objective of the MULTI UNITS LU...,investment objective multi units luxembourg ly...
7625,The investment objective of the MULTI UNITS LU...,investment objective multi units luxembourg ly...
1101,WisdomTree Natural Gas (1689) is designed to e...,wisdomtree natural gas designed enable investo...
2021,The investment seeks to track the price and yi...,investment seeks track price yield performance...
959,WisdomTree Softs (AIGS) is designed to enable ...,wisdomtree softs aigs designed enable investor...


In [21]:
#Label definition
conservative_keywords = {'stable', 'low risk', 'preservation', 'conservative', 'income'}
aggressive_keywords = {'high risk', 'growth', 'speculative', 'leverage', 'opportunity'}

# Function to assign tone based on keywords
def assign_tone(strategy):
    strategy_words = set(strategy.split())
    if strategy_words & conservative_keywords:
        return 'Conservative'
    elif strategy_words & aggressive_keywords:
        return 'Aggressive'
    else:
        return 'Neutral'

# Apply the labeling function to the cleaned strategies
commodities_strategies['Tone'] = commodities_strategies['cleaned_strategy'].apply(assign_tone)

# Display the labeled dataset for review
labeled_sample = commodities_strategies[['investment_strategy', 'Tone']].sample(15, random_state=42)
labeled_sample.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commodities_strategies['Tone'] = commodities_strategies['cleaned_strategy'].apply(assign_tone)


Unnamed: 0,investment_strategy,Tone
8006,The investment objective of the MULTI UNITS LU...,Neutral
7625,The investment objective of the MULTI UNITS LU...,Neutral
1101,WisdomTree Natural Gas (1689) is designed to e...,Neutral
2021,The investment seeks to track the price and yi...,Neutral
959,WisdomTree Softs (AIGS) is designed to enable ...,Neutral
2022,The investment seeks to track the price and yi...,Neutral
40411,The Fund aims to provide capital growth over 3...,Aggressive
1103,WisdomTree Petroleum (AIGO) is designed to ena...,Neutral
1710,"The investment seeks to track, before fees and...",Neutral
2540,ETFS 2x Daily Long Brent Crude (LBRT) is desig...,Neutral


In [23]:
#Model training
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Prepare the data
X = commodities_strategies['cleaned_strategy']
y = commodities_strategies['Tone']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert text to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_tfidf, y_train)

# Evaluate the classifier
y_pred = clf.predict(X_test_tfidf)
classification_report_output = classification_report(y_test, y_pred, output_dict=True)

# Convert classification report to DataFrame for better readability
classification_report_df = pd.DataFrame(classification_report_output).transpose()
classification_report_df.head()

Unnamed: 0,precision,recall,f1-score,support
Aggressive,1.0,1.0,1.0,17.0
Conservative,1.0,0.333333,0.5,3.0
Neutral,0.985612,1.0,0.992754,137.0
accuracy,0.987261,0.987261,0.987261,0.987261
macro avg,0.995204,0.777778,0.830918,157.0


In [25]:
#Combo strats
combined_strategies['cleaned_strategy'] = combined_strategies['investment_strategy'].apply(preprocess_text_custom)
combined_strategies['Tone'] = combined_strategies['cleaned_strategy'].apply(assign_tone)
new_sample = combined_strategies.sample(500, random_state=42)

In [27]:
#BERT Model
from transformers import BertTokenizer, TFBertForSequenceClassification, Trainer, TrainingArguments
import tensorflow as tf
from torch.utils.data import Dataset

X = new_sample['cleaned_strategy']
y = new_sample['Tone'].map({'Conservative': 0, 'Aggressive': 1, 'Neutral': 2})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
def tokenize_data(texts, labels, tokenizer, max_len):
    inputs = tokenizer(
        list(texts),
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors="tf"
    )
    return inputs, tf.convert_to_tensor(labels)

train_inputs, train_labels = tokenize_data(X_train, y_train, tokenizer, max_len=128)
test_inputs, test_labels = tokenize_data(X_test, y_test, tokenizer, max_len=128)

# Load pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

# Train the model
history = model.fit(
    {
        'input_ids': train_inputs['input_ids'],
        'attention_mask': train_inputs['attention_mask']
    },
    train_labels,
    validation_data=(
        {
            'input_ids': test_inputs['input_ids'],
            'attention_mask': test_inputs['attention_mask']
        },
        test_labels
    ),
    epochs=3,
    batch_size=16
)

# Evaluate the model
evaluation = model.evaluate(
    {
        'input_ids': test_inputs['input_ids'],
        'attention_mask': test_inputs['attention_mask']
    },
    test_labels
)
print("Evaluation results:", evaluation)






All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Epoch 2/3
Epoch 3/3
Evaluation results: [0.042411766946315765, 1.0]


In [29]:
def make_predictions(texts, tokenizer, model):
    inputs = tokenizer(
        texts,
        max_length=128,
        padding=True,
        truncation=True,
        return_tensors="tf"
    )
    outputs = model(
        {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask']
        }
    )
    probs = tf.nn.softmax(outputs.logits, axis=-1)
    predictions = tf.argmax(probs, axis=1).numpy()
    return predictions, probs

# Example predictions
example_texts = [
    "This fund focuses on stable returns and preservation of capital.",
    "A high-risk strategy designed for aggressive growth.",
    "A balanced approach to investments with moderate risks."
]
predictions, probabilities = make_predictions(example_texts, tokenizer, model)
print("Predictions:", predictions)
print("Probabilities:", probabilities)

Predictions: [0 1 2]
Probabilities: tf.Tensor(
[[0.7608755  0.02337302 0.21575147]
 [0.00812007 0.97597665 0.01590325]
 [0.01226412 0.01205056 0.97568524]], shape=(3, 3), dtype=float32)
