In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load data to Dataframe
df_data = pd.read_csv(r'BLOOMBERGDATA.csv')
df_data = df_data.dropna()
df_data = df_data.replace(np.inf,999999)

# Create Input Variable Dataframe
X = df_data.drop(columns = ['Name','Gr PoP of  Price over 5 Years','Ticker','Check Values','Industry'])

# Create Output Dataframe
y = df_data['Check Values']

# Split Dataframes to train and test 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.5)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Create a model Tree with 5000 decision nodes 
model = RandomForestClassifier(n_estimators=5000)
model.fit(X_train, y_train)

predictions = model.predict(X_test)

# Create a predictions dataframe of X_Test
predict_df = pd.DataFrame(predictions)
predict_df = predict_df.rename(columns={0: 'Predicted Values'})

In [3]:
yTestArray = np.array(y_test)
test_df = pd.DataFrame(yTestArray)
test_df = test_df.rename(columns={0: 'Real Values'})
vals = pd.concat([test_df,predict_df],axis=1)
vals.to_csv(r'predicted_test.csv')

# Create a 'Test Passed' column to see which predictions are equal to the Real Values
vals['Test Passed'] = [1 if vals.loc[ei,'Predicted Values']==vals.loc[ei,'Real Values'] else 0 for ei in vals.index]

vals

Unnamed: 0,Real Values,Predicted Values,Test Passed
0,UNDERPERFORM,UNDERPERFORM,1
1,ABOVE 200,UNDERPERFORM,0
2,UNDERPERFORM,UNDERPERFORM,1
3,UNDERPERFORM,UNDERPERFORM,1
4,UNDERPERFORM,UNDERPERFORM,1
...,...,...,...
110,UNDERPERFORM,UNDERPERFORM,1
111,UNDERPERFORM,UNDERPERFORM,1
112,UNDERPERFORM,UNDERPERFORM,1
113,ABOVE 100,UNDERPERFORM,0


In [4]:
# Calculate the percentage of tests that passed
print(vals['Test Passed'].sum()/vals.count())

Real Values         0.443478
Predicted Values    0.443478
Test Passed         0.443478
dtype: float64


In [5]:
underperforming = vals.set_index(['Real Values'])
underperforming = underperforming.loc['UNDERPERFORM']
underperforming

Unnamed: 0_level_0,Predicted Values,Test Passed
Real Values,Unnamed: 1_level_1,Unnamed: 2_level_1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1
UNDERPERFORM,UNDERPERFORM,1


In [6]:
y_pass = underperforming['Test Passed'].sum()
x = underperforming.count()

# print the amount of tests passed when the 'Real Value' is UNDERPERFORMING
y_pass/x

Predicted Values    0.818182
Test Passed         0.818182
dtype: float64

In [7]:
top_performing_100 = vals
top_performing_100 = top_performing_100.loc[top_performing_100['Predicted Values'] =='ABOVE 100']
top_performing_100['Check Top Passed'] = [1 if top_performing_100.loc[ei,'Real Values'] == 'UNDERPERFORM' else 0 for ei in top_performing_100.index]
y_pass = top_performing_100['Check Top Passed'].sum()
x = top_performing_100.count()

#Print the amount of tests that beat the market (Above 100% over 5 years) when predicted with Above 100 returns

1 - y_pass/x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Real Values         0.6
Predicted Values    0.6
Test Passed         0.6
Check Top Passed    0.6
dtype: float64

In [8]:
top_performing_200 = vals
top_performing_200 = top_performing_200.loc[top_performing_200['Predicted Values'] =='ABOVE 200']
top_performing_200['Check Top Passed'] = [1 if top_performing_200.loc[ei,'Real Values'] == 'UNDERPERFORM' else 0 for ei in top_performing_200.index]
y_pass = top_performing_200['Check Top Passed'].sum()
x = top_performing_200.count()

#Print the amount of tests that beat the market (Above 100% over 5 years) when predicted with Above 200 returns

1 - y_pass/x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Real Values         0.666667
Predicted Values    0.666667
Test Passed         0.666667
Check Top Passed    0.666667
dtype: float64

In [9]:
top_performing_300 = vals
top_performing_300 = top_performing_300.loc[top_performing_300['Predicted Values'] =='ABOVE 300']
top_performing_300['Check Top Passed'] = [1 if top_performing_300.loc[ei,'Real Values'] == 'UNDERPERFORM' else 0 for ei in top_performing_300.index]
y_pass = top_performing_300['Check Top Passed'].sum()
x = top_performing_300.count()

#Print the amount of tests that beat the market (Above 100% over 5 years) when predicted with Above 300 returns

y_pass/x

Real Values        NaN
Predicted Values   NaN
Test Passed        NaN
Check Top Passed   NaN
dtype: float64

In [10]:
top_performing_400 = vals
top_performing_400 = top_performing_400.loc[top_performing_400['Predicted Values'] =='ABOVE 400']
top_performing_400['Check Top Passed'] = [1 if top_performing_400.loc[ei,'Real Values'] == 'UNDERPERFORM' else 0 for ei in top_performing_400.index]
y_pass = top_performing_400['Check Top Passed'].sum()
x = top_performing_400.count()


#Print the amount of tests that beat the market (Above 100% over 5 years) when predicted with Above 400 returns

1 - y_pass/x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Real Values         1.0
Predicted Values    1.0
Test Passed         1.0
Check Top Passed    1.0
dtype: float64