In [3]:
# This version of Scikit-learn is needed for our PyCaret library
!pip install --user -U scikit-learn==0.23.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.23.2


In [2]:
!pip uninstall scikit-learn -y

# !pip install -U scikit-learn



In [4]:
import sklearn

sklearn.__version__

'0.23.2'

In [5]:
# Install PyCaret.
# If working on Colab, PyCaret library needs to be installed everytime we are trying to access it.

!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
# Check current version is up to date with PyCaret website

from pycaret.utils import version
version()

'2.3.10'

In [9]:
# Import local files to Google Colab
from google.colab import files
uploaded = files.upload()

Saving config.py to config (1).py


In [8]:
# Import dependencies
# from pycaret.regression import *
from pycaret.classification import *
import pandas as pd
import numpy as np 
import psycopg2
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
import imblearn
from config import db_password

In [10]:
# Create connection to AWS database

connection = psycopg2.connect(
    host = 'launch-it-1.cyo6pvehqvyz.us-east-1.rds.amazonaws.com',
    port = 5432,
    user = 'postgres',
    password = db_password,
    database='launch-it-1'
    )
cursor=connection.cursor()

In [33]:
# Using pandas to execute SQL queries
sql = """
SELECT * from launchit
"""
df = pd.read_sql(sql, con=connection)

In [34]:
# Display dataframe content
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,debtToEquity,currentRatio,forwardPE
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,12.272,2.707,-9.666667
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,12.272,2.707,-9.666667
2,2019-12-23,8.79,8.79,7.25,7.81,7.81,117400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,12.272,2.707,-9.666667
3,2019-12-24,7.5,7.64,6.0,6.41,6.41,102800.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,12.272,2.707,-9.666667
4,2019-12-26,6.42,7.72,6.42,7.41,7.41,78400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,12.272,2.707,-9.666667


In [35]:
# Check Data Types for all columns
df.dtypes

Date            datetime64[ns]
Open                   float64
High                   float64
Low                    float64
Close                  float64
Adj Close              float64
Volume                 float64
TCKR                    object
sector                  object
industry                object
country                 object
growth_rate            float64
debtToEquity           float64
currentRatio           float64
forwardPE              float64
dtype: object

In [14]:
# # Up for discussion, keep or remove. We could also use onehotencoder instead.
# # Convert sector,industry and country columns categories from a string to integer.
# # .cat.codes converts a category from a string representation into an integer representation.
# df['sector']=df['sector'].astype('category').cat.codes
# df['country']=df['country'].astype('category').cat.codes
# df['industry']=df['industry'].astype('category').cat.codes
# df.head(2)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,debtToEquity,currentRatio,forwardPE
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,4,58,10,1.206,12.272,2.707,-9.666667
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,4,58,10,1.206,12.272,2.707,-9.666667


In [36]:
# Calculate the correlation
df.corr()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,growth_rate,debtToEquity,currentRatio,forwardPE
Open,1.0,0.99563,0.998601,0.997769,0.997624,0.060514,0.033618,-0.028558,0.004966,-0.045869
High,0.99563,1.0,0.994052,0.996194,0.996062,0.062784,0.034063,-0.029083,0.005953,-0.045935
Low,0.998601,0.994052,1.0,0.999059,0.998898,0.058822,0.032975,-0.027849,0.003913,-0.045636
Close,0.997769,0.996194,0.999059,1.0,0.999851,0.061291,0.033762,-0.028575,0.005112,-0.046015
Adj Close,0.997624,0.996062,0.998898,0.999851,1.0,0.061302,0.033936,-0.031015,0.005459,-0.045704
Volume,0.060514,0.062784,0.058822,0.061291,0.061302,1.0,-0.007527,-0.019929,-0.005551,-0.010027
growth_rate,0.033618,0.034063,0.032975,0.033762,0.033936,-0.007527,1.0,-0.019604,0.033603,-0.006325
debtToEquity,-0.028558,-0.029083,-0.027849,-0.028575,-0.031015,-0.019929,-0.019604,1.0,-0.11376,-0.082439
currentRatio,0.004966,0.005953,0.003913,0.005112,0.005459,-0.005551,0.033603,-0.11376,1.0,-0.013715
forwardPE,-0.045869,-0.045935,-0.045636,-0.046015,-0.045704,-0.010027,-0.006325,-0.082439,-0.013715,1.0


In [38]:
# Create Correlation matrix based on target variable: In this case "growth_rate", "Volume" or "currentRatio"
corr_matrix = df.corr().round(3)
print(corr_matrix["currentRatio"].sort_values(ascending=False))

currentRatio    1.000
growth_rate     0.034
High            0.006
Open            0.005
Close           0.005
Adj Close       0.005
Low             0.004
Volume         -0.006
forwardPE      -0.014
debtToEquity   -0.114
Name: currentRatio, dtype: float64


In [None]:
# # PyCaret Regression Setup command:
# reg = setup(data = df, target = 'Volume', session_id = 789, train_size = 0.80,
#             # ignore_features=["TCKR","Adj Close"],
#             feature_selection = True,
#             transformation= True,
#             remove_outliers=True,
#             high_cardinality_features= ['country','sector']
#             )

In [39]:
# PyCaret Classification Setup Command:
clf = setup(data = df, target = 'currentRatio', session_id = 789, train_size = 0.80
            # ignore_features=["TCKR","Adj Close"]
            # feature_selection = True,
            # high_cardinality_features= ['country','sector']
            )

Unnamed: 0,Description,Value
0,session_id,789
1,Target,currentRatio
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(257624, 15)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=[],
                                      target='currentRatio',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'p

In [40]:
models()

INFO:logs:gpu_param set to False


Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [41]:
# best_clf_model = compare_models()
compare_models()

INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 2
INFO:logs:[]
INFO:logs:compare_models() succesfully completed......................................


[]