# Unsupervised Learning Models

### Loading Dataset from AWS Database

In [1]:
from config import db_password

In [2]:
import psycopg2
connection = psycopg2.connect(
    host = 'launch-it-1.cyo6pvehqvyz.us-east-1.rds.amazonaws.com',
    port = 5432,
    user = 'postgres',
    password = db_password,
    database='launch-it-1'
    )
cursor=connection.cursor()

In [24]:
# using pandas to execute SQL queries

import pandas as pd

sql = """
SELECT * FROM launchit  
"""
project_df = pd.read_sql(sql, con=connection)
project_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
2,2019-12-23,8.79,8.79,7.25,7.81,7.81,117400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
3,2019-12-24,7.5,7.64,6.0,6.41,6.41,102800.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
4,2019-12-26,6.42,7.72,6.42,7.41,7.41,78400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0


### Data Processing

In [25]:
# List the column names
project_df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'TCKR',
       'sector', 'industry', 'country', 'growth_rate', 'target_median_price',
       'target_mean_price', 'target_high_price'],
      dtype='object')

In [26]:
# View the data types
project_df.dtypes

Date                   datetime64[ns]
Open                          float64
High                          float64
Low                           float64
Close                         float64
Adj Close                     float64
Volume                        float64
TCKR                           object
sector                         object
industry                       object
country                        object
growth_rate                   float64
target_median_price           float64
target_mean_price             float64
target_high_price             float64
dtype: object

In [27]:
# List and count all of the values in the Name column
project_df.TCKR.value_counts()

INTZ    7673
ALRS    4944
EVO     4722
ABST    3457
MEDS    2204
        ... 
EE       143
ACON     137
TNON     134
HNVR     124
PFHC     121
Name: TCKR, Length: 547, dtype: int64

In [28]:
# Find null values
project_df.isnull().sum()

Date                   0
Open                   0
High                   0
Low                    0
Close                  0
Adj Close              0
Volume                 0
TCKR                   0
sector                 0
industry               0
country                0
growth_rate            0
target_median_price    0
target_mean_price      0
target_high_price      0
dtype: int64

In [29]:
# Find duplicate entries
project_df.duplicated().sum()

0

In [30]:
# Describe the project column to observe range
project_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,growth_rate,target_median_price,target_mean_price,target_high_price
count,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0
mean,26.583393,27.382844,25.748342,26.540951,26.366506,1569110.0,9.231909,42.769452,42.769452,61.148451
std,41.118378,42.49606,39.734576,40.961183,40.96719,6202717.0,91.521475,215.51016,215.51016,306.290318
min,0.001,0.001,0.001,0.001,0.001,0.0,-0.984,0.68,0.68,0.75
25%,7.73,8.0,7.47725,7.71,7.49,69200.0,0.02,9.5,9.5,14.5
50%,16.67,17.139999,16.16,16.639999,16.299999,308850.0,0.251,19.05,19.05,26.0
75%,30.0,30.879999,29.118751,29.99,29.77,1059300.0,0.589,32.0,32.0,47.0
max,1531.0,2299.0,1227.0,1277.0,1277.0,412008300.0,1404.333,4164.94,4164.94,6597.92


### Transform DataFrame

In [88]:
# Make a new copy of the project_df
df1 = project_df.copy()

In [89]:
# Transform volume
df1.Volume = project_df.Volume / 100000
df1.Volume.head()

0    0.894
1    5.030
2    1.174
3    1.028
4    0.784
Name: Volume, dtype: float64

In [90]:
# Describe the Volume column to observe range
df1.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,growth_rate,target_median_price,target_mean_price,target_high_price
count,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0
mean,26.583393,27.382844,25.748342,26.540951,26.366506,15.691103,9.231909,42.769452,42.769452,61.148451
std,41.118378,42.49606,39.734576,40.961183,40.96719,62.027173,91.521475,215.51016,215.51016,306.290318
min,0.001,0.001,0.001,0.001,0.001,0.0,-0.984,0.68,0.68,0.75
25%,7.73,8.0,7.47725,7.71,7.49,0.692,0.02,9.5,9.5,14.5
50%,16.67,17.139999,16.16,16.639999,16.299999,3.0885,0.251,19.05,19.05,26.0
75%,30.0,30.879999,29.118751,29.99,29.77,10.593,0.589,32.0,32.0,47.0
max,1531.0,2299.0,1227.0,1277.0,1277.0,4120.083,1404.333,4164.94,4164.94,6597.92


In [91]:
df1.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'TCKR',
       'sector', 'industry', 'country', 'growth_rate', 'target_median_price',
       'target_mean_price', 'target_high_price'],
      dtype='object')

Non-numeric columns = 'Date', 'TCKR', 'sector', 'industry', 'country'

In [92]:
# Drop the Date column
df1 = project_df.drop(labels='Date', axis=1)
df1.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
1,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
2,8.79,8.79,7.25,7.81,7.81,117400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
3,7.5,7.64,6.0,6.41,6.41,102800.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
4,6.42,7.72,6.42,7.41,7.41,78400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0


In [93]:
# Label Encode non-numeric columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1['TCKR'] = le.fit_transform(df1['TCKR'])
df1['sector'] = le.fit_transform(df1['sector'])
df1['industry'] = le.fit_transform(df1['industry'])
df1['country'] = le.fit_transform(df1['country'])
df1.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,11.0,11.96,10.5,10.7,10.7,89400.0,242,4,59,10,1.206,15.0,15.0,15.0
1,10.808,12.49,9.25,9.65,9.65,503000.0,242,4,59,10,1.206,15.0,15.0,15.0
2,8.79,8.79,7.25,7.81,7.81,117400.0,242,4,59,10,1.206,15.0,15.0,15.0
3,7.5,7.64,6.0,6.41,6.41,102800.0,242,4,59,10,1.206,15.0,15.0,15.0
4,6.42,7.72,6.42,7.41,7.41,78400.0,242,4,59,10,1.206,15.0,15.0,15.0


In [94]:
# Describe transformed dataframe
df1.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
count,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0,294310.0
mean,26.583393,27.382844,25.748342,26.540951,26.366506,1569110.0,266.09437,5.755635,44.165771,21.495478,9.231909,42.769452,42.769452,61.148451
std,41.118378,42.49606,39.734576,40.961183,40.96719,6202717.0,156.391223,2.569531,27.029823,8.386829,91.521475,215.51016,215.51016,306.290318
min,0.001,0.001,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,-0.984,0.68,0.68,0.75
25%,7.73,8.0,7.47725,7.71,7.49,69200.0,134.0,5.0,14.0,25.0,0.02,9.5,9.5,14.5
50%,16.67,17.139999,16.16,16.639999,16.299999,308850.0,263.0,6.0,48.0,26.0,0.251,19.05,19.05,26.0
75%,30.0,30.879999,29.118751,29.99,29.77,1059300.0,397.0,9.0,76.0,26.0,0.589,32.0,32.0,47.0
max,1531.0,2299.0,1227.0,1277.0,1277.0,412008300.0,546.0,10.0,92.0,27.0,1404.333,4164.94,4164.94,6597.92


In [95]:
# Drop the stock data columns
df1 = df1.drop(labels=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'TCKR'], axis=1)
df1.head()

Unnamed: 0,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,4,59,10,1.206,15.0,15.0,15.0
1,4,59,10,1.206,15.0,15.0,15.0
2,4,59,10,1.206,15.0,15.0,15.0
3,4,59,10,1.206,15.0,15.0,15.0
4,4,59,10,1.206,15.0,15.0,15.0


In [96]:
df1

Unnamed: 0,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,4,59,10,1.206,15.0,15.0,15.0
1,4,59,10,1.206,15.0,15.0,15.0
2,4,59,10,1.206,15.0,15.0,15.0
3,4,59,10,1.206,15.0,15.0,15.0
4,4,59,10,1.206,15.0,15.0,15.0
...,...,...,...,...,...,...,...
294305,2,49,26,0.325,7.0,7.0,7.0
294306,2,49,26,0.325,7.0,7.0,7.0
294307,2,49,26,0.325,7.0,7.0,7.0
294308,2,49,26,0.325,7.0,7.0,7.0


In [97]:
# Find duplicate entries
df1.duplicated().sum()

293763

In [98]:
# Drop duplicates
df1 = df1.drop_duplicates()
df1

Unnamed: 0,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,4,59,10,1.206,15.00,15.00,15.0
726,9,76,5,0.172,1.59,1.59,2.5
1456,9,76,26,0.375,75.00,75.00,100.0
2186,9,76,26,1.558,205.00,205.00,300.0
2917,7,1,5,0.202,4.00,4.00,4.0
...,...,...,...,...,...,...,...
293317,6,12,25,0.000,5.00,5.00,5.0
293502,9,17,26,0.900,17.75,17.75,20.0
293698,2,35,26,0.198,6.00,6.00,6.0
293896,5,5,26,-0.078,29.00,29.00,39.0


### K-mean Algorithm

In [99]:
# Import dependencies
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [100]:
# Function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model

    # Fitting model
    model.fit(df)

    # Add a new class column
    df["class"] = model.labels_

In [101]:
# Try plotting two variables
df1.hvplot.scatter(x='sector', y='growth_rate')

In [103]:
test_cluster_amount(df1, 2)
df1.hvplot.scatter(x='sector', y='country', by='class')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [105]:
fig = px.scatter_3d(
    df1,
    x='industry',
    y='sector',
    z='country',
    color='class',
    symbol='class',
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [113]:
# Drop the class column
df1 = df1.drop(labels=['class'], axis=1)
df1.head()

Unnamed: 0,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,4,59,10,1.206,15.0,15.0,15.0
726,9,76,5,0.172,1.59,1.59,2.5
1456,9,76,26,0.375,75.0,75.0,100.0
2186,9,76,26,1.558,205.0,205.0,300.0
2917,7,1,5,0.202,4.0,4.0,4.0


### Elbow Curve

In [114]:
# Set up for elbow curve
inertia = []
k = list(range(1, 11))

In [115]:
# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df1)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [116]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

### Pricipal Component Analysis

In [109]:
# Import dependencies
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [117]:
# Standardize data with StandardScaler
df1_scaled = StandardScaler().fit_transform(df1)
print(df1_scaled[0:5])

[[-0.67483379  0.50727479 -1.48518124 -0.07210751 -0.12490989 -0.12490989
  -0.14347298]
 [ 1.23579811  1.1359764  -2.1060878  -0.08598442 -0.18213564 -0.18213564
  -0.18101521]
 [ 1.23579811  1.1359764   0.50171974 -0.08326004  0.13113373  0.13113373
   0.11181419]
 [ 1.23579811  1.1359764   0.50171974 -0.06738346  0.6858949   0.6858949
   0.71248987]
 [ 0.47154535 -1.6377072  -2.1060878  -0.0855818  -0.17185122 -0.17185122
  -0.17651014]]


In [118]:
# Initialize PCA model
pca = PCA(n_components=5)

In [119]:
# Get principal components for the df1 data
df1_pca = pca.fit_transform(df1_scaled)

In [120]:
# Transform PCA data to a DataFrame
df_df1_pca = pd.DataFrame(
    data=df1_pca, columns=['PC1','PC2','PC3','PC4','PC5']
)
df_df1_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-0.211851,0.446897,-1.128799,0.952116,0.741696
1,-0.29902,-1.12257,-1.529409,1.904702,-0.207345
2,0.183606,-1.731744,0.272794,0.115013,-0.046124
3,1.170501,-1.746579,0.295282,0.116972,-0.028584
4,-0.224846,1.251701,-1.409877,1.105906,-1.615379


In [121]:
# Fetch the explained variance
pca.explained_variance_ratio_

array([0.42590823, 0.2040946 , 0.14758921, 0.13139752, 0.08814394])

In [122]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_df1_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [134]:
# Initialize the K-means model
model = KMeans(n_clusters=2, random_state=0)

# Fit the model
model.fit(df_df1_pca)

# Predict clusters
predictions = model.predict(df_df1_pca)

# Add the predicted class columns
df_df1_pca["class"] = model.labels_
df_df1_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,class
0,-0.211851,0.446897,-1.128799,0.952116,0.741696,0
1,-0.29902,-1.12257,-1.529409,1.904702,-0.207345,0
2,0.183606,-1.731744,0.272794,0.115013,-0.046124,0
3,1.170501,-1.746579,0.295282,0.116972,-0.028584,0
4,-0.224846,1.251701,-1.409877,1.105906,-1.615379,0


In [135]:
fig = px.scatter_3d(
    df_df1_pca,
    x='PC1',
    y='PC2',
    z='PC3',
    color='class',
    symbol='class',
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [139]:
# Drop the class column
df_df1_pca = df_df1_pca.drop(labels=['class'], axis=1)
df1.head()

Unnamed: 0,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,4,59,10,1.206,15.0,15.0,15.0
726,9,76,5,0.172,1.59,1.59,2.5
1456,9,76,26,0.375,75.0,75.0,100.0
2186,9,76,26,1.558,205.0,205.0,300.0
2917,7,1,5,0.202,4.0,4.0,4.0


### Hierarchical Clustering

In [137]:
# Import dependencies
from sklearn.cluster import AgglomerativeClustering
import plotly.figure_factory as ff

In [140]:
# Create the dendrogram
fig = ff.create_dendrogram(df_df1_pca, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

In [141]:
# Agglomerative clustering
agg = AgglomerativeClustering(n_clusters=2)
model = agg.fit(df_df1_pca)

# Add a new class column to df_df1_pca
df_df1_pca['class'] = model.labels_
df_df1_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,class
0,-0.211851,0.446897,-1.128799,0.952116,0.741696,0
1,-0.29902,-1.12257,-1.529409,1.904702,-0.207345,0
2,0.183606,-1.731744,0.272794,0.115013,-0.046124,0
3,1.170501,-1.746579,0.295282,0.116972,-0.028584,0
4,-0.224846,1.251701,-1.409877,1.105906,-1.615379,0


In [142]:
fig = px.scatter_3d(
    df_df1_pca,
    x='PC1',
    y='PC2',
    z='PC3',
    color='class',
    symbol='class',
    width=800,
)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()