# Step A | Data Preprocessing

## Importing the data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the example stock_prices CSV under the input directory
stock_prices_csv = "../input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv"
# Read the CSV into a dataframe
stocks = pd.read_csv(stock_prices_csv)
# Show the first rows as a quick look at the available attributes
stocks.head()

## Imputing missing values

In [None]:
# Import the SimpleImputer class from the sklearn.impute library
from sklearn.impute import SimpleImputer
# Columns to impute: Volume and Open
impute_cols = ['Volume', 'Open']
# Mean-strategy imputer (this is the SimpleImputer default, spelled out here)
imputer = SimpleImputer(strategy='mean')
# Learn each column's mean and fill its missing values with it;
# fit_transform returns an ndarray, not a dataframe
hl_stocks = imputer.fit_transform(stocks.loc[:, impute_cols])
# Wrap the ndarray back into a pandas dataframe with the same column names
hl_df = pd.DataFrame(hl_stocks, columns=impute_cols)
# Count the missing values left in each column
hl_df.isna().sum()

## Data Plotting

# Step B | Feature Scaling

In [None]:
# Import StandardScaler class from sklearn.preprocessing library
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler that standardizes each column to zero mean and unit variance
z = StandardScaler()
# Fit on the imputed features (learns the per-column mean and std) and
# apply the standardization in the same call; the result is an ndarray
X = z.fit_transform(hl_df)
# Put the scaled values back into a dataframe with the Volume and Open columns
X_df = pd.DataFrame(data=X, columns=['Volume', 'Open'])

# Step C | Gaussian Mixture Clustering 

In [None]:
# Import KMeans class from sklearn.cluster library
from sklearn.cluster import KMeans
# Import GaussianMixture class from sklearn.mixture library
from sklearn.mixture import GaussianMixture

In [None]:
# Three-component Gaussian mixture. random_state pins the random
# initialisation so the cluster assignments are the same on every run.
EM = GaussianMixture(n_components = 3, random_state = 42)
# Select the two scaled feature columns by name: X_df gains a 'Cluster'
# column further down, and re-running this cell afterwards must not feed
# those old labels back into the model as a third feature.
gmm_features = X_df[['Volume', 'Open']]
# Fit the mixture model on the features
EM.fit(gmm_features)
# Hard assignment: the most likely component (0 to 2) for each observation
cluster = EM.predict(gmm_features)
cluster

In [None]:
# Soft assignment: for each observation, the probability of belonging to
# each mixture component.
# The feature columns are selected by name so this cell still works when it
# is re-run after the 'Cluster' column has been added to X_df; passing the
# whole frame would then no longer match the features the model was fit on.
cluster_p = EM.predict_proba(X_df[['Volume', 'Open']])
cluster_p

## Silhouette Coefficient

A metric used to calculate the **goodness** of a clustering technique. 

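Its value ranges from **-1 to 1**: values near 1 indicate dense, well-separated clusters, values near 0 indicate overlapping clusters, and negative values suggest that observations may have been assigned to the wrong cluster.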

In [None]:
# Import the silhouette_score function from the sklearn.metrics library
from sklearn.metrics import silhouette_score

In [None]:
# Print the Silhouette Coefficient computed by the silhouette_score function.
# Arguments are the scaled features and the cluster labels predicted earlier.
# The feature columns are selected by name so that a 'Cluster' column added
# to X_df by a later cell is never counted as a feature when this cell is
# re-run (that would distort the score).
print('Silhouette Coefficient:', silhouette_score(X_df[['Volume', 'Open']], cluster))

## Cluster Plotting

In [None]:
# Attach the predicted cluster labels to the scaled features as a new
# 'Cluster' column, so the plotting step can read them from the dataframe
X_df = X_df.assign(Cluster=cluster)

In [None]:
# Import plotnine library for grammar graphics
from plotnine import *
# Render matplotlib figures inline in the notebook instead of in separate windows
%matplotlib inline

In [None]:
# Scatter plot of X_df with Volume on the x-axis and Open on the y-axis,
# one colour per cluster.
# factor() makes plotnine treat the integer labels as discrete categories
# instead of mapping them onto a continuous colour gradient; labs() restores
# a plain 'Cluster' legend title.
(
    ggplot(X_df, aes(x='Volume', y='Open', color='factor(Cluster)'))
    + geom_point()
    + labs(color='Cluster')
)