In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.neighbors import kneighbors_graph

### Install Plotly (`pip install plotly`) on your machine to render the interactive plots in this notebook.

In [None]:
# Read the feature table from the working directory, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer="data.csv")
data.head(n=5)

Unnamed: 0,cov1,cov2,cov3,cov4,cov5,cov6,cov7,sal_pur_rat,igst_itc_tot_itc_rat,lib_igst_itc_rat
0,0.997797,0.999888,0.215934,0.196713,0.0,0.955616,0.99881,-0.032581,1.761759,-0.054329
1,0.994004,0.979902,-0.337135,-0.248634,0.0,0.640812,0.553918,-0.032026,-0.629311,-0.053516
2,0.947603,0.455667,0.001743,0.12861,-0.004054,-0.162069,0.960601,-0.030209,1.535697,-0.054215
3,0.396577,0.919933,0.496451,0.576824,-0.340718,0.802363,0.67371,-0.032058,0.44916,-0.054126
4,0.999893,0.327615,0.700477,0.315601,0.0,0.300785,0.979009,-0.032224,1.762049,-0.05433


In [None]:
# (n_rows, n_columns) of the loaded table.
data.shape

(1199, 10)

In [None]:
# Plain NumPy matrix of every column in `data`; this is what t-SNE and KMeans are fitted on.
X = data.to_numpy()

## Exploratory Data Analysis

In [None]:
!pip3 install pandas_profiling
from pandas_profiling import ProfileReport
profile = ProfileReport(data, title="Pandas Profiling Report", progress_bar=False)
profile.to_file("data_analysis.html")
profile

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas_profiling
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ydata-profiling
  Downloading ydata_profiling-4.1.2-py2.py3-none-any.whl (345 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.9/345.9 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multimethod<1.10,>=1.4
  Downloading multimethod-1.9.1-py3-none-any.whl (10 kB)
Collecting htmlmin==0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tqdm<4.65,>=4.48.2
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typeguard<2.14,>=2.13

  from pandas_profiling import ProfileReport




## Visualize the entire dataset with t-SNE

In [None]:
# Embed the feature matrix into 3 dimensions with t-SNE, purely for visualisation.
# The fixed random_state keeps the embedding reproducible between runs.
tsne_params = dict(perplexity=40, n_components=3, init='pca', random_state=23)
try:
    # scikit-learn >= 1.5 calls the iteration budget `max_iter`.
    tsne_model = TSNE(max_iter=500, **tsne_params)
except TypeError:
    # Older scikit-learn releases only know the previous name, `n_iter`.
    tsne_model = TSNE(n_iter=500, **tsne_params)
new_values = tsne_model.fit_transform(X)

In [None]:
def plot_data(new_values, labels):
    """Show an interactive 3-D scatter plot of points coloured by their label.

    Parameters
    ----------
    new_values : array-like of shape (n_samples, 3)
        Coordinates to plot; the three columns become the x, y and z axes.
    labels : array-like of length n_samples
        One label per point (for example ``kmeans.labels_``), used for colouring.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    output = pd.DataFrame(new_values, columns=['x', 'y', 'z'])
    output['class'] = labels
    # Labels are categories, not magnitudes. Plotly Express maps a numeric colour
    # column onto a continuous colour bar, so store the labels as strings to get one
    # distinct colour and one legend entry per class.
    output['class'] = output['class'].astype(str)
    fig = px.scatter_3d(
        output,
        x='x',
        y='y',
        z='z',
        opacity=1.0,
        color='class',
        # Sorted legend instead of "order of first appearance in the data".
        category_orders={'class': sorted(output['class'].unique())},
    )
    fig.update_traces(marker=dict(size=2), selector=dict(mode='markers'))
    fig.update_layout(margin={'l': 0, 'r': 0, 'b': 100, 't': 0}, width=800, height=600)
    fig.show()

In [None]:
# Baseline view of the embedding: every point gets the same constant label (1.0).
plot_data(new_values, np.ones(new_values.shape[0]))

## K-Means Clustering

In [None]:
# Cluster the feature matrix X into 3 groups and colour the t-SNE embedding by cluster.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto') in 1.4;
# stating it keeps the result the same across versions and avoids the FutureWarning.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
plot_data(new_values, kmeans.labels_)

In [None]:
# Cluster the feature matrix X into 4 groups and colour the t-SNE embedding by cluster.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto') in 1.4;
# stating it keeps the result the same across versions and avoids the FutureWarning.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X)
plot_data(new_values, kmeans.labels_)

In [None]:
# Cluster the feature matrix X into 5 groups and colour the t-SNE embedding by cluster.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto') in 1.4;
# stating it keeps the result the same across versions and avoids the FutureWarning.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X)
plot_data(new_values, kmeans.labels_)