# Clustering Retinoblastoma Cells

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

## Data Extraction

In [None]:
# Load the Y79 data CSV into the `cell` DataFrame (the path is relative to the
# notebook's working directory).
cell = pd.read_csv('../input/y79-retinoblastoma-cells/Y79_data.csv')

In [None]:
# Dimensions of the loaded table as (rows, columns).
cell.shape

In [None]:
# Preview the first rows of the table (pandas shows five by default).
cell.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
cell.describe()

In [None]:
# Column names, dtypes and non-null counts.
cell.info()

In [None]:
# Count missing values per column. This only reports them; nothing is
# imputed or dropped here.
cell.isna().sum()

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are filtered out explicitly: older pandas skipped them
# silently inside DataFrame.corr(), but pandas >= 2.0 raises on them instead.
plt.figure(figsize=(10, 6))
numeric_cols = cell.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_cols.corr(), annot=True, cmap='OrRd')

## Data Selection

In [None]:
# Keep only the two columns used for clustering, in this order:
# diameter first, Vmb second.
df = cell.loc[:, ['diameter', 'Vmb']]
df.head()

## K-Means Clustering Model

In [None]:
# Separate the remaining feature column from Vmb, then hold out 30% of the
# rows with a fixed seed so the split is repeatable.
X = df.drop(columns='Vmb')
y = df['Vmb']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
# Report the shape of each partition: train X, train y, test X, test y.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [None]:
# Fit K-Means on the training features only. K-Means is unsupervised:
# scikit-learn's KMeans.fit ignores its `y` argument, so it is not passed.
# random_state pins the centroid initialisation so reruns give the same
# clusters; n_init is set explicitly because its default differs between
# scikit-learn releases.
kmc = KMeans(n_init=10, random_state=1)
kmc.fit(X_train)
print(kmc)

In [None]:
# Assign each held-out sample to its nearest learned centroid.
# NOTE: despite the name, y_pred holds cluster indices, not predicted Vmb values.
y_pred = kmc.predict(X_test)
print(y_pred)

In [None]:
# Cluster the full two-column frame into 12 groups and print the centroids.
# Centroid columns follow df's column order. random_state and n_init are set
# explicitly so the clustering is reproducible across reruns and
# scikit-learn versions.
# NOTE(review): the features are clustered unscaled, so the column with the
# larger numeric range dominates the distance — consider standardising.
kmeans = KMeans(n_clusters=12, n_init=10, random_state=1).fit(df)
result = kmeans.cluster_centers_
print(result)

In [None]:
# Scatter the samples coloured by cluster label, with centroids in red.
# The centroids come back in the column order the model was fit on, which is
# not the (x, y) order used for this plot. Labelling them with df's column
# names lets each centroid be drawn on the same axes as the data
# (x = Vmb, y = diameter) instead of with the two coordinates swapped.
plt.figure(figsize=(10, 6))
centers = pd.DataFrame(result, columns=df.columns)
plt.scatter(df['Vmb'], df['diameter'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centers['Vmb'], centers['diameter'], c='red', s=50)
plt.title("Vmb ~ Diameter")
plt.xlabel("Vmb")
plt.ylabel("Diameter")
plt.show()

In [None]:
# Distribution of Vmb: density-scaled histogram with a KDE overlay.
# sns.distplot is deprecated and slated for removal; histplot with these
# options is the replacement given in the seaborn migration notes.
plt.figure(figsize=(10, 6))
sns.histplot(df['Vmb'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title("Distribution of Vmb")
plt.show()