# Métodos de agrupamiento para genes de cáncer (Bioinformática)

In [1]:
# Data structure and plots
import numpy as np
import pandas as pd
import time

# Preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA  # Dimensionality reduction

# Graphics
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.rcParams["figure.figsize"] = (12, 12)

# Import Dataset Function

In [2]:
def importdata(path="../Datasets/TCGA-PANCAN-Hiseq-198x200(801x20531).csv"):
    """Load the dataset CSV into a DataFrame and print a short summary.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the path this notebook was
        originally written against, so a bare ``importdata()`` call behaves
        as before; pass another location when the file lives elsewhere.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first CSV column used as the row index.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when ``path`` does not exist.
    """
    df = pd.read_csv(path, index_col=0)
    # Print the dataset size and shape
    print ("Dataset Length: ", len(df))
    print ("Dataset Shape: ", df.shape)
    # Print the first observations of the dataset
    print ("Dataset: \n", df.head())
    return df

In [3]:
def cal_accuracy(i, y_test, y_pred):
    """Plot the confusion matrix and print the accuracy and classification report.

    Parameters
    ----------
    i : int or str
        Identifier forwarded to ``plt.figure`` to select/create the figure
        that the heatmap is drawn on.
    y_test : array-like
        Reference labels.
    y_pred : array-like
        Labels to evaluate against ``y_test``.

    Returns
    -------
    None
        The results are shown as a figure and printed to stdout.
    """
    # seaborn is only needed for the heatmap, so it is imported locally
    # (as in the original cell) rather than at the top of the notebook.
    import seaborn as sns

    cm = confusion_matrix(y_test, y_pred)

    plt.figure(i)
    # fmt="d" prints the integer counts as-is; the default float format
    # would show counts of 100 or more in scientific notation.
    sns.heatmap(cm, annot=True, fmt="d")
    # sklearn's confusion_matrix puts true labels on rows and predictions on columns.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()

    print ("Accuracy : \n",
    accuracy_score(y_test,y_pred)*100)

    print("Report : \n",
    classification_report(y_test, y_pred))

# Function to split the dataset

In [4]:
def splitdataset(balance_data, feature_cols=slice(1, 200), label_col=201,
                 test_size=0.3, random_state=100):
    """Separate features from the label column and make a train/test split.

    Parameters
    ----------
    balance_data : pandas.DataFrame
        Full dataset; columns are addressed by position through ``.values``.
    feature_cols : slice or sequence of int, optional
        Positional selection of the feature columns. Defaults to positions
        1..199, the selection the notebook originally hard-coded.
    label_col : int, optional
        Position of the label column (default 201, as originally hard-coded).
    test_size : float, optional
        Fraction of rows held out for testing (default 0.3).
    random_state : int, optional
        Seed passed to ``train_test_split`` (default 100).

    Returns
    -------
    tuple
        ``(X, Y, X_train, X_test, y_train, y_test, classes)`` where ``Y``
        holds the labels as strings and ``classes`` is the array of distinct
        labels found by ``LabelEncoder``.
    """
    values = balance_data.values

    # Separating the target variable.
    # NOTE(review): with the defaults, positional columns 0 and 200 are used
    # neither as features nor as the label -- confirm this is intentional.
    X = values[:, feature_cols]
    Y = values[:, label_col].astype('str')

    # The encoder is only used to discover the distinct class names.
    le = preprocessing.LabelEncoder()
    le.fit(Y)
    classes = le.classes_
    print("clases: \n", classes)

    print("Dataset using: ", np.shape(X))
    # Splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    return X, Y, X_train, X_test, y_train, y_test, classes

# Main Code

## Preprocessing and train/test split

In [5]:
# Print a two-line banner, then load the CSV and build the full and
# train/test arrays that the following cells work with.
banner = "-------------------------------------------------------------------------\n"
print(banner)
print(banner)
print("Data form csv")

data = importdata()
X, Y, X_train, X_test, y_train, y_test, classes = splitdataset(data)

# Feature scaling is currently disabled; the original lines are kept for reference:
#st_x= StandardScaler()
#x_train= st_x.fit_transform(X_train)
#x_test= st_x.transform(X_test)

-------------------------------------------------------------------------

-------------------------------------------------------------------------

Data form csv


FileNotFoundError: [Errno 2] No such file or directory: '../Datasets/TCGA-PANCAN-Hiseq-198x200(801x20531).csv'

In [None]:
# Integer-encode the training labels: `classes` receives the sorted distinct
# labels and Yn_train[k] is the position of y_train[k] inside `classes`.
classes, Yn_train = np.unique(y_train, return_inverse=True)

# Show the raw labels, the class list and the encoded labels, one per line.
print(y_train, classes, Yn_train, sep="\n")

In [None]:
# Number of principal components to keep.
C=12

# Fit the PCA once on the training data. The seed (same value as the
# train/test split) keeps the result reproducible if sklearn selects its
# randomized SVD solver for this data shape.
pca = PCA(n_components=C, random_state=100).fit(X_train)

# Cumulative explained variance as a function of the number of components.
plt.figure(1)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

# Project the training data with the model fitted above. The original called
# fit_transform here, which refitted the PCA a second time.
X_pca=pca.transform(X_train)

## 3D scatter of the first 3 PCA components

In [None]:
# 3D scatter of the first three PCA components of the training data,
# coloured by the integer-encoded true class.
fig = plt.figure(2)
ax = plt.axes(projection="3d")

# One colour per encoded class label.
colores = ['blue', 'red', 'green', 'black', 'cyan']
assigned = [colores[label] for label in Yn_train]

print(X_pca.shape)

pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
ax.scatter3D(pc1, pc2, pc3, c=assigned, s=60)
plt.title("simple 3D scatter plot with 5 cancer classes")

plt.show()

In [None]:
# Fit one k-means model per candidate cluster count (k = 1..9) on the PCA
# projection of the training data and record each model's score on that data.
Nc = range(1, 10)

# random_state makes the clustering (and every later cell that indexes into
# `kmeans`) reproducible; n_init is set explicitly because its default
# differs between scikit-learn versions.
kmeans = [KMeans(n_clusters=k, n_init=10, random_state=100) for k in Nc]

# kmeans[j] is fitted in place here, so later cells can call .predict on it.
score = [model.fit(X_pca).score(X_pca) for model in kmeans]

In [None]:
# Elbow curve: k-means score against the number of clusters tried.
elbow_ax = plt.gca()
elbow_ax.plot(Nc, score)
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('Score')
elbow_ax.set_title('Elbow Curve')
plt.show()

In [None]:
# Cluster assignment of the training projection. kmeans[4] is the model built
# with n_clusters=5, because the cluster counts in Nc start at 1.
k_labels = kmeans[4].predict(X_pca)

fig = plt.figure()
ax = plt.axes(projection="3d")

colores = ['blue', 'red', 'green', 'black', 'cyan', 'yellow', 'orange']
# Colour lists: `assigned` follows the true class, `predicted` the cluster id.
# Cluster ids are arbitrary, so equal colours do not by themselves imply agreement.
predicted = [colores[cluster] for cluster in k_labels]
assigned = [colores[label] for label in Yn_train]

pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
# Large circles carry the true class; small stars on top carry the cluster.
ax.scatter3D(pc1, pc2, pc3, marker='o', c=assigned, s=100)
ax.scatter3D(pc1, pc2, pc3, marker='*', c=predicted, s=20)
plt.title("Prediction with 5 classes with train dataset")

plt.show()

In [None]:
# Encoded true labels followed by the raw cluster ids, for visual comparison.
print(Yn_train, k_labels, sep="\n")

## Reetiquetamos los datos para realizar la matriz de confusión y ver qué tan bien se realizó el agrupamiento con respecto a las secuencias correctas

In [None]:
# Relabel every cluster with the true class that occurs most often inside it,
# so cluster ids become comparable with the encoded class labels.
k_labels_matched = np.empty_like(k_labels)
nums = np.unique(Yn_train)

for p in np.unique(k_labels):
    members = k_labels == p
    # Number of members of cluster p that belong to each true class.
    match_nums = [np.sum(members * (Yn_train == t)) for t in nums]
    majority_class = nums[np.argmax(match_nums)]
    k_labels_matched[members] = majority_class

In [None]:
# Encoded true labels followed by the relabelled cluster assignments.
print(Yn_train, k_labels_matched, sep="\n")

In [None]:
# Evaluate the clustering on the training data: compare the encoded true
# labels with the relabelled cluster assignments. "Kmeans" is passed as
# cal_accuracy's first argument, which it forwards to plt.figure.
cal_accuracy("Kmeans", Yn_train, k_labels_matched)

## Test dataset with kmeans

In [None]:
# Project the held-out samples with the PCA fitted on the training data and
# assign them to clusters with the same model used for the training
# projection (kmeans[4]).
Xt_pca = pca.transform(X_test)
kt_labels = kmeans[4].predict(Xt_pca)

# Integer-encode the test labels.
# NOTE(review): this encoding is derived from y_test alone and overwrites
# `classes`; it matches the training encoding only if both splits contain the
# same set of labels -- confirm.
classes, Yn_test = np.unique(y_test, return_inverse=True)

print(kt_labels)

In [None]:
# 3D scatter of the first three PCA components of the TEST data: circles are
# coloured by true class, '+' markers by the k-means cluster id.
fig = plt.figure()
ax = plt.axes(projection ="3d")

# Same palette as the training-set prediction plot, so a given class/cluster
# index gets the same colour in both figures (the previous 5-colour list here
# mapped index 2 to cyan instead of green).
colores=['blue','red','green','black','cyan','yellow','orange']
predicted=[colores[cluster] for cluster in kt_labels]
assigned=[colores[label] for label in Yn_test]

ax.scatter3D(Xt_pca[:, 0], Xt_pca[:, 1], Xt_pca[:, 2], marker='o',c=assigned,s=150)
ax.scatter3D(Xt_pca[:, 0], Xt_pca[:, 1], Xt_pca[:, 2], marker='+',c=predicted,s=150)
# This figure shows the held-out data; the title previously said "train dataset".
plt.title("Prediction with 5 classes with test dataset")

# show plot
plt.show()

In [None]:
# Relabel every test cluster with the true class that occurs most often inside
# it, then score the relabelled assignments against the encoded test labels.
kt_labels_matched = np.empty_like(kt_labels)
nums = np.unique(Yn_test)

# Iterate over the cluster ids present in the TEST assignments. The original
# looped over np.unique(k_labels), i.e. the training assignments.
# NOTE(review): the cluster-to-class mapping is derived from the test labels
# themselves; reusing the mapping learned on the training data would give a
# stricter evaluation -- confirm which is intended.
for p in np.unique(kt_labels):
    members = kt_labels == p
    # Number of members of cluster p that belong to each true class.
    match_nums = [np.sum(members * (Yn_test == t)) for t in nums]
    print(match_nums)
    kt_labels_matched[members] = nums[np.argmax(match_nums)]

print(kt_labels)
print(Yn_test)
print(kt_labels_matched)
cal_accuracy("KmeansTest", Yn_test, kt_labels_matched)

In [None]:
import scipy.cluster.hierarchy as sch

# Hierarchical clustering of the (un-projected) training features with Ward
# linkage, drawn as a dendrogram.
ward_linkage = sch.linkage(X_train, method="ward")
dendrogram = sch.dendrogram(ward_linkage)

plt.title('Dendrogram')
plt.xlabel('Sequences')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
# Save function
#import pickle
#pickle.dump(kMeans, open("kmeans.pkl", 'wb'))