# Classification with flattened images and linear regression

In [1]:
import pandas as pd
import numpy as np
from ccc_nn_functions import str2array

csv_file = r'C:\Users\rz200\Documents\development\cell-SCT\classification\imported_CSV\dataframe_822'
df = pd.read_csv(csv_file)

In [2]:
def df_ignore_rows(df):
    indices_to_skip_img_wrong_shape = [i for i in range(len(df)) if str2array(df['pcna_crops'][i]).dtype is np.dtype('object')]  # skipping rows with shapes such as (7,)
    indices_to_skip_no_class = df[(df['G1_Phase'] == False) & (df['S_Phase'] == False) & (df['G2_M_Phase'] == False)].index

    rows_to_ignore = np.concatenate((indices_to_skip_img_wrong_shape, indices_to_skip_no_class), axis=0)
    df = df.drop(set(rows_to_ignore)).reset_index(drop=True) #dropping the rows to ignore

    return df

df = df_ignore_rows(df)

  return np.array(ast.literal_eval(s))


In [4]:
def get_crops_flat_pad(df,column_name):
    pcna_crops = []
    for i in range(len(df)): pcna_crops.append(str2array(df[column_name][i]))
    #We'll want to flatten all of these arrays
    pcna_crops_flat = []
    for i in range(len(pcna_crops)):
        pcna_crops_flat.append(pcna_crops[i].flatten())
    #Then we want to get the longest one
    max_shape = max([flat_crop.shape[0] for flat_crop in pcna_crops_flat])
    #Then we want to add 0s at the end of everyone that isn't as long as the longest one
    pcna_crops_flat_pad = []
    for i in range(len(pcna_crops_flat)):
        A = pcna_crops_flat[i]
        pad_size = max_shape - A.shape[0]
        new_arr = np.pad(A, (0, pad_size), 'constant')
        pcna_crops_flat_pad.append(new_arr)
    pcna_crops_flat_pad = np.array(pcna_crops_flat_pad)

    return pcna_crops_flat_pad

In [9]:
def get_cell_labels(df):
    g1_indices = df[(df['G1_Phase'] == True)].index
    s_indices = df[(df['S_Phase'] == True)].index
    g2_m_indices = df[(df['G2_M_Phase'] == True)].index

    #make an array that is the length of all of these indices put into one, that is made of 0s
    #replace the 0s accordingly by which phase index they correspond to

    cell_labels = np.arange(len(g1_indices)+len(s_indices)+len(g2_m_indices))

    np.put(cell_labels,g1_indices,np.zeros(len(g1_indices)))
    np.put(cell_labels,s_indices,np.ones(len(s_indices)))
    np.put(cell_labels,g2_m_indices,np.full(len(g2_m_indices),2))

    return cell_labels

cell_labels = get_cell_labels(df)

In [5]:
import time

start = time.time()
pcna_crops_flat_pad = get_crops_flat_pad(df,'pcna_crops')
print(time.time()-start)

115.0233781337738


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [6]:
#Get the crops for each channel
dapi_crops_flat_pad = get_crops_flat_pad(df,'dapi_crops')
edu_crops_flat_pad = get_crops_flat_pad(df,'edu_crops')
cyclina2_crops_flat_pad = get_crops_flat_pad(df,'cyclina2_crops')

In [25]:
def train_binary_model(images,labels):
    X_train, X_test, y_train, y_test = train_test_split(images,labels,test_size=0.2,random_state=1)
    reg_model = LinearRegression().fit(X_train,y_train)
    return reg_model

def get_model_accuracy(model,images,labels):
    X_train, X_test, y_train, y_test = train_test_split(images,labels,test_size=0.2,random_state=1)
    predictions = np.round(model.predict(X_test)).astype(int)
    correct = np.count_nonzero(y_test == predictions)
    return correct/len(X_test)

def get_binary_models(images,labels):
    g1_labels = [1 if label==0 else 0 for label in labels]
    s_labels = [1 if label==1 else 0 for label in labels]
    g2_m_labels = [1 if label==2 else 0 for label in labels]

    models = [train_binary_model(images,g1_labels),train_binary_model(images,s_labels),train_binary_model(images,g2_m_labels)]

    accuracies = [get_model_accuracy(models[0],images,g1_labels),get_model_accuracy(models[1],images,s_labels),get_model_accuracy(models[2],images,g2_m_labels)]

    return models, accuracies

In [26]:
print(len(dapi_crops_flat_pad))

79386


In [45]:
dapi_models, dapi_accuracies = get_binary_models(dapi_crops_flat_pad,cell_labels)
print(accuracies)

[0.6999622118654742, 0.7291220556745182, 0.7694923793928706]
[0.6999622118654742, 0.7291220556745182, 0.7694923793928706]


In [42]:
edu_models, edu_accuracies = get_binary_models(edu_crops_flat_pad,cell_labels)
print(accuracies)

[0.6999622118654742, 0.7291220556745182, 0.7694923793928706]


In [43]:
cyclina2_models, cyclina2_accuracies = get_binary_models(cyclina2_crops_flat_pad,cell_labels)
print(accuracies)

[0.6999622118654742, 0.7291220556745182, 0.7694923793928706]


In [44]:
pcna_models, pcna_accuracies = get_binary_models(pcna_crops_flat_pad,cell_labels)
print(accuracies)

[0.6999622118654742, 0.7291220556745182, 0.7694923793928706]


In [49]:
from tabulate import tabulate
table = [['Channel/Phase', 'G1', 'S', 'G2&M'],
         ['DAPI', round(dapi_accuracies[0],2), round(dapi_accuracies[1],2), round(dapi_accuracies[2],2)],
         ['EdU', round(edu_accuracies[0],2), round(edu_accuracies[1],2), round(edu_accuracies[2],2)],
         ['Cyclin A2', round(cyclina2_accuracies[0],2), round(cyclina2_accuracies[1],2), round(cyclina2_accuracies[2],2)],
         ['PCNA', round(pcna_accuracies[0],2), round(pcna_accuracies[1],2), round(pcna_accuracies[2],2)]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒═════════════════╤══════╤══════╤════════╕
│ Channel/Phase   │   G1 │    S │   G2&M │
╞═════════════════╪══════╪══════╪════════╡
│ DAPI            │ 0.67 │ 0.66 │   0.77 │
├─────────────────┼──────┼──────┼────────┤
│ EdU             │ 0.68 │ 0.84 │   0.77 │
├─────────────────┼──────┼──────┼────────┤
│ Cyclin A2       │ 0.7  │ 0.71 │   0.77 │
├─────────────────┼──────┼──────┼────────┤
│ PCNA            │ 0.7  │ 0.73 │   0.77 │
╘═════════════════╧══════╧══════╧════════╛
