In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from matplotlib.lines import Line2D

from sklearn.externals import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, classification_report, roc_auc_score


# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Apply matplotlib's built-in 'ggplot' style sheet to figures created after this point.
plt.style.use('ggplot')

# Load cleaned features

In [9]:
# Load the cleaned feature table into a DataFrame.
# The path is relative, so it is resolved against the kernel's current
# working directory.
# NOTE(review): presumably one row per cell, with feature columns plus a few
# metadata columns -- confirm against the file.
df = pd.read_csv('data/feats_cleaned.csv')

In [10]:
columns = df.columns

# Columns that carry identifiers, paths or the target itself; everything
# else in the table is treated as a model feature.
non_feature_cols = (
    "structureProteinName",
    "cellID",
    "save_feats_path",
    "protein_label",
)

# Collect feature column names, preserving their order in the table.
feat_cols = []
for name in columns:
    if name in non_feature_cols:
        continue
    feat_cols.append(name)

# Create labels

In [11]:
# Protein name of every row; used as the source of the class labels.
y = df["structureProteinName"]
# (integer codes, unique names) pair describing the protein classes.
y_factorize = pd.factorize(y)

In [12]:
# factorize() returns a (codes, uniques) pair; element 1 holds the unique
# protein names, in order of first appearance.
labels = y_factorize[1]
# Bare last expression so the notebook displays the label names.
labels

Index(['Sec61 beta', 'Alpha tubulin', 'Fibrillarin', 'Desmoplakin', 'Lamin B1',
       'Myosin IIB', 'Beta actin', 'Tom20', 'Alpha actinin', 'ZO1', 'ST6GAL1'],
      dtype='object')

# Models: one binary (one-vs-rest) logistic regression trained per protein label

In [13]:
# Train one binary (current protein vs. all others) L2-regularized logistic
# regression per protein label and print, for each label, the classification
# report, the test accuracy and the ten largest-magnitude coefficients.

# Normalize so coefficients can be compared.
# The scaling depends only on the feature matrix, never on the per-label
# target, so it is computed once here instead of being refit and rebuilt on
# every loop iteration.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split, so held-out rows influence the min/max used for scaling. Fitting on
# the training split only would avoid this leakage but would change the
# reported numbers -- confirm which is intended.
X_temp = df[feat_cols]
min_max_scaler = MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X_temp)
df_normalized = pd.DataFrame(np_scaled)
df_normalized.columns = feat_cols

# Reset X to normalized features
X = df_normalized

for i, label in enumerate(labels):

    # Binarize labels: 1 for rows of the current protein, 0 for all others.
    # The column is still written to df so that any cell reading
    # df['protein_label'] keeps seeing the same state as before.
    df['protein_label'] = np.where(df['structureProteinName'] == label, 1, 0)
    y = df.protein_label

    # Test Train Split stratified so both splits keep the same
    # positive/negative ratio; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=10, test_size=.2, stratify=y)

    # Fit logistic regression with l2 regularization
    logregl2 = LogisticRegression(penalty='l2')
    logregl2.fit(X_train, y_train)

    # Print label, classification report, and accuracy
    print(str(label))
    print(classification_report(y_true=y_test, y_pred=logregl2.predict(X_test)))
    print("Mean Accuracy: ", logregl2.score(X_test, y_test))

    # Look at coefficients: for a binary problem coef_ has a single row,
    # holding one weight per feature in the same order as feat_cols.
    coefficients = pd.DataFrame(
        {'feature': list(feat_cols), 'coef_val': logregl2.coef_[0]},
        columns=['feature', 'coef_val'],
    )
    coefficients['coef_val_abs'] = coefficients.coef_val.abs()
    # Ten features with the largest absolute weight.
    print(coefficients.sort_values('coef_val_abs', ascending=False)[0:10])
    print('\n\n')

Sec61 beta
             precision    recall  f1-score   support

          0       0.91      0.99      0.95      3973
          1       0.25      0.02      0.04       386

avg / total       0.85      0.91      0.87      4359

Mean Accuracy:  0.9077770130763937
                              feature  coef_val  coef_val_abs
1544          feat_cell_mt_distHist_3 -2.173125      2.173125
807               feat_cell_mt_edge_1  2.129585      2.129585
1543          feat_cell_mt_distHist_2 -1.766818      1.766818
778       feat_nuc_mt_region_hist_629 -1.764966      1.764966
33         feat_cell_region_median_px  1.673083      1.673083
458       feat_nuc_mt_region_hist_309  1.649154      1.649154
829   feat_cell_mt_edge_region_hist_2  1.648220      1.648220
1170     feat_cell_mt_region_hist_279 -1.542518      1.542518
1516     feat_cell_mt_region_hist_625  1.495484      1.495484
762       feat_nuc_mt_region_hist_613  1.455627      1.455627



Alpha tubulin
             precision    recall  f1-sco

  'precision', 'predicted', average, warn_for)


Beta actin
             precision    recall  f1-score   support

          0       0.93      0.99      0.96      4038
          1       0.44      0.11      0.18       321

avg / total       0.90      0.92      0.90      4359

Mean Accuracy:  0.9240651525579261
                              feature  coef_val  coef_val_abs
1632         feat_cell_obj_tot_height  3.118232      3.118232
6          feat_nuc_region_entropy_px  2.579308      2.579308
39    feat_cell_region_histogram_px_1 -1.852952      1.852952
834   feat_cell_mt_edge_region_hist_7 -1.801961      1.801961
69                 feat_nuc_mt_edge_5  1.787442      1.787442
1196     feat_cell_mt_region_hist_305 -1.610669      1.610669
913       feat_cell_mt_region_hist_22 -1.608959      1.608959
164        feat_nuc_mt_region_hist_15  1.536551      1.536551
1028     feat_cell_mt_region_hist_137 -1.446166      1.446166
903       feat_cell_mt_region_hist_12 -1.350788      1.350788



Tom20
             precision    recall  f1-score   sup