In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, classification_report


%matplotlib inline
plt.style.use('ggplot')

from utils.clean_utils import reduce_dataframe, clean_dataframe
from utils.model import model_RandomClass

In [3]:
# Read the two feature tables from disk: the cleaned one and the reduced one.
df_clean, df_reduce = (
    pd.read_csv(csv_path)
    for csv_path in ('data/feats_cleaned.csv', 'data/feats_reduced_byRF.csv')
)

# Logistic Regression using the Reduced DF

In [4]:
# Keep every column of the reduced frame as a feature, except the three
# non-feature columns named below (the first of them is used as the label).
non_feature_cols = ("structureProteinName", "cellID", "save_feats_path")
columns_R = df_reduce.columns
feat_cols_R = [col for col in columns_R if col not in non_feature_cols]

In [5]:
# Separate the label column from the feature matrix.
y_R = df_reduce["structureProteinName"]
X_R = df_reduce[feat_cols_R]

In [6]:
# Partition features and labels into train and test sets; the fixed
# random_state makes the split repeatable across runs.
(X_train_R, X_test_R,
 y_train_R, y_test_R) = train_test_split(X_R, y_R, random_state=10)

In [8]:
# L1-penalised logistic regression.  The solver is pinned to 'liblinear'
# because scikit-learn >= 0.22 defaults to 'lbfgs', which rejects
# penalty='l1'.  'liblinear' is also the solver shown in the fitted model's
# recorded repr, so behaviour on older versions is unchanged.
model_R = LogisticRegression(penalty='l1', solver='liblinear')

In [9]:
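# Training is disabled here; uncomment the next line to retrain the L1 model
# instead of relying on the copy saved at 'models/logregL1_reduced.pkl'.
# model_R.fit(X_train_R, y_train_R)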

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Saving is disabled; uncomment the next line to re-save the L1 model after retraining.
# joblib.dump(model_R, 'models/logregL1_reduced.pkl')

['models/logregL1_reduced.pkl']

In [None]:
# Reuse the cached L1 model when it exists; otherwise train it and cache it,
# so the notebook still runs top-to-bottom on a machine without the pickle.
import os

l1_model_path = 'models/logregL1_reduced.pkl'
if os.path.exists(l1_model_path):
    # NOTE: joblib.load unpickles the file, which can execute arbitrary
    # code -- only load model files that come from a trusted source.
    model_R = joblib.load(l1_model_path)
else:
    # solver='liblinear' is set explicitly because it supports the L1 penalty.
    model_R = LogisticRegression(penalty='l1', solver='liblinear')
    model_R.fit(X_train_R, y_train_R)
    os.makedirs(os.path.dirname(l1_model_path), exist_ok=True)
    joblib.dump(model_R, l1_model_path)

In [11]:
# Read in an L2 model from a different session
# NOTE(review): the training configuration of this model is not shown in the
# cells above -- presumably it was fit on the same reduced training split;
# confirm before comparing its scores with the L1 model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
model_R_l2 = joblib.load('models/logregL2_reduced.pkl')

In [14]:
# L1 LogReg: per-class precision / recall / F1 on the held-out test split
y_pred_l1 = model_R.predict(X_test_R)
print(classification_report(y_test_R, y_pred_l1))

               precision    recall  f1-score   support

Alpha actinin       0.21      0.31      0.25       127
Alpha tubulin       0.32      0.31      0.32       852
   Beta actin       0.31      0.25      0.27       414
  Desmoplakin       0.27      0.25      0.26       605
  Fibrillarin       0.23      0.24      0.23       234
     Lamin B1       0.38      0.48      0.42      1105
   Myosin IIB       0.07      0.08      0.07        49
      ST6GAL1       0.35      0.28      0.31       402
   Sec61 beta       0.26      0.14      0.18       499
        Tom20       0.34      0.36      0.35      1102
          ZO1       0.21      0.22      0.21        60

  avg / total       0.32      0.32      0.31      5449



In [15]:
# L2 LogReg: per-class precision / recall / F1 on the held-out test split
y_pred_l2 = model_R_l2.predict(X_test_R)
print(classification_report(y_test_R, y_pred_l2))

               precision    recall  f1-score   support

Alpha actinin       0.25      0.28      0.27       127
Alpha tubulin       0.31      0.31      0.31       852
   Beta actin       0.29      0.22      0.25       414
  Desmoplakin       0.24      0.20      0.22       605
  Fibrillarin       0.21      0.21      0.21       234
     Lamin B1       0.35      0.48      0.41      1105
   Myosin IIB       0.12      0.12      0.12        49
      ST6GAL1       0.33      0.22      0.27       402
   Sec61 beta       0.24      0.12      0.16       499
        Tom20       0.32      0.36      0.34      1102
          ZO1       0.25      0.30      0.27        60

  avg / total       0.30      0.31      0.30      5449



In [17]:
# Baseline for comparison: predictions produced by model_RandomClass from the
# test labels (random guessing based on the class distribution).
# NOTE(review): if model_RandomClass draws from an unseeded RNG, this report
# will differ between runs -- confirm and seed it if so.
y_pred_random = model_RandomClass(y_test_R)
print(classification_report(y_test_R, y_pred_random))

               precision    recall  f1-score   support

Alpha actinin       0.04      0.04      0.04       127
Alpha tubulin       0.16      0.16      0.16       852
   Beta actin       0.06      0.06      0.06       414
  Desmoplakin       0.11      0.11      0.11       605
  Fibrillarin       0.04      0.04      0.04       234
     Lamin B1       0.19      0.20      0.20      1105
   Myosin IIB       0.00      0.00      0.00        49
      ST6GAL1       0.06      0.06      0.06       402
   Sec61 beta       0.10      0.10      0.10       499
        Tom20       0.22      0.21      0.21      1102
          ZO1       0.00      0.00      0.00        60

  avg / total       0.14      0.14      0.14      5449

