In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.externals import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, classification_report, roc_auc_score


%matplotlib inline
plt.style.use('ggplot')

# Clean

In [2]:
# Read data/feats_cleaned.csv into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('data/feats_cleaned.csv')

In [3]:
# Feature columns = every column except the label (structureProteinName)
# and the cellID / save_feats_path columns; original column order is kept.
columns = df.columns
non_feature_cols = ("structureProteinName", "cellID", "save_feats_path")
feat_cols = [col for col in columns if col not in non_feature_cols]

In [4]:
# Separate the target from the (still un-scaled) feature matrix.
y = df["structureProteinName"]  # label to predict
X_temp = df[feat_cols]          # raw features, scaled in the next step

In [5]:
# Rescale every feature with a default MinMaxScaler so that model
# coefficients are on a comparable scale across features.
# NOTE(review): the scaler is fit on the full table, before the train/test
# split further down — confirm this matches how the pickled models were
# trained before changing it.
min_max_scaler = MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X_temp)
# Wrap the scaled values back into a DataFrame carrying the feature names.
df_normalized = pd.DataFrame(np_scaled, columns=feat_cols)

In [6]:
# From here on X is the min-max-scaled feature frame. This is an alias of
# df_normalized, not a copy.
X = df_normalized

In [7]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=10,
)

# Load Models

### L2 LogReg on all Features - w/ balanced class weights

logregl2 = LogisticRegression(penalty='l2', class_weight='balanced')

In [8]:
# Load the pickled L2 logistic regression model from the models/ directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model_logreg_l2 =joblib.load('models/logregL2_norm_balance_ec2.pkl')

### Random Forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
# Load the pickled random forest model (file name suggests it came out of a
# grid search — confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model_rf = joblib.load('models/rf_gridsearched.pkl')

### Gradient Boosting (sklearn)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.0001, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [10]:
# Load the pickled scikit-learn gradient boosting model (file name suggests it
# came out of a grid search — confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model_gbc_sklearn = joblib.load('models/gbc_gridsearched.pkl') 

### Gradient Boosting (xgboost)

xgbc = xgb.XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3, n_jobs=-1)

Parameters: learning_rate=0.1, n_estimators=1000, max_depth=3

In [11]:
# Load the pickled xgboost gradient boosting model.
# NOTE(review): presumably the xgboost package must be installed for this
# pickle to load — confirm. joblib.load unpickles the file, which can execute
# arbitrary code — only load model files from a trusted source.
model_gbc_xgboost = joblib.load('models/gbc_xgb.pkl') 

In [13]:
# Score the loaded L2 logistic regression on the held-out split and print
# per-class precision / recall / F1.
logreg_l2_test_pred = model_logreg_l2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=logreg_l2_test_pred))

               precision    recall  f1-score   support

Alpha actinin       0.40      0.87      0.54        98
Alpha tubulin       0.46      0.35      0.39       695
   Beta actin       0.41      0.52      0.46       321
  Desmoplakin       0.43      0.42      0.42       512
  Fibrillarin       0.37      0.66      0.48       198
     Lamin B1       0.54      0.47      0.51       893
   Myosin IIB       0.37      0.68      0.48        31
      ST6GAL1       0.39      0.55      0.46       309
   Sec61 beta       0.36      0.29      0.32       386
        Tom20       0.47      0.38      0.42       869
          ZO1       0.44      0.85      0.58        47

  avg / total       0.45      0.44      0.44      4359

