In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, classification_report, log_loss


%matplotlib inline

from utils.clean_utils import reduce_dataframe
from utils.model import model_RandomClass

# Read Data

In [23]:
# Load the feature CSV into a DataFrame; the relative path is resolved
# against the current working directory.
df_raw = pd.read_csv('data/feats_out.csv')

# Clean Data

In [24]:
# Column labels of the raw table; scanned below to decide which columns to drop.
columns_raw = df_raw.columns

In [25]:
# Collect every raw column that is neither a cell/nucleus feature (its name
# contains 'feat_cell' or 'feat_nuc') nor one of the two metadata columns
# that are kept; these are the columns to remove.
drop_columns = [
    name for name in columns_raw
    if not ('feat_cell' in name
            or 'feat_nuc' in name
            or name in ('structureProteinName', 'inputFolder'))
]

In [26]:
# Working frame: the raw table minus the unwanted columns. drop() returns a
# new DataFrame here, so df_raw itself is left unchanged.
df = df_raw.drop(drop_columns, axis=1)

In [27]:
# Drop rows whose 'feat_nuc_region_mean_px' value is not finite; that column
# is used as the indicator for rows that are NaN in all columns.
# .copy() makes df an independent frame, so the assignments below modify df
# itself rather than a slice of the previous frame (no SettingWithCopyWarning).
df = df[np.isfinite(df['feat_nuc_region_mean_px'])].copy()

# Replace NaNs in specific columns with the column mean.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on a selected column: that form is chained assignment and is not guaranteed
# to update df (it does not under pandas copy-on-write).
for col in ['feat_nuc_obj_mean_edge_len', 'feat_nuc_obj_std_edge_len']:
    df[col] = df[col].fillna(df[col].mean())

In [28]:
# Column labels of the cleaned frame, reused by the checks and feature selection below.
columns = df.columns

In [30]:
# Check that all NaNs are replaced: report every column that still contains
# nulls together with how many. No output means the frame is clean.
for name in columns:
    null_count = df[name].isnull().sum()
    if null_count > 0:
        # A pre-formatted string prints "<name> <count>" identically under
        # Python 2 and Python 3 (the bare `print a, b` statement is Python 2 only).
        print("%s %s" % (name, null_count))

# Prepare Data

In [31]:
# Feature columns are all remaining columns except the two metadata columns.
feat_cols = [
    name for name in columns
    if name not in ("structureProteinName", "inputFolder")
]

In [36]:
# One label per row, taken from a fixed character slice [13:18] of the
# inputFolder string (for this dataset the slice yields 'assay' or 'micro',
# as shown by the set(label) output below).
label = [item[13:18] for item in df['inputFolder']]

In [42]:
# Distinct label values present in the data.
set(label)

{'assay', 'micro'}

In [55]:
# Number of rows per label (class balance).
Counter(label)

Counter({'assay': 4424, 'micro': 17369})

In [58]:
# Class proportions. The counts are derived from the labels themselves rather
# than hard-coded, so the figures stay correct if the input data changes.
label_counts = Counter(label)
# float() keeps true division under Python 2 as well.
print("Assay", float(label_counts['assay'])/len(label))
print("Micro", float(label_counts['micro'])/len(label))

('Assay', 0.20300096361216904)
('Micro', 0.796999036387831)


In [46]:
# Split to features and labels
# X_ keeps df's row index (rows were filtered earlier without a reset), while
# y gets a fresh default integer index because it is built from a plain list.
# The two therefore correspond by position, not by index label.
X_ = df[feat_cols]
y = pd.Series(label)

In [47]:
# Normalize so coefficients can be compared: every feature is rescaled with
# MinMaxScaler and the result is wrapped back into a DataFrame that carries
# the original feature names.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# below, so test-set minima/maxima influence the scaling. Consider fitting on
# the training split only.
min_max_scaler = MinMaxScaler()
np_scaled = min_max_scaler.fit(X_).transform(X_)
df_normalized = pd.DataFrame(np_scaled, columns=feat_cols)

In [48]:
# X is the normalized feature frame that is split and fed to the model below.
X = df_normalized

# Train

In [49]:
# Hold out a test split; random_state fixes the shuffle so the split is
# reproducible. No test_size is passed, so scikit-learn's default fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [50]:
# L2-penalized logistic regression. class_weight='balanced' weights classes
# inversely to their frequency, which matters here because the labels are
# imbalanced (roughly 20% 'assay' vs 80% 'micro', per the counts above).
model_logreg_operator = LogisticRegression(penalty='l2', class_weight='balanced')

In [51]:
# Fit on the training split only; the test split is kept for evaluation.
model_logreg_operator.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

# Test

In [54]:
# Evaluate the fitted model on the held-out split: per-class
# precision/recall/F1, mean accuracy, and log loss on predicted probabilities.
test_pred = model_logreg_operator.predict(X_test)
test_proba = model_logreg_operator.predict_proba(X_test)
test_accuracy = model_logreg_operator.score(X_test, y_test)

print("LOGISTIC REGRESSION")
print(classification_report(y_true=y_test, y_pred=test_pred))
print("Mean Accuracy: ", test_accuracy)
print("Log Loss: ", log_loss(y_true=y_test, y_pred=test_proba))

LOGISTIC REGRESSION
             precision    recall  f1-score   support

      assay       0.79      0.91      0.85      1119
      micro       0.98      0.94      0.96      4330

avg / total       0.94      0.93      0.93      5449

('Mean Accuracy: ', 0.93191411268122593)
('Log Loss: ', 0.17164517873200533)


In [62]:
# Compare to baseline of random guessing based on class distributions
# NOTE(review): model_RandomClass comes from utils.model and is not shown
# here. It is handed y_test itself, so presumably it samples labels from the
# test-set class frequencies -- confirm against its implementation.
print("RANDOM GUESSING")
print(classification_report(y_true=y_test, y_pred=model_RandomClass(y_test)))

RANDOM GUESSING
             precision    recall  f1-score   support

      assay       0.19      0.19      0.19      1119
      micro       0.79      0.80      0.79      4330

avg / total       0.67      0.67      0.67      5449

