In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# joblib used to be vendored as sklearn.externals.joblib; that alias was
# deprecated in scikit-learn 0.21 and removed in 0.23. Keep the original import
# for old environments and fall back to the standalone package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss

import xgboost as xgb

%matplotlib inline
plt.style.use('ggplot')

#from utils.clean_utils import reduce_dataframe, clean_dataframe
#from utils.model import model_RandomClass

In [2]:
# Load the cleaned feature table from CSV (path is relative to the notebook's working directory).
df = pd.read_csv('data/feats_cleaned.csv')

In [3]:
# Every column is a feature except the label column and the two
# bookkeeping columns (cell identifier and saved-features path).
columns = df.columns
feat_cols = [
    col
    for col in columns
    if col not in ("structureProteinName", "cellID", "save_feats_path")
]

In [4]:
# Separate the target (protein name) from the raw, un-scaled feature columns.
y = df["structureProteinName"]
X_temp = df[feat_cols]

In [5]:
# Normalize so coefficients can be compared: rescale every feature to [0, 1].
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so held-out rows influence the min/max used for scaling. Consider
# fitting on the training rows only (e.g. inside a sklearn Pipeline).
min_max_scaler = MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X_temp)
# Rebuild the frame with the original column names and row index in one step
# so the scaled features stay aligned with the labels in `y`.
df_normalized = pd.DataFrame(np_scaled, columns=feat_cols, index=X_temp.index)

In [6]:
# Use the normalized features as the model input matrix.
# X is another name for df_normalized (same object, not a copy).
X = df_normalized

In [7]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=10,
)

# Logistic Regression

L2-regularized (ridge-penalty) logistic regression. When `cv` is an integer and the estimator is a classifier, `cross_val_score` uses stratified k-fold by default.

In [10]:
# L2-penalised logistic regression; 'balanced' weights each class inversely
# to its frequency in the training labels.
logregl2 = LogisticRegression(
    class_weight='balanced',
    penalty='l2',
)

In [11]:
# 3-fold cross-validated negative log loss for the logistic regression
# (closer to 0 is better); folds are evaluated in parallel on all cores.
logregl2_cv_scores = cross_val_score(
    logregl2,
    X_train,
    y_train,
    scoring='neg_log_loss',
    cv=3,
    n_jobs=-1,
)
print(logregl2_cv_scores)

[-1.89897719 -1.88658122 -1.87964165]


# Random Forest

In [19]:
# 1000-tree random forest. random_state is pinned (same seed as the train/test
# split) so the bootstrap samples and feature draws -- and therefore the
# cross-validation scores -- are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=1000, random_state=10)

In [20]:
# 3-fold cross-validated negative log loss for the random forest
# (closer to 0 is better); folds are evaluated in parallel on all cores.
rf_cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    scoring='neg_log_loss',
    cv=3,
    n_jobs=-1,
)
print(rf_cv_scores)

[-1.87523338 -1.86819802 -1.87435025]


# Gradient Boosting

This is not the final gradient boosting model; XGBoost is also being tried on an EC2 instance.

In [8]:
# Baseline scikit-learn gradient boosting: 1000 shallow (depth-3) trees
# with a learning rate of 0.1.
gbc = GradientBoostingClassifier(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.1,
)

In [9]:
# 3-fold cross-validated negative log loss for the gradient boosting baseline
# (closer to 0 is better); folds are evaluated in parallel on all cores.
gbc_cv_scores = cross_val_score(
    gbc,
    X_train,
    y_train,
    scoring='neg_log_loss',
    cv=3,
    n_jobs=-1,
)
print(gbc_cv_scores)

[-1.90803628 -1.89155934 -1.88932612]


## Gradient Boosting Classification GridSearch

In [8]:
# Grid for the gradient boosting search: learning rate and tree count are
# held fixed, only the maximum tree depth is varied.
parameters = {
    'learning_rate': [0.0001],
    'n_estimators': [1000],
    'max_depth': [5, 7, 10, 12, 15],
}

In [None]:
# Exhaustive search over `parameters`, scored by negative log loss and run in
# parallel on all cores.
# cv is pinned to 3 folds to match the cross_val_score cells; left unset, the
# fold count depends on the installed scikit-learn version (the default
# changed from 3 to 5 in 0.22), which would make scores incomparable.
gbc_gridsearch = GradientBoostingClassifier()
clf = GridSearchCV(gbc_gridsearch, parameters, scoring='neg_log_loss', cv=3, n_jobs=-1)
clf.fit(X_train, y_train)

In [12]:
# Parameter combination from the grid that achieved the best mean cross-validated score.
clf.best_params_

{'learning_rate': 0.0001, 'max_depth': 10, 'n_estimators': 1000}

In [13]:
# Estimator configured with the best parameters found by the grid search.
clf.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.0001, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [14]:
# Persist the best estimator from the grid search; joblib.dump returns the
# list of files written, which the cell displays.
gbc_model_path = 'models/gbc_gridsearched.pkl'
joblib.dump(clf.best_estimator_, gbc_model_path)

['models/gbc_gridsearched.pkl']

In [15]:
# Mean cross-validated neg_log_loss of the best parameter combination (closer to 0 is better).
clf.best_score_

-2.2657941033217406

# XGBoost

In [9]:
# XGBoost counterpart of the scikit-learn baseline: same depth, tree count and
# learning rate, trained using all available cores.
xgbc = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=1000,
    learning_rate=0.1,
    n_jobs=-1,
)

In [None]:
# Fit on the training split and report multiclass log loss after every
# boosting round: validation_0 is the training split, validation_1 the held-out split.
monitor_sets = [(X_train, y_train), (X_test, y_test)]
xgbc.fit(
    X_train,
    y_train,
    eval_set=monitor_sets,
    eval_metric='mlogloss',
    verbose=True,
)

[0]	validation_0-mlogloss:2.34271	validation_1-mlogloss:2.34578
[1]	validation_0-mlogloss:2.29516	validation_1-mlogloss:2.3018
[2]	validation_0-mlogloss:2.2534	validation_1-mlogloss:2.26347
[3]	validation_0-mlogloss:2.2162	validation_1-mlogloss:2.22964
[4]	validation_0-mlogloss:2.18306	validation_1-mlogloss:2.19966
[5]	validation_0-mlogloss:2.15286	validation_1-mlogloss:2.17199
[6]	validation_0-mlogloss:2.1252	validation_1-mlogloss:2.14707
[7]	validation_0-mlogloss:2.09993	validation_1-mlogloss:2.12481
[8]	validation_0-mlogloss:2.07667	validation_1-mlogloss:2.10428
[9]	validation_0-mlogloss:2.05513	validation_1-mlogloss:2.08508
[10]	validation_0-mlogloss:2.03481	validation_1-mlogloss:2.06773
[11]	validation_0-mlogloss:2.01623	validation_1-mlogloss:2.05197
[12]	validation_0-mlogloss:1.99862	validation_1-mlogloss:2.0374
[13]	validation_0-mlogloss:1.98187	validation_1-mlogloss:2.02332
[14]	validation_0-mlogloss:1.9661	validation_1-mlogloss:2.01009
[15]	validation_0-mlogloss:1.95163	valida

In [14]:
# Persist the fitted XGBoost classifier; joblib.dump returns the list of
# files written, which the cell displays.
xgb_model_path = 'models/gbc_xgb.pkl'
joblib.dump(xgbc, xgb_model_path)

['models/gbc_xgb.pkl']