In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package (installed as a scikit-learn
# dependency) and only fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use matplotlib's 'ggplot' style sheet for every figure drawn after this point.
plt.style.use('ggplot')

#from utils.clean_utils import reduce_dataframe, clean_dataframe
#from utils.model import model_RandomClass

In [2]:
# Load the feature table. Later cells read `structureProteinName` as the label
# column and exclude `cellID` and `save_feats_path` from the feature set.
# NOTE(review): the path is relative to the notebook's working directory.
df = pd.read_csv('data/feats_cleaned.csv')

In [3]:
# Feature columns = every column except the label column and the two
# bookkeeping columns (cell identifier and saved-features path).
columns = df.columns
feat_cols = [
    name
    for name in columns
    if name not in ("structureProteinName", "cellID", "save_feats_path")
]

In [4]:
# Separate the target labels (y) from the raw, not-yet-scaled features (X_temp).
y = df["structureProteinName"]
X_temp = df[feat_cols]

In [5]:
# Rescale each feature with MinMaxScaler so that model coefficients are on a
# comparable scale across features.
# NOTE(review): the scaler is fit on all rows, before the train/test split in a
# later cell, so held-out rows influence the scaling — consider fitting on the
# training split only.
min_max_scaler = MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X_temp)
# fit_transform returns a bare array; wrap it back into a labelled DataFrame.
df_normalized = pd.DataFrame(np_scaled, columns=feat_cols)

In [6]:
# Use the normalized features as the model input matrix X.
# This is an alias, not a copy: X and df_normalized refer to the same DataFrame.
X = df_normalized

In [7]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=10,
)

# Logistic Regression

L2-regularized (ridge-penalized) logistic regression. `cross_val_score` uses stratified k-fold by default when the estimator is a classifier and `cv` is an integer.

In [10]:
# L2-penalized logistic regression; class_weight='balanced' reweights classes
# inversely to their frequency in the training labels.
logregl2 = LogisticRegression(
    class_weight='balanced',
    penalty='l2',
)

In [11]:
# 3-fold cross-validated negative log loss on the training split
# (scores are negated losses, so values closer to 0 are better).
print(
    cross_val_score(
        logregl2,
        X_train,
        y_train,
        scoring='neg_log_loss',
        cv=3,
        n_jobs=-1,
    )
)

[-1.89897719 -1.88658122 -1.87964165]


# Random Forest

In [19]:
# Random forest with 1000 trees. random_state is fixed so the forest, and
# therefore the cross-validation scores, are reproducible on a re-run; 10 is
# the same seed used for the train/test split.
rf = RandomForestClassifier(n_estimators=1000, random_state=10)

In [20]:
# 3-fold cross-validated negative log loss for the random forest on the
# training split (values closer to 0 are better).
print(
    cross_val_score(
        rf,
        X_train,
        y_train,
        scoring='neg_log_loss',
        cv=3,
        n_jobs=-1,
    )
)

[-1.87523338 -1.86819802 -1.87435025]


# Gradient Boosting

This is not the final gradient boosting model; XGBoost is being tried separately on an EC2 instance.

In [8]:
# Gradient boosting baseline: 1000 boosting stages of depth-3 trees with a
# learning rate of 0.1. random_state is fixed so repeated runs give the same
# cross-validation scores; 10 is the same seed used for the train/test split.
gbc = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=3,
    random_state=10,
)

In [9]:
# 3-fold cross-validated negative log loss for the gradient boosting model on
# the training split (values closer to 0 are better).
print(
    cross_val_score(
        gbc,
        X_train,
        y_train,
        scoring='neg_log_loss',
        cv=3,
        n_jobs=-1,
    )
)

[-1.90803628 -1.89155934 -1.88932612]
