In [7]:
import sys
sys.path.append('../scripts')
from course_utils import trainTest
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from models import evaluate_accuracy
import models
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
import pickler
import extract_metadata
import join_data as jd

In [78]:
case_data_dir = '../data'
CASE_DATA_FILENAME = 'merged_caselevel_data.csv'
cases_df = extract_metadata.extract_metadata(case_data_dir+'/'+CASE_DATA_FILENAME)
num_shards = 1340
X, case_ids, y=jd.load_data(case_data_dir+'/feature_matrix_100.svmlight',
              case_data_dir+'/case_ids.p',
              cases_df, 
              case_data_dir+'/docvec_text',
              num_opinion_shards=num_shards,
              min_required_count=100)

Loading data from ../data/feature_matrix_100.svmlight and ../data/case_ids.p


##TODO: 
- try TF-IDF True or false

In [79]:
def subsample(X,y,case_ids, sample_pct):
    '''
    Take a random sub-sample of the rows in your data set.  NOTE: to keep it predictable, this is seeded.
    Args:
        X: feature matrix
        y: label array
        case_ids: list of case ids
        sample_pct: float between 0 and 1, determines what fraction of the data you want to keep. 
        
    Returns X,y, and case_ids, but filtered down to a random sample
    '''
    case_ids2 = np.array(case_ids)
    assert X.shape[0]==len(y), "X and y are not the same length"
    assert len(case_ids2)==len(y), "case_ids and y are not the same length"
    sample_size = int(sample_pct*len(y))
    np.random.seed(10)
    
    #Get random sub-sample of row indexes
    sample_indexes = sorted(np.random.choice(range(len(y)), size=sample_size,replace=False))
    return X[sample_indexes],y[sample_indexes],list(case_ids2[sample_indexes])
    
X,y,case_ids = subsample(X,y,case_ids,0.1)

In [80]:
def train_test_split(X,y, ordered_case_ids,pct_train):
    train_rows = int(pct_train*len(y))
    y_train = np.array(y[:train_rows])
    y_test = np.array(y[train_rows:])
    X_train = X[:train_rows]
    X_test = X[train_rows:]
    case_ids_train = ordered_case_ids[:train_rows]
    case_ids_test = ordered_case_ids[train_rows:]
    return X_train,y_train,case_ids_train,X_test,y_test,case_ids_test

X_train,y_train,case_ids_train,X_test,y_test,case_ids_test = train_test_split(X,y,case_ids,0.75)

In [81]:
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(1159, 23534)
(387, 23534)
(1159,)
(387,)


##Baseline Model
The base line majority classifier model to beat is 54% accuracy

In [98]:
reload(models)
mclf = models.MajorityClassifier()
mclf.fit(X_train,y_train)

evaluate_accuracy(y_test,mclf.predict(X_test))

	 	 pred
true 	 	 1 	 2 	 3
	 1 	 214 	 0 	 0
	 2 	 62 	 0 	 0
	 3 	 111 	 0 	 0


0.55297157622739013

##Out-of-the-Box SVM
Gets accuracy of 39%

In [68]:
clf = LinearSVC(penalty='l1',random_state=0, dual=False)
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=0, tol=0.0001,
     verbose=0)

In [69]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [70]:
clf.score(X_test,y_test)

0.56581329195758989

In [71]:
evaluate_accuracy(y_train,y_train_pred)

	 	 pred
true 	 	 1 	 2 	 3
	 1 	 5061 	 89 	 501
	 2 	 329 	 1253 	 258
	 3 	 682 	 97 	 3329


0.83136477282524357

In [72]:
evaluate_accuracy(y_test,y_test_pred)

	 	 pred
true 	 	 1 	 2 	 3
	 1 	 1643 	 106 	 405
	 2 	 337 	 134 	 156
	 3 	 579 	 96 	 411


0.56581329195758989

##Optimized SVM
Accuracy is truly horrendous: 40%

In [82]:
def optimizeSVM(X_train, y_train, reg_min_log10=-2, reg_max_log10=2, regularization_type='l1'):
    '''
    Creates an SVM classifier trained on the given data with an optimized C parameter.
    Args:
      X_train: A dataframe on which to train the features
      y_train: A dataframe on which to evaluate the training data
      reg_min_log10: log base 10 of the low end of the regularization parameter range.  -2 means 10^-2
      reg_max_log10: log base 10 of the high end of the regularization parameter range.  2 means 10^2
    Returns:
      A fitted SVM classifier.
    '''
    
    model_to_set = LinearSVC(penalty=regularization_type,random_state=0, dual=False)
    # consider broadening the param_grid to include different SVM kernels and degrees.  See:
    # http://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier
    #param_grid = {'C': [10**i for i in range(-reg_min_log10,reg_max_log10)] + [1e30]}
    param_grid = {'C':[1]}
    model_tuning = GridSearchCV(model_to_set, scoring='f1_weighted',param_grid=param_grid)
    
    model_tuning.fit(X_train, y_train)
    print 'best C param for SVM classifier:', model_tuning.best_params_['C']
    print 'best_score: ', model_tuning.best_score_
        
    return model_tuning.best_estimator_

In [86]:
svm_opt = optimizeSVM(X_train,y_train,-4,4)

best C param for SVM classifier: 1
best_score:  0.476915211796


In [87]:
svm_opt.score(X_test,y_test)

0.55038759689922478

In [88]:
evaluate_accuracy(svm_opt.predict(X_test),y_test)

	 	 pred
true 	 	 1 	 2 	 3
	 1 	 175 	 51 	 72
	 2 	 5 	 0 	 1
	 3 	 34 	 11 	 38


0.55038759689922478

## Optimized Logistic

In [89]:
def optimizeLogistic(X_train, y_train, reg_min_log10=-2, reg_max_log10=2,regularization_type='l1'):
    '''
    Creates an SVM classifier trained on the given data with an optimized C parameter.
    Args:
      X_train: A dataframe on which to train the features
      y_train: A dataframe on which to evaluate the training data
      score_func: Scoring function.  Dizzying options here.  Consider:
          metrics.accuracy_score
          metrics.f1_score
    Returns:
      A fitted SVM classifier.
    '''
    
    model_to_set = LogisticRegression(penalty=regularization_type)
    param_grid = {'C': [10**i for i in range(-reg_min_log10,reg_max_log10)] + [1e30]}
    model_tuning = GridSearchCV(model_to_set, param_grid=param_grid,
                             scoring='f1_weighted')
    
    model_tuning.fit(X_train, y_train)
    print 'best C param for LR classifier:', model_tuning.best_params_['C']
    print 'best params: ', model_tuning.best_params_
    print 'best_score: ', model_tuning.best_score_
        
    return model_tuning.best_estimator_

In [95]:
logit_opt = optimizeLogistic(X_train,y_train,reg_min_log10=-4, reg_max_log10=7)

best C param for LR classifier: 1e+30
best params:  {'C': 1e+30}
best_score:  0.47046562865


In [96]:
logit_opt.score(X_test,y_test)

0.47545219638242892

In [97]:
evaluate_accuracy(logit_opt.predict(X_test),y_test)

	 	 pred
true 	 	 1 	 2 	 3
	 1 	 143 	 35 	 73
	 2 	 23 	 12 	 9
	 3 	 48 	 15 	 29


0.47545219638242892