In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy
import os
import scipy

from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics.classification import classification_report

from mseg.common import TRAIN_FILE_DEFAULT, TEST_FILE_DEFAULT, read_file

overwrite_pkl = True

In [4]:
def dissect(data, selection_columns):
    headers = [data[0][i] for i in selection_columns]
    words = [r[5] for r in data]
    samples = [[float(r[i]) for i in selection_columns] for r in data]
    classes = [float(r[6]) for r in data]
    return (headers, words, samples, classes)

# # Utility function to report best scores
# def report(grid_scores, n_top=3):
#     top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
#     for i, score in enumerate(top_scores):
#         print("Model with rank: {0}".format(i + 1))
#         print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
#               score.mean_validation_score,
#               np.std(score.cv_validation_scores)))
#         print("Parameters: {0}".format(score.parameters))
#         print("")
        
        
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = numpy.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [6]:
    base_dir = "./models"
    pm_dir = "prosodic_models"
    tr_file = "/home/rjm49/Dropbox/nlp_alta/recreate_LG/tt_data/eval1-prosodicFeats_norm_train.csv"
    tt_file = "/home/rjm49/Dropbox/nlp_alta/recreate_LG/tt_data/eval1-prosodicFeats_norm_test.csv"
    test_fname = "eval1"
    use_lr = True
    
    n_samples = -1
    cache = 800
    
#     tr_data = read_file(os.path.join(base_dir, tr_file), ',', skip_header=True)
    tr_data = read_file(tr_file, ',', skip_header=True)
    
    if not use_lr:
        n_samples = 6000
        out_fname = test_fname+"-probabilities.dat"
        report_fname = test_fname+"-report.txt"
    else: 
        out_fname = test_fname+"-probabilities.dat"
        report_fname = test_fname+"-report-LR.txt"

    out_file = os.path.join(base_dir, pm_dir, out_fname)
    report_fname = os.path.join(base_dir, pm_dir, report_fname)
    #clear extant predictions file
    if(os.path.exists(out_file)):
        os.remove(out_file)
        print("removed",out_file)
        
    print(base_dir+"/"+tr_file+" -SVM-> ",out_file)
    

#     test_data = read_file(os.path.join(base_dir, test_fname), ',', skip_header=True)
    test_data = read_file(tt_file, ',', skip_header=True)
    
    #sel = [12,13,14,15,21,22,23,24]
    sel = range(7,30)
    #sel = [8,21,29, 24,25,27]

    (_, _, tr_samples, tr_classes) = dissect(tr_data, sel)
    (_, te_words, te_samples, te_classes) = dissect(test_data, sel)
       
    if n_samples>0:
        tr_samples, _, tr_classes, _ =  train_test_split(tr_samples, tr_classes, train_size=n_samples, stratify=tr_classes) 
    
    p = sum(c==1.0 for c in tr_classes) # count the positive instances
    n = len(tr_classes) - p # derive the negative instances
    print("n=",n," p=",p)
    wgt=float(n)/float(p) # cast and divide
    print("wgt=",wgt)
#     classWeight = { 1: wgt }
    
    
    #tr_samples, te_samples, tr_classes, te_classes = train_test_split(samples, classes, test_size=0.20, random_state=0, stratify=classes)
                    
    tr_samples = numpy.array(tr_samples)
    scaler = preprocessing.StandardScaler()
    tr_samples = scaler.fit_transform(tr_samples)
    
             
    clf = None
    best_params = None
    #override the defaults with the results of a grid search if desired (takes a while)
    
        
    #pickled = False
    pkl_dir = os.path.join(base_dir, pm_dir, "pkl")
    pickled_model = os.path.join(pkl_dir, "svm_classifier.pkl")
    
    if(os.path.exists(pickled_model) and not overwrite_pkl):
        clf = joblib.load(pickled_model)
        clf.set_params(verbose=True)
        print("loaded pickled model...", pickled_model)
    
    else:
        if not os.path.exists(pkl_dir): #output dir doesn't exist so make it
            os.makedirs(pkl_dir)
            print("made dir for pickled model:", pkl_dir)
        
        (cmin, cmax, cstep) = (-5,  17,  2)
        cr = range(cmin,cmax,cstep)
        print(cr)
        #c_range = [ pow(2, y) for y in cr]
        #c_range =(0.005, 0.5, 5, 50, 500, 5000, 50000)
        c_range = (0.5, 50, 5000)
        print('c_range', c_range)
    
        gmin, gmax, gstep = -15, 5, 2
        gr = range(gmin, gmax, gstep)
        print(gr)
        #gamma_range = [ pow(2, y) for y in gr ]
        #gamma_range = (0.00005, 0.0005, 0.005, 0.05, 0.5, 5.0, 50, 500)
        gamma_range = (0.0005, 0.05, 5.0, 500)
        
        print('gamma_range', gamma_range)
        
        c_dist =  scipy.stats.expon(scale=100)
        gamma_dist = scipy.stats.expon(scale=.01)
        
        if use_lr:
            estr = LogisticRegression(class_weight='balanced')
#             estr = LogisticRegression()
            param_dist={'C': c_dist }
        else:
            estr = svm.SVC(kernel='rbf', cache_size=800, probability=True, class_weight='balanced' )
            #estr = svm.LinearSVC(class_weight='balanced')
            param_dist={'C': c_dist , 'gamma': gamma_dist}
            
        
        #searcher = RandomizedSearchCV(estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, cv=5, verbose=True ) #, scoring="recall")
        searcher = RandomizedSearchCV(estr, param_distributions=param_dist, n_iter=100, n_jobs=-1, verbose=True, scoring="recall")
        searcher.fit(tr_samples,tr_classes)
        report(searcher.cv_results_)
        clf = searcher.best_estimator_         

        print("COMPARING CLF PARAMS WITH BEST PARAMS (shd be same)")
        print(clf.get_params())
        print(best_params)
        
        joblib.dump(clf, pickled_model)
        
    print(clf)

#     print("FITTING"     
#     clf.set_params(verbose=True)
#     clf.fit(tr_samples, tr_classes)
#     print(clf
     
    
    #NOW TO TEST AGAINST HELD-OUT/TEST DATA
    te_samples = scaler.transform(te_samples)
    
    print("no test cases", len(te_samples))
    
    predictions = -1.0 * clf.predict_log_proba(te_samples)
#     predictions = clf.predict_proba(te_samples) #this is a list of pairs of probs in form [ [1-p, p],  ... ]
    print(predictions)
    predicted_classes = clf.predict(te_samples)
    
    print("TEST: Number of mislabelled points out of a total %d points : %d" % (len(te_samples),(te_classes != predicted_classes).sum()))
    print(classification_report(te_classes, predicted_classes))

    
    rpt = open(report_fname, "w")
    rpt.write(classification_report(te_classes, predicted_classes))
    rpt.write("\n")
    rpt.close()
    print("wrote report file", rpt)
    
    pred_file = open(out_file,"w")
    pred_file.write("labels 0 1\n") #this emulates an earlier file format for compatibility
    for word, prob_tuple, y_hat, y in zip(te_words,predictions,predicted_classes,te_classes):
        pred_file.write("%d %d %f %f %s\n" % (y_hat, y,prob_tuple[0],prob_tuple[1],word))

    pred_file.close()
    print("wrote predictions file:",pred_file)
    

number of tokens loaded: 61237
./models//home/rjm49/Dropbox/nlp_alta/recreate_LG/tt_data/eval1-prosodicFeats_norm_train.csv -SVM->  ./models/prosodic_models/eval1-probabilities.dat
number of tokens loaded: 6792
n= 59098  p= 2139
wgt= 27.62879850397382
made dir for pickled model: ./models/prosodic_models/pkl
range(-5, 17, 2)
c_range (0.5, 50, 5000)
range(-15, 5, 2)
gamma_range (0.0005, 0.05, 5.0, 500)
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.0min finished


Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 136.05895700345587}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 69.59104863689879}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 305.2655777484975}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 82.21248019937761}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 297.1112638540757}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 181.25598088889748}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 175.17239598565877}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 268.538794820802}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 1.677192674327993}

Model with rank: 1
Mean validation score: 0.746 (std: 0.020)
Parameters: {'C': 19.530034658711042}

Model w

  return np.log(self.predict_proba(X))
