Optimization 
------------
*Basic hyperparameter tuning to improve baseline performance*

*Group Name: Destiny's Child*

Student Names
----

1. Miguel Romero Calvo
2. Jenny Kong
3. Louise Lai

Brief description
---
Decision Trees are used to predict the POS tags for each word in a sentence. 

Load Data
-----

**Load Training Data**

In [1]:
import csv
from nltk import word_tokenize, pos_tag
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
import numpy as np

import pprint 
import time

In [3]:
# Read the training CSV into memory as a list of raw text lines (one per row).
# Each row looks like `"[tag ids]",sentence` -- see the sample line printed below.
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# run elsewhere without editing it.
# NOTE(review): `y` holds the raw lines here but is rebound to the label list by
# `X, y = transform_to_dataset(training_sentences)` in a later cell, so re-running
# that later cell without re-running this one feeds labels into transformToTuples.
y = []
with open('/Users/siangk/Desktop/msds621/mlproject/ML1_final_project/databunch/train.csv') as train_file:
    y = train_file.read().splitlines()
    
# Quick sanity check: row count and the first raw row.
print("There are {} rows of data".format(len(y)))
print("Here is a sample line:\n\t{}".format(y[0]))

There are 35967 rows of data
Here is a sample line:
	"[16, 16, 9, 16, 15, 16, 16, 31, 16, 9, 7, 16, 35, 7, 15, 28, 30, 16, 2]",Joint Chiefs of Staff Chairman Mike Mullen said Sunday that the U.S. has a plan to strike Iran .


**Load Classes**

In [4]:
# Read the POS tag names, one per line; a tag's position in this list is the
# integer id that appears in the training rows.
# NOTE(review): absolute, machine-specific path -- see the training-data cell.
with open('/Users/siangk/Desktop/msds621/mlproject/ML1_final_project/databunch/classes.txt') as classes_file:
    classes = classes_file.read().splitlines()

# Show the tag inventory and its size.
print(list(classes))
print(len(classes))

['$', ',', '.', ':', ';', 'CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LRB', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'RRB', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']
41


# Fit scikit-learn model

In [5]:
def features(sentence, index):
    """Build the feature dictionary for one word of a tokenised sentence.

    Parameters
    ----------
    sentence : list of str
        The words of the sentence, ``[w1, w2, ...]``.
    index : int
        Position in ``sentence`` of the word to describe.

    Returns
    -------
    dict
        Mapping of feature name to a str or bool value, suitable for
        ``DictVectorizer``.

    Notes
    -----
    Single-character features are taken with slices (``word[:1]``,
    ``word[-1:]``) rather than indexing, so an empty token (e.g. produced by
    splitting a sentence containing a double space) yields empty-string
    features instead of raising ``IndexError``. For non-empty words the
    values are identical to plain indexing.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    first_char = word[:1]
    tail = word[1:]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # True for any first character that upper() leaves unchanged, which
        # includes digits and punctuation as well as capital letters.
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Sentence boundaries are encoded as the empty string.
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # Any uppercase character after the first one.
        'capitals_inside': tail.lower() != tail
    }
 
# pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2)) # example

def transformToTuples(trainFile, classesFile):
    """Convert raw training rows into lists of ``(word, POS tag)`` tuples.

    Parameters
    ----------
    trainFile : list of str
        Raw rows of the training file, each shaped like
        ``"[16, 9, 7]",Mullen of the``: a quoted list of integer tag ids,
        a comma, then the space-separated sentence.
    classesFile : list of str
        Tag names; a tag id is the 0-based position of its name in this list.

    Returns
    -------
    list of list of (str, str)
        One inner list per well-formed row, pairing each word with its tag
        name. ``trainFile`` is not modified.

    Raises
    ------
    ValueError
        If a tag id is not an integer.
    IndexError
        If a tag id is outside ``classesFile``.

    Notes
    -----
    * Rows without the ``",`` separator (e.g. ``[7],The`` or an empty line)
      are skipped. This replaces deleting rows at two hard-coded positions,
      which only worked for one specific file and raised ``IndexError`` on
      shorter input.
    * Tag ids index ``classesFile`` directly. The previous ``int(tag) - 1``
      shifted every label by one and wrapped id 0 round to the last class.
    * If a row has a different number of tags and words, the extra items are
      dropped by ``zip``.
    """
    taggedSentences = []

    for line in trainFile:
        splitLine = line.split("\",")
        if len(splitLine) < 2:
            # Malformed row (no quoted tag list): cannot be split reliably.
            continue

        # extract the tag ids [0]: strip the quote and brackets, then split
        tagsString = splitLine[0].replace("\"", "").replace("[", "").replace("]", "")
        tags = tagsString.split(", ")

        # extract the sentence [1], dropping any surrounding quote characters
        words = splitLine[1].strip("\"").strip().split(" ")

        taggedSentences.append(
            [(word, classesFile[int(tag)]) for tag, word in zip(tags, words)]
        )

    return taggedSentences

def transform_to_dataset(tagged_sentences):
    """Flatten tagged sentences into parallel feature and label lists.

    Parameters
    ----------
    tagged_sentences : list of list of (str, str)
        Sentences as lists of ``(word, tag)`` tuples.

    Returns
    -------
    X : list of dict
        One feature dictionary per word, built by ``features``.
    y : list of str
        The tag of each word, in the same order as ``X``.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence rather than once per word.
        words = untag(tagged)
        for index, (_word, tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(tag)

    return X, y

def untag(tagged_sentence):
    """Return just the words of a sentence given as (word, tag) pairs."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

In [6]:
# Parse the raw rows into sentences of (word, tag) tuples.
# NOTE(review): `y` must still hold the raw file lines at this point; the last
# statement of this cell rebinds `y` to the training labels, so the cell is not
# safe to re-run on its own.
taggedSents = transformToTuples(y, classes)

# 75/25 train/test split in file order (no shuffling).
cutoff = int(0.75 * len(taggedSents))
training_sentences, test_sentences = taggedSents[:cutoff], taggedSents[cutoff:]

# Per-word feature dicts and tag labels for the training portion.
X, y = transform_to_dataset(training_sentences)

In [7]:
# Baseline (pre-optimization guess): one-hot encode the feature dicts and fit
# a single entropy-criterion decision tree.
clf = Pipeline(steps=[
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy')),
])

print('Training beginning ---------------------------------------------')
start = time.time()  # wall-clock timer
# Only the first 10K samples are used so repeated runs stay quick; fitting
# takes a fair while otherwise.
clf.fit(X[:10000], y[:10000])
end = time.time()
print('Training completed in {:.1f} seconds -----------------------------'.format(end - start))

# Featurise the held-out sentences for later evaluation.
X_test, y_test = transform_to_dataset(test_sentences)

Training beginning ---------------------------------------------
Training completed in 15.8 seconds -----------------------------


# Optimization

Pipeline
---

Grid Search
---

In [12]:
print('RUNNING ---------------------------------------------')
# Imports needed by the grid-search cell. BaseEstimator, Pipeline, GridSearchCV
# and DecisionTreeClassifier repeat imports from the first cell; the classifier
# imports below are new here.
# NOTE(review): `GaussianNB`, `SVC` and `os` are not used in the cells visible
# here -- consider moving the needed imports to the top import cell.
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import os

class DummyEstimator(BaseEstimator):
    """No-op estimator used as a placeholder pipeline step.

    It only reserves the ``clf`` slot of the pipeline so that the grid search
    can substitute real classifiers for it. The methods accept the usual
    ``(X, y)`` arguments so that calling the pipeline directly does not fail
    with a ``TypeError``.
    """

    def fit(self, X=None, y=None, **fit_params):
        """Do nothing and return ``self``, following the scikit-learn convention."""
        return self

    def score(self, X=None, y=None):
        """Do nothing; a placeholder has no meaningful score (returns ``None``)."""
        pass

# 1) create pipeline: vectorise the feature dicts, then classify.
#    The 'clf' step is only a placeholder; every entry of the search space
#    below swaps in a real classifier.
pipe = Pipeline([
    ('vec', DictVectorizer()),
    ('clf', DummyEstimator())])

# 2) define grid: one dict per candidate model family, each with its own
#    hyperparameter values ('clf__<name>' targets the classifier step).
search_space = [
    {'clf': [KNeighborsClassifier()],
     'clf__n_neighbors': range(1, 10),
     'clf__weights': ['uniform', 'distance']
     },
    {'clf': [DecisionTreeClassifier()],
     'clf__criterion': ['gini', 'entropy']
     },
    {'clf': [LogisticRegression()],
     'clf__penalty': ['l1', 'l2'],
     'clf__solver': ['saga'],  # other solvers left out: 'newton-cg', 'lbfgs', 'liblinear', 'sag'
     'clf__multi_class': ['ovr', 'multinomial', 'auto']
     }
]

# 3) conduct grid search with 5-fold cross-validation
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)

# 4) fit on the training data and time it
print('\nTraining beginning ---------------------------------------------')
start = time.time()  # timer
# Use only the first 10K samples if running it multiple times. It takes a fair while :)
best_cv = clf.fit(X[:10000], y[:10000])
end = time.time()
print('Training completed in {:.1f} seconds -----------------------------'.format((end - start)))

# 5) pull the winning classifier out of the best pipeline
best_model = best_cv.best_estimator_.get_params()['clf']

# Evaluate the selected pipeline on the held-out test sentences.
X_test, y_test = transform_to_dataset(test_sentences)
accuracy = best_cv.score(X_test, y_test)
# Print the accuracy just computed. This line previously referenced
# `lr_housing_r2`, a name that is not defined anywhere in this notebook and
# raises NameError on a fresh kernel.
print(f"{accuracy:,.4f}")
print(f"best model: {best_model}")

RUNNING ---------------------------------------------

Training beginning ---------------------------------------------




Training completed in 18.5 seconds -----------------------------
0.9301


Best model: Logistic Regression<br>
Best parameters:<br>
penalty = 'l2'<br>
solver = 'saga'<br>
Training completed in 226.1 seconds<br>
Accuracy: 0.9304<br>

Evaluation Metric
----

In [14]:
# Report accuracy of the fitted grid search (`clf`) on the held-out test set.
# `{:.4}` formats to 4 significant digits, not 4 decimal places like the
# `:.4f` used in the grid-search cell.
# NOTE(review): this appears to recompute the value already stored in
# `accuracy` by the grid-search cell -- confirm and reuse it if so.
print("Accuracy: {:.4}".format(clf.score(X_test, y_test)))

Accuracy: 0.9301
