### A3 - NLP for Yelp and IMDB

In [1]:
# Imports
import pandas as pd
import os.path
import re
import numpy as np
import string
from sklearn import metrics
from scipy.sparse import lil_matrix
from scipy.sparse import vstack

In [2]:
# Functions to open files
def open_with_panda(filename):
    return pd.read_csv(filename, sep='\t', names=["Comment", "Evaluation"]).as_matrix()


def open_vocab_file(filename):
    return pd.read_csv(yelp_vocab_file, sep='\t', header=None, skip_blank_lines=True).as_matrix()

In [3]:
# Function to generate encodings
def encode_reviews(file_to_create, X, Y, vocabFile, overwrite=False):
    if not (os.path.isfile(file_to_create)) or overwrite:
        # Grab vocab encodings
        vocab = open_vocab_file(vocabFile)
        vocab_dict = dict([(vocab[i][0], int(vocab[i][1])) for i in range(len(vocab))])
        list_to_write = []

        cleaned_reviews = [re.sub(r'[^\w\s\d]|_','',X[i].lower()) for i in range(len(X))]
        final_reviews = [re.sub(r'\s+',' ',cleaned_reviews[i]).strip() for i in range(len(cleaned_reviews))]

        for i in range(len(final_reviews)):
            build_string = ""
            for x in final_reviews[i].split(" "):
                if x in vocab_dict.keys():
                    build_string += str(vocab_dict[x]) + " "
            list_to_write.append(build_string.strip() + "\t" + str(Y[i]))

        with open(file_to_create,'w') as f:
            f.write("\n".join(list_to_write))

In [4]:
# Function to generate vocab
def vocabulary(file, reviews, overwrite=False):
    if not (os.path.isfile(file)) or overwrite:
        # Let's create the vocab.txt file
        cleaned_reviews = [re.sub(r'[^\w\s\d]|_','',reviews[i].lower()) for i in range(len(reviews))]
        final_reviews = [re.sub(r'\s+',' ',cleaned_reviews[i]).strip() for i in range(len(cleaned_reviews))]

        found_punct = False
        for c in string.punctuation:
            for rev in final_reviews:
                if c in rev:
                    # print(rev + "\n")
                    found_punct = True

        if found_punct:
            raise Exception('Found punctuation')

        words = " ".join(final_reviews).split(" ")
        unique_words, count = np.unique(words, return_counts=True)
        unique_words_array = np.asarray((unique_words,count)).T
        frequency_unique_words = np.flip(unique_words_array[unique_words_array[:,-1].astype(int).argsort()],0)

        with open(file,'w') as f:
            for i in range(10000):
                f.write(
                    frequency_unique_words[i][0] + "\t" +
                    str(i) + "\t" +
                    frequency_unique_words[i][-1] + "\n"
                )


In [5]:
# File paths

yelp_train_file = "hwk3_datasets/yelp-train.txt"
yelp_valid_file = "hwk3_datasets/yelp-valid.txt"
yelp_test_file = "hwk3_datasets/yelp-test.txt"

imdb_valid_file = "hwk3_datasets/IMDB-valid.txt"
imdb_test_file = "hwk3_datasets/IMDB-test.txt"
imdb_train_file = "hwk3_datasets/IMDB-train.txt"

yelp_vocab_file = "data/yelp-vocab.txt"
imdb_vocab_file = "data/IMDB-vocab.txt"

wr_yelp_train_file = "data/yelp-train.txt"
wr_yelp_valid_file = "data/yelp-valid.txt"
wr_yelp_test_file = "data/yelp-test.txt"

wr_imdb_valid_file = "data/IMDB-valid.txt"
wr_imdb_test_file = "data/IMDB-test.txt"
wr_imdb_train_file = "data/IMDB-train.txt"

In [6]:
# Training data

data_yelp_train = open_with_panda(yelp_train_file)
yelp_train_X = data_yelp_train[:,0]
yelp_train_Y = data_yelp_train[:,-1]

data_imdb_train = open_with_panda(imdb_train_file)
imdb_train_X = data_imdb_train[:,0]
imdb_train_Y = data_imdb_train[:,-1]

In [7]:
# Generate vocab
vocabulary(yelp_vocab_file, yelp_train_X.flatten())
vocabulary(imdb_vocab_file, imdb_train_X.flatten())

In [8]:
# Encode yelp reviews

encode_reviews(wr_yelp_train_file, yelp_train_X, yelp_train_Y, yelp_vocab_file)

data_yelp_valid = open_with_panda(yelp_valid_file)
data_yelp_test = open_with_panda(yelp_test_file)

encode_reviews(wr_yelp_valid_file, data_yelp_valid[:,0], data_yelp_valid[:,-1], yelp_vocab_file)
encode_reviews(wr_yelp_test_file, data_yelp_test[:,0], data_yelp_test[:,-1], yelp_vocab_file)

In [9]:
# Encode IMDB reviews
encode_reviews(wr_imdb_train_file, imdb_train_X, imdb_train_Y, imdb_vocab_file)
data_imdb_valid = open_with_panda(imdb_valid_file)
data_imdb_test = open_with_panda(imdb_test_file)

encode_reviews(wr_imdb_valid_file, data_imdb_valid[:,0], data_imdb_valid[:,-1], imdb_vocab_file)
encode_reviews(wr_imdb_test_file, data_imdb_test[:,0], data_imdb_test[:,-1], imdb_vocab_file)

In [10]:
# Function - unif classifier
def uniform_classifier(classes, y_test):
    from random import randint
    l = []
    for i in range(len(y_test)):
        rand = randint(0, len(classes)-1)
        l.append(classes[rand])
    y_pred = np.array(l, dtype=np.int16)
    y_test = np.array(y_test, dtype=np.int16)
    return metrics.f1_score(y_test, y_pred, average=None) # Get F1 score for each of the classes, thus Average=None

In [11]:
# Function - majority classifier
def majority_classifier(classes, y_train, y_test):
    counter = dict([(classes[i], 0) for i in range(len(classes))])

    for y in y_train:
        counter[y]+=1

    majority_class = max(counter, key=counter.get)
    y_pred = np.full((len(y_test),1), majority_class, dtype=np.int16)
    y_test = np.array(y_test, dtype=np.int16)
    
    return metrics.f1_score(y_test, y_pred, average=None)

In [12]:
# Binary bag of words: read encoding file and generate
def binary_bag_of_words(filename):
    features = 10000
    with open(filename, 'r') as f:
        lines = f.readlines()
        X = lil_matrix((len(lines), features), dtype=np.int32)
        for i in range(len(lines)):
            encodings = lines[i].split(" ")
            encodings[-1] = encodings[-1].split("\t")[0]

            if len(encodings) == 1 and encodings[0] == '':
                continue  # We have only 1 encoding, which means the review has no vocab words

            for j in range(len(encodings)):
                e = int(encodings[j])
                X[i,e] = 1
        return X

In [13]:
# Frequency bag of words - with division
def frequency_bag_of_words(filename):
    features = 10000
    with open(filename, 'r') as f:
        lines = f.readlines()
        X = lil_matrix((len(lines), features), dtype=np.int32)
        for i in range(len(lines)):
            encodings = lines[i].split(" ")
            encodings[-1] = encodings[-1].split("\t")[0]
            
            if len(encodings) == 1 and encodings[0] == '':
                continue  # We have only 1 encoding, which means the review has no vocab words
            
            for j in range(len(encodings)):
                e = int(encodings[j])
                X[i,e] += 1
        sum_vector = X.sum(axis=1)
        for i in range(len(sum_vector)):
            if sum_vector[i] == 0:
                sum_vector[i] = 1  #just so we don't divide by 0

        return lil_matrix(X / sum_vector, dtype=np.float64)

In [14]:
yelp_classes = [i for i in range(1,6)]
imdb_classes = [0,1]

In [15]:
unif = uniform_classifier(yelp_classes, data_yelp_test[:,-1].flatten())
maj = majority_classifier(yelp_classes, yelp_train_Y, data_yelp_test[:,-1].flatten()) # Will get an F1 error, since we'll have 0/0 in some cases

print(unif)
print(maj)
print(sum(unif)/len(unif))
print(sum(maj)/len(maj))

[ 0.07279029  0.15224913  0.15273775  0.25341841  0.27134725]
[ 0.         0.         0.         0.5196151  0.       ]
0.180508568834
0.103923019985


  'precision', 'predicted', average, warn_for)


In [16]:
# Yelp Binary Bag of Words

binary_bag_train = binary_bag_of_words(wr_yelp_train_file)
binary_bag_valid = binary_bag_of_words(wr_yelp_valid_file)
binary_bag_test = binary_bag_of_words(wr_yelp_test_file)

In [19]:
# Get data ready for cross gridsearchCV with validation data

yelp_train_valid_X = vstack([binary_bag_train, binary_bag_valid])
train_test_fold = np.full((binary_bag_train.shape[0]), 0)
valid_test_fold = np.full((binary_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

yelp_valid_Y = np.array(data_yelp_valid[:,-1], dtype=np.int32)

yelp_train_valid_Y = np.array(np.append(yelp_train_Y, yelp_valid_Y, axis=0), dtype=np.int32)

In [34]:
# Naive Bayes for Yelp Binary Bag of Words
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

# clf = BernoulliNB()
# tuned_parameters = [{'alpha': np.arange(0.01, 1.01, 0.01)}]
# ps = PredefinedSplit(test_fold=test_fold)

# clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
# clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

# y_pred = clf.predict(binary_bag_test)
maximum = -1
bestAlpha = -1
yelp_train_Y = np.array(yelp_train_Y, dtype=np.int32)

# for al in np.arange(0.01, 1.01, 0.01):
#     clf = BernoulliNB(alpha=al)
#     clf.fit(binary_bag_train, yelp_train_Y)
#     y_pred_train = clf.predict(binary_bag_train)
#     y_pred_valid = clf.predict(binary_bag_valid)
#     f1_t = metrics.f1_score(yelp_train_Y,y_pred_train, average=None)
#     f1_v = metrics.f1_score(yelp_valid_Y,y_pred_valid, average=None)
#     print(al)
#     print(np.mean(f1_t) + np.mean(f1_v))
#     if np.mean(f1_t) + np.mean(f1_v) > maximum:
#         maximum = np.mean(f1_t) + np.mean(f1_v)
#         bestAlpha = al
clf = BernoulliNB(alpha=bestAlpha)
clf.fit(binary_bag_train, yelp_train_Y)
y_pred = clf.predict(binary_bag_test)
# print(clf.best_estimator_)
# print(metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))

In [35]:
print(bestAlpha)
f1 = metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None)
print(f1)
print(np.mean(f1))

-1
[ 0.42798354  0.27355623  0.24254473  0.44773907  0.52538071]
0.383440855669


In [52]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 8000 in training + valid; so solve dual
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge', 'hinge'], 
     'dual':[True], 
    'C':np.arange(0.01, 1.01, 0.01),
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

y_pred = clf.predict(binary_bag_test)

print(metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))
print(metrics.accuracy_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred))
print(clf.best_estimator_)

[ 0.52702703  0.30927835  0.32793522  0.51590595  0.61371351]
0.5145
LinearSVC(C=0.02, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)


In [56]:
# Decision tree for yelp binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {'criterion': ['gini','entropy'],
#      'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150,None]
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

y_pred = clf.predict(binary_bag_test)

print(metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))
print(clf.best_estimator_)

[ 0.17021277  0.14534884  0.19649123  0.38294993  0.43589744]
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [59]:
# Yelp frequency bag of words
freq_bag_train = frequency_bag_of_words(wr_yelp_train_file)
freq_bag_valid = frequency_bag_of_words(wr_yelp_valid_file)
freq_bag_test = frequency_bag_of_words(wr_yelp_test_file)

In [64]:
# Get data ready for cross gridsearchCV with validation data

yelp_train_valid_X = vstack([freq_bag_train, freq_bag_valid])
train_test_fold = np.full((freq_bag_train.shape[0]), 0)
valid_test_fold = np.full((freq_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

yelp_valid_Y = np.array(data_yelp_valid[:,-1], dtype=np.int32)

yelp_train_valid_Y = np.array(np.append(yelp_train_Y, yelp_valid_Y, axis=0), dtype=np.int32)

In [66]:
# Naive Bayes for Yelp Binary Bag of Words
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
tuned_parameters = [{}]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(yelp_train_valid_X.todense(), yelp_train_valid_Y)

y_pred = clf.predict(freq_bag_test.todense())

print(clf.best_estimator_)
print(metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))

GaussianNB(priors=None)
[ 0.17940199  0.140625    0.21021021  0.33888487  0.37697161]


In [67]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 8000 in training + valid; so solve dual
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge', 'hinge'], 
     'dual':[True], 
    'C':np.arange(0.01, 1.01, 0.01),
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

y_pred = clf.predict(freq_bag_test)

print(clf.best_estimator_)
print(metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))

LinearSVC(C=0.17000000000000001, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
[ 0.41860465  0.21960784  0.16161616  0.42801556  0.62236988]


In [68]:
# Decision tree for yelp binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {'criterion': ['gini','entropy'],
#      'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150,None]
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(yelp_train_valid_X, yelp_train_valid_Y)

y_pred = clf.predict(freq_bag_test)

print(metrics.f1_score(np.array(data_yelp_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))
print(clf.best_estimator_)

[ 0.22875817  0.13031161  0.18671454  0.38787024  0.44070278]
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


## IMDB

In [90]:
unif_imdb = uniform_classifier(imdb_classes, data_imdb_test[:,-1].flatten())

print(unif_imdb)

[ 0.50151976  0.50136022]


In [91]:
# IMDB binary bag of words

binary_bag_train = binary_bag_of_words(wr_imdb_train_file)
binary_bag_valid = binary_bag_of_words(wr_imdb_valid_file)
binary_bag_test = binary_bag_of_words(wr_imdb_test_file)

In [92]:
# Get data ready for cross gridsearchCV with validation data

imdb_train_valid_X = vstack([binary_bag_train, binary_bag_valid])
train_test_fold = np.full((binary_bag_train.shape[0]), 0)
valid_test_fold = np.full((binary_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

imdb_valid_Y = np.array(data_imdb_valid[:,-1], dtype=np.int32)

imdb_train_valid_Y = np.array(np.append(imdb_train_Y, imdb_valid_Y, axis=0), dtype=np.int32)

In [93]:
# Naive Bayes for IMDB Binary Bag of Words
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

clf = BernoulliNB()
tuned_parameters = [{'alpha': np.arange(0.01, 1.01, 0.01)}]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

y_pred = clf.predict(binary_bag_test)

print(clf.best_estimator_)
print(metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))

BernoulliNB(alpha=0.96999999999999997, binarize=0.0, class_prior=None,
      fit_prior=True)
[ 0.83683619  0.8377214 ]


In [95]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 25000 in training + valid; so solve primal (which can't be used with hinge loss)
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge'], 
     'dual':[False], 
    'C':np.arange(0.01, 1.01, 0.01),
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

y_pred = clf.predict(binary_bag_test)

print(metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))
print(clf.best_estimator_)

[ 0.87187953  0.87354952]
LinearSVC(C=0.01, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [78]:
# Decision tree for yelp binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {'criterion': ['gini','entropy'],
#      'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150,None]
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

y_pred = clf.predict(binary_bag_test)

print(metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))
print(clf.best_estimator_)

[ 0.70212425  0.70331337]
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [96]:
# Yelp frequency bag of words
freq_bag_train = frequency_bag_of_words(wr_imdb_train_file)
freq_bag_valid = frequency_bag_of_words(wr_imdb_valid_file)
freq_bag_test = frequency_bag_of_words(wr_imdb_test_file)

In [99]:
# Get data ready for cross gridsearchCV with validation data

imdb_train_valid_X = vstack([freq_bag_train, freq_bag_valid])
train_test_fold = np.full((freq_bag_train.shape[0]), 0)
valid_test_fold = np.full((freq_bag_valid.shape[0]), 1)
test_fold = np.append(train_test_fold, valid_test_fold, axis=0)

imdb_valid_Y = np.array(data_imdb_valid[:,-1], dtype=np.int32)

imdb_train_valid_Y = np.array(np.append(imdb_train_Y, imdb_valid_Y, axis=0), dtype=np.int32)

In [100]:
# Naive Bayes for IMDB Binary Bag of Words
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
tuned_parameters = [{}]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(imdb_train_valid_X.todense(), imdb_train_valid_Y)

y_pred = clf.predict(freq_bag_test.todense())

print(clf.best_estimator_)
print(metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))

GaussianNB(priors=None)
[ 0.65187433  0.66867047]


In [101]:
# Linear SVC/SVM
from sklearn.svm import LinearSVC
clf = LinearSVC()

# We have 10000 features and 25000 in training + valid; so solve primal (can't use hinge loss with primal problem)
tuned_parameters = [
    {'penalty': ['l2'], 
     'loss':['squared_hinge'], 
     'dual':[False], 
    'C':np.arange(0.01, 1.01, 0.01),
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

y_pred = clf.predict(freq_bag_test)

print(clf.best_estimator_)
print(metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
[ 0.81205246  0.81417548]


In [84]:
# Decision tree for yelp binary bag of words
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

tuned_parameters = [
    {'criterion': ['gini','entropy'],
#      'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150,None]
    }]
ps = PredefinedSplit(test_fold=test_fold)

clf = GridSearchCV(clf, tuned_parameters, cv=ps, refit=True)
clf.fit(imdb_train_valid_X, imdb_train_valid_Y)

y_pred = clf.predict(freq_bag_test)

print(metrics.f1_score(np.array(data_imdb_test[:,-1].flatten(), dtype=np.int32), y_pred, average=None))
print(clf.best_estimator_)

[ 0.70260579  0.70026504]
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
