In [0]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.tree
import sklearn.svm

In [0]:
# read yelp and imdb csv files
yelp_test = pd.read_csv('given_yelp-test.txt', sep = "\t", header = None)
yelp_train = pd.read_csv('given_yelp-train.txt', sep = "\t", header = None)
yelp_valid = pd.read_csv('given_yelp-valid.txt', sep = "\t", header = None)

imdb_test = pd.read_csv('given_IMDB-test.txt', sep = "\t", header = None)
imdb_train = pd.read_csv('given_IMDB-train.txt', sep = "\t", header = None)
imdb_valid = pd.read_csv('given_IMDB-valid.txt', sep = "\t", header = None)

In [0]:
# question 1: convert the review to a fixed length vector representation(binary bag-of-words and frequency bag-of-words)
# convert both the datasets into both these representations

In [0]:
# pick top 10,000 words in the vocabulary and ignore the rest of the words
def pickTopWords(reviews):
    words = []
    topWords = []
    
    for review in reviews[0]:
        # required tasks: punctuation removal and lower-casing the words
        words.extend(review.lower().translate(str.maketrans("","", string.punctuation)).split(' '))
    
    ctr = Counter()
    for word in words:
        if (word != ''):
            ctr[word] += 1
        
    topWords = ctr.most_common(10000) # pick top 10,000 words
    return {tupl[0]: index for index, tupl in enumerate(topWords)}, [tupl[0] + '\t' + str(index) + '\t' + str(tupl[1])  for index, tupl in enumerate(topWords)]
    
# we only need to consider the training set
yelp_topWords, yelp_output = pickTopWords(yelp_train)
imdb_topWords, imdb_output = pickTopWords(imdb_train)

# save the vocabulary of the two datasets into .txt files
f = open("yelp-vocab.txt", 'w',encoding='utf-8')
ctr = 0
for line in yelp_output:
    if ctr == 0:  
        f.write(line)
    else:
        f.write('\n')
        f.write(line)
    ctr += 1
f.close()

f = open("IMDB-vocab.txt", 'w',encoding='utf-8')
ctr = 0
for line in imdb_output:
    if ctr == 0:  
        f.write(line)
    else:
        f.write('\n')
        f.write(line)
    ctr += 1
f.close()

In [0]:
# each word in the vocabulary has a corresponding numeric id and frequency all tab separated
def saveReviewsIDs(dic, reviews, filename):
    f = open(filename, 'w',encoding='utf-8')
    ctr = 0

    for review, categ in zip(reviews[0], reviews[1]):
        if ctr != 0:
            f.write('\n')

        r = review.lower().translate(str.maketrans("","", string.punctuation)).split(' ')

        for index, word in enumerate(r):
            if word in dic:
                if index != 0:
                    f.write(' ')
                f.write(str(dic[word]))
        f.write('\t')
        f.write(str(categ))
        ctr += 1

    f.close()

In [0]:
saveReviewsIDs(yelp_topWords, yelp_test, "yelp-test.txt")
saveReviewsIDs(yelp_topWords, yelp_train, "yelp-train.txt")
saveReviewsIDs(yelp_topWords, yelp_valid, "yelp-valid.txt")

saveReviewsIDs(imdb_topWords, imdb_test, "IMDB-test.txt")
saveReviewsIDs(imdb_topWords, imdb_train, "IMDB-train.txt")
saveReviewsIDs(imdb_topWords, imdb_valid, "IMDB-valid.txt")

In [0]:
# for each of the top 10000 words, there is one corresponding dimension in the feature vector 
# that is 1 if the example contains the word, and 0 otherwise
def bin_bow_vector_gen(topWords, reviews):
    m = []
    for review in reviews[0]:
        vector = [0] * len(topWords)
        for word in review:
            if word in topWords:
                vector[topWords[word]] = 1
        m.append(vector)
    return np.array(m)

# for each of the 10000 words, the corresponding feature is the frequency of occurrence of that word in the given review.
# calculate the frequency by summing the occurences of words in a review
# and then divide by the sum of occurrences of all 10000 words so that the vector for each example sums to 1
def freq_bow_vector_gen(topWords, reviews):
    m = []
    for review in reviews[0]:
        vector = [0] * len(topWords)
        for word in review:
            if word in topWords:
                vector[topWords[word]] += 1
                
        s = sum(vector)
        if s > 0:
            vector = np.divide(vector, s)
        m.append(vector)
    return np.array(m)

In [0]:
# convert both the datasets into both these representations
yelp_train_bin_bow = bin_bow_vector_gen(yelp_topWords, yelp_train)
yelp_test_bin_bow = bin_bow_vector_gen(yelp_topWords, yelp_test)
yelp_valid_bin_bow = bin_bow_vector_gen(yelp_topWords, yelp_valid)

yelp_train_freq_bow = freq_bow_vector_gen(yelp_topWords, yelp_train)
yelp_test_freq_bow = freq_bow_vector_gen(yelp_topWords, yelp_test)
yelp_valid_freq_bow = freq_bow_vector_gen(yelp_topWords, yelp_valid)

imdb_train_bin_bow = bin_bow_vector_gen(imdb_topWords, imdb_train)
imdb_test_bin_bow = bin_bow_vector_gen(imdb_topWords, imdb_test)
imdb_valid_bin_bow = bin_bow_vector_gen(imdb_topWords, imdb_valid)

imdb_train_freq_bow = freq_bow_vector_gen(imdb_topWords, imdb_train)
imdb_test_freq_bow = freq_bow_vector_gen(imdb_topWords, imdb_test)
imdb_valid_freq_bow = freq_bow_vector_gen(imdb_topWords, imdb_valid)

# end of question 1

In [0]:
# question 2: yelp dataset with binary bag-of-words representation
# use the F1-measure as the evaluation metric

In [0]:
# report the performance of the random classifier
random = np.random.choice([1,2,3,4,5], len(yelp_test[1]))
sklearn.metrics.f1_score(yelp_test[1], random, average = 'micro')

0.1955

In [0]:
# report the performance of the majority-class classifier
majority = np.argmax(np.bincount(yelp_train[1]))
majority_array = np.array([majority]*len(yelp_test[1]))
sklearn.metrics.f1_score(yelp_test[1], majority_array, average = 'micro')


0.351

In [0]:
# train Naive Bayes with Bernoulli Naive Bayes
BNB = sklearn.naive_bayes.BernoulliNB()
BNB.fit(yelp_train_bin_bow, yelp_train[1])
predictions = BNB.predict(yelp_test_bin_bow)
sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')

0.3645

In [0]:
# Naive Bayes hyper-parameter tuning
alphas = np.linspace(2, 4, 50)
f1s = []
for a in alphas:
    BNB = sklearn.naive_bayes.BernoulliNB(alpha = a)
    BNB.fit(yelp_train_bin_bow, yelp_train[1])
    predictions = BNB.predict(yelp_valid_bin_bow)
    f1s.append(sklearn.metrics.f1_score(yelp_valid[1], predictions, average = 'micro'))

bestAlpha = alphas[np.argmax(f1s)]
print("The best alpha is " + str(bestAlpha))
print("The maximum F-Measure for valid is " + str(np.max(f1s)))

BNB = sklearn.naive_bayes.BernoulliNB(alpha = bestAlpha)
BNB.fit(yelp_train_bin_bow, yelp_train[1])
predictions = BNB.predict(yelp_test_bin_bow)
print("The F-Measure for test is " + str(sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')))


The best alpha is 2.693877551020408
The maximum F-Measure for valid is 0.369
The F-Measure for test is 0.36250000000000004


In [0]:
# train Decision Trees
DT = sklearn.tree.DecisionTreeClassifier()
DT.fit(yelp_train_bin_bow, yelp_train[1])
predictions = DT.predict(yelp_test_bin_bow)
sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')

0.3215

In [0]:
# Decision Trees hyper-parameter tuning
f1s = []
criterion = ["gini", "entropy"]
splitter = ["best", "random"]

for c in criterion:
    for s in splitter:
        DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
        DT.fit(yelp_train_bin_bow, yelp_train[1])
        predictions = DT.predict(yelp_valid_bin_bow)
        f1s.append(sklearn.metrics.f1_score(yelp_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0 or arg == 1:
    c = "gini"
else:
    c = "entropy"
if arg == 0 or arg == 2:
    s = "best"
else:
    s = "random"
    
print("For criteria " + str(c) + " and splitter " + str(s))

DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
DT.fit(yelp_train_bin_bow, yelp_train[1])
predictions = DT.predict(yelp_test_bin_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')))


The highest F-Measure for valid is 0.313
For criteria entropy and splitter random
the F-Measure for test is 0.3265


In [0]:
# train Linear SVM
svm = sklearn.svm.LinearSVC()
svm.fit(yelp_train_bin_bow, yelp_train[1])
predictions = svm.predict(yelp_test_bin_bow)
sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')


0.37050000000000005

In [0]:
# Linear SVM hyper-parameter tuning
f1s = []
penalty = ["l1", "l2"]
loss = ["hinge", "squared_hinge"]

for p in penalty:
    if p == "l1": # combination of "l1" and ("hinge" or "squared_hinge") is not supported
        svm = sklearn.svm.LinearSVC(penalty = p, dual = False)
        svm.fit(yelp_train_bin_bow, yelp_train[1])
        predictions = svm.predict(yelp_valid_bin_bow)
        f1s.append(sklearn.metrics.f1_score(yelp_valid[1], predictions, average = 'micro'))
    else:
        for l in loss:
            svm = sklearn.svm.LinearSVC(penalty = p, loss = l)
            svm.fit(yelp_train_bin_bow, yelp_train[1])
            predictions = svm.predict(yelp_valid_bin_bow)
            f1s.append(sklearn.metrics.f1_score(yelp_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0:
    p = "l1"
    l = "-"
elif arg == 1:
    p = "l2"
    l = "hinge"
elif arg == 2:
    p = "l2"
    l = "squared_hinge"
    
print("For penalty " + str(p) + " and loss " + str(l))

if l == "-":
    svm = sklearn.svm.LinearSVC(penalty = p, dual = False)
else:
    svm = sklearn.svm.LinearSVC(penalty = p, loss = l)
svm.fit(yelp_train_bin_bow, yelp_train[1])
predictions = svm.predict(yelp_test_bin_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')))

# end of question 2

The highest F-Measure for valid is 0.347
For penalty l1 and loss -
the F-Measure for test is 0.3685


In [0]:
# question 3: repeat question 2 but with frequency bag-of-words representation

In [0]:
# train Naive Bayes with Gaussian Naive Bayes
GNB = sklearn.naive_bayes.GaussianNB()
GNB.fit(yelp_train_freq_bow, yelp_train[1])
predictions = GNB.predict(yelp_test_freq_bow)
sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')

0.2865

In [0]:
# train Decision Trees
DT = sklearn.tree.DecisionTreeClassifier()
DT.fit(yelp_train_freq_bow, yelp_train[1])
predictions = DT.predict(yelp_test_freq_bow)
sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')

0.29

In [0]:
# Decision Trees hyper-parameter tuning
f1s = []
criterion = ["gini", "entropy"]
splitter = ["best", "random"]

for c in criterion:
    for s in splitter:
        DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
        DT.fit(yelp_train_freq_bow, yelp_train[1])
        predictions = DT.predict(yelp_valid_freq_bow)
        f1s.append(sklearn.metrics.f1_score(yelp_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0 or arg == 1:
    c = "gini"
else:
    c = "entropy"
if arg == 0 or arg == 2:
    s = "best"
else:
    s = "random"
    
print("For criteria " + str(c) + " and splitter " + str(s))

DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
DT.fit(yelp_train_freq_bow, yelp_train[1])
predictions = DT.predict(yelp_test_freq_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')))

The highest F-Measure for valid is 0.28
For criteria entropy and splitter random
the F-Measure for test is 0.295


In [0]:
# train Linear SVM
svm = sklearn.svm.LinearSVC()
svm.fit(yelp_train_freq_bow, yelp_train[1])
predictions = svm.predict(yelp_test_freq_bow)
sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')

0.39050000000000007

In [0]:
# Linear SVM hyper-parameter tuning
f1s = []
penalty = ["l1", "l2"]
loss = ["hinge", "squared_hinge"]

for p in penalty:
    if p == "l1": # combination of "l1" and ("hinge" or "squared_hinge") is not supported
        svm = sklearn.svm.LinearSVC(penalty = p, dual = False)
        svm.fit(yelp_train_freq_bow, yelp_train[1])
        predictions = svm.predict(yelp_valid_freq_bow)
        f1s.append(sklearn.metrics.f1_score(yelp_valid[1], predictions, average = 'micro'))
    else:
        for l in loss:
            svm = sklearn.svm.LinearSVC(penalty = p, loss = l)
            svm.fit(yelp_train_freq_bow, yelp_train[1])
            predictions = svm.predict(yelp_valid_freq_bow)
            f1s.append(sklearn.metrics.f1_score(yelp_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0:
    p = "l1"
    l = "-"
elif arg == 1:
    p = "l2"
    l = "hinge"
elif arg == 2:
    p = "l2"
    l = "squared_hinge"
    
print("For penalty " + str(p) + " and loss " + str(l))

if l == "-":
    svm = sklearn.svm.LinearSVC(penalty = p, dual = False)
else:
    svm = sklearn.svm.LinearSVC(penalty = p, loss = l)
svm.fit(yelp_train_freq_bow, yelp_train[1])
predictions = svm.predict(yelp_test_freq_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(yelp_test[1], predictions, average = 'micro')))

# end of question 3

The highest F-Measure for valid is 0.377
For penalty l2 and loss squared_hinge
the F-Measure for test is 0.39050000000000007


In [0]:
# question 4.1: repeat question 2 but with IMDB dataset

In [0]:
# report the performance of the random classifier
rand = np.random.choice([0,1], len(imdb_test[1]))
sklearn.metrics.f1_score(imdb_test[1], rand, average = 'micro')

0.50272

In [0]:
# report the performance of the majority-class classifier
majority = np.argmax(np.bincount(imdb_train[1]))
majority_array = np.array([majority]*len(imdb_test[1]))
sklearn.metrics.f1_score(imdb_test[1], majority_array, average = 'micro')

0.5

In [0]:
# train Naive Bayes with Bernoulli Naive Bayes
BNB = sklearn.naive_bayes.BernoulliNB()
BNB.fit(imdb_train_bin_bow, imdb_train[1])
predictions = BNB.predict(imdb_test_bin_bow)
sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')

0.55116

In [0]:
# Naive Bayes hyper-parameter tuning
alphas = np.linspace(3, 5, 50)
f1s = []
for a in alphas:
    BNB = sklearn.naive_bayes.BernoulliNB(alpha = a)
    BNB.fit(imdb_train_bin_bow, imdb_train[1])
    predictions = BNB.predict(imdb_valid_bin_bow)
    f1s.append(sklearn.metrics.f1_score(imdb_valid[1], predictions, average = 'micro'))

bestAlpha = alphas[np.argmax(f1s)]
print("The best alpha is " + str(bestAlpha))
print("The maximum F-Measure for valid is " + str(np.max(f1s)))

BNB = sklearn.naive_bayes.BernoulliNB(alpha = bestAlpha)
BNB.fit(imdb_train_bin_bow, imdb_train[1])
predictions = BNB.predict(imdb_test_bin_bow)
print("The F-Measure for test is " + str(sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')))


The best alpha is 3.979591836734694
The maximum F-Measure for valid is 0.543
The F-Measure for test is 0.5512


In [0]:
# train Decision Trees
DT = sklearn.tree.DecisionTreeClassifier()
DT.fit(imdb_train_bin_bow, imdb_train[1])
predictions = DT.predict(imdb_test_bin_bow)
sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')


0.53972

In [0]:
# Decision Trees hyper-parameter tuning
f1s = []
criterion = ["gini", "entropy"]
splitter = ["best", "random"]

for c in criterion:
    for s in splitter:
        DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
        DT.fit(imdb_train_bin_bow, imdb_train[1])
        predictions = DT.predict(imdb_valid_bin_bow)
        f1s.append(sklearn.metrics.f1_score(imdb_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0 or arg == 1:
    c = "gini"
else:
    c = "entropy"
if arg == 0 or arg == 2:
    s = "best"
else:
    s = "random"
    
print("For criteria " + str(c) + " and splitter " + str(s))

DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
DT.fit(imdb_train_bin_bow, imdb_train[1])
predictions = DT.predict(imdb_test_bin_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')))


The highest F-Measure for valid is 0.537
For criteria entropy and splitter random
the F-Measure for test is 0.53988


In [0]:
# train Linear SVM
svm = sklearn.svm.LinearSVC()
svm.fit(imdb_train_bin_bow, imdb_train[1])
predictions = svm.predict(imdb_test_bin_bow)
sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')


0.55664

In [0]:
# Linear SVM hyper-parameter tuning
f1s = []
penalty = ["l1", "l2"]
loss = ["hinge", "squared_hinge"]

for p in penalty:
    if p == "l1": # combination of "l1" and ("hinge" or "squared_hinge") is not supported
        svm = sklearn.svm.LinearSVC(penalty = p, dual = False)
        svm.fit(imdb_train_bin_bow, imdb_train[1])
        predictions = svm.predict(imdb_valid_bin_bow)
        f1s.append(sklearn.metrics.f1_score(imdb_valid[1], predictions, average = 'micro'))
    else:
        for l in loss:
            svm = sklearn.svm.LinearSVC(penalty = p, loss = l)
            svm.fit(imdb_train_bin_bow, imdb_train[1])
            predictions = svm.predict(imdb_valid_bin_bow)
            f1s.append(sklearn.metrics.f1_score(imdb_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0:
    p = "l1"
    l = "-"
elif arg == 1:
    p = "l2"
    l = "hinge"
elif arg == 2:
    p = "l2"
    l = "squared_hinge"
    
print("For penalty " + str(p) + " and loss " + str(l))

if l == "-":
    svm = sklearn.svm.LinearSVC(penalty = p, dual = False)
else:
    svm = sklearn.svm.LinearSVC(penalty = p, loss = l)
svm.fit(imdb_train_bin_bow, imdb_train[1])
predictions = svm.predict(imdb_test_bin_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')))

# end of question 4.1 

The highest F-Measure for valid is 0.5508
For penalty l2 and loss hinge
the F-Measure for test is 0.55628


In [0]:
# question 4.2: repeat question 3 but with IMDB dataset

In [0]:
# train Naive Bayes with Gaussian Naive Bayes
GNB = sklearn.naive_bayes.GaussianNB()
GNB.fit(imdb_train_freq_bow, imdb_train[1])
predictions = GNB.predict(imdb_test_freq_bow)
sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')


0.51632

In [0]:
# train Decision Trees
DT = sklearn.tree.DecisionTreeClassifier()
DT.fit(imdb_train_freq_bow, imdb_train[1])
predictions = DT.predict(imdb_test_freq_bow)
sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')


0.536

In [0]:
# Decision Trees hyper-parameter tuning
f1s = []
criterion = ["gini", "entropy"]
splitter = ["best", "random"]

for c in criterion:
    for s in splitter:
        DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
        DT.fit(imdb_train_freq_bow, imdb_train[1])
        predictions = DT.predict(imdb_valid_freq_bow)
        f1s.append(sklearn.metrics.f1_score(imdb_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0 or arg == 1:
    c = "gini"
else:
    c = "entropy"
if arg == 0 or arg == 2:
    s = "best"
else:
    s = "random"
    
print("For criteria " + str(c) + " and splitter " + str(s))

DT = sklearn.tree.DecisionTreeClassifier(criterion = c, splitter = s)
DT.fit(imdb_train_freq_bow, imdb_train[1])
predictions = DT.predict(imdb_test_freq_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')))


The highest F-Measure for valid is 0.5493
For criteria entropy and splitter best
the F-Measure for test is 0.5408


In [0]:
# train Linear SVM
svm = sklearn.svm.LinearSVC()
svm.fit(imdb_train_freq_bow, imdb_train[1])
predictions = svm.predict(imdb_test_freq_bow)
sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')


0.61228

In [0]:
# Linear SVM hyper-parameter tuning
f1s = []
penalty = ["l1", "l2"]
loss = ["hinge", "squared_hinge"]

for p in penalty:
    if p == "l1": # combination of "l1" and ("hinge" or "squared_hinge") is not supported
        svm = sklearn.svm.LinearSVC(penalty=p, dual=False)
        svm.fit(imdb_train_freq_bow, imdb_train[1])
        predictions = svm.predict(imdb_valid_freq_bow)
        f1s.append(sklearn.metrics.f1_score(imdb_valid[1], predictions, average = 'micro'))
    else:
        for l in loss:
            svm = sklearn.svm.LinearSVC(penalty=p, loss=l)
            svm.fit(imdb_train_freq_bow, imdb_train[1])
            predictions = svm.predict(imdb_valid_freq_bow)
            f1s.append(sklearn.metrics.f1_score(imdb_valid[1], predictions, average = 'micro'))
        
highestF1 = np.max(f1s)
print("The highest F-Measure for valid is "+ str(highestF1))
arg = np.argmax(f1s)
if arg == 0:
    p = "l1"
    l = "-"
elif arg == 1:
    p = "l2"
    l = "hinge"
elif arg == 2:
    p = "l2"
    l = "squared_hinge"
    
print("For penalty " + str(p) + " and loss " + str(l))

if l == "-":
    svm = sklearn.svm.LinearSVC(penalty = p, dual = False)
else:
    svm = sklearn.svm.LinearSVC(penalty = p, loss = l)
svm.fit(imdb_train_freq_bow, imdb_train[1])
predictions = svm.predict(imdb_test_freq_bow)
print("the F-Measure for test is " + str(sklearn.metrics.f1_score(imdb_test[1], predictions, average = 'micro')))

# end of question 4.2


The highest F-Measure for valid is 0.6205
For penalty l1 and loss -
the F-Measure for test is 0.61676
