In [17]:
import pandas as pd 
import numpy as np
import math
from sklearn import feature_extraction
from sklearn import svm
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
def load_file(filename):

    X_2 = []
    X_5 = []
    y_2 = []
    y_5 = []

    with open(filename) as f:
        for i,line in enumerate(f):
            index = 0
            indices = []
            for char in line:
                if char == ',':
                    indices.append(index)
                    break
                index += 1
            value = line[0:index]
            tag = ''
            
            if float(value) > 0.8:
                tag = 4
            elif float(value) > 0.6:
                tag = 3
            elif float(value) > 0.4:
                tag = 2
            elif float(value) > 0.2:
                tag = 1
            else:
                tag = 0
            y_5.append(tag)
            X_5.append(line[index+1:])
        
            if float(value) > 0.4 and float(value) <= 0.6:
                continue
            if float(value) > 0.6:
                tag = 1
            elif float(value) <= 0.4:
                tag = 0
                
            text = line[index+1:]
            y_2.append(tag)
            X_2.append(text)
            
    return X_2, X_5,y_2,y_5



In [47]:
X_train_2, X_train_5, y_train_2, y_train_5 = load_file('sst_train_sentences.csv')
X_dev_2, X_dev_5, y_dev_2, y_dev_5 = load_file('sst_dev.csv')
X_test_2, X_test_5, y_test_2, y_test_5 = load_file('sst_test.csv')

In [55]:
# 2 grain classification
vectorizer_2 = feature_extraction.text.CountVectorizer(ngram_range = (1,1), binary=True)
vectorizer_freq_2 = TfidfVectorizer(ngram_range = (1,1))
vectorizer_2.fit(X_train_2 + X_dev_2 + X_test_2)
vectorizer_freq_2.fit(X_train_2 + X_dev_2 + X_test_2)
#bag of words vectors
X_train_bow_2 = (vectorizer_2.transform(X_train_2)).toarray()
X_dev_bow_2 = (vectorizer_2.transform(X_dev_2)).toarray()
X_test_bow_2 = (vectorizer_2.transform(X_test_2)).toarray()

# frequency bag of words vectors
X_train_fbow_2 = (vectorizer_freq_2.transform(X_train_2)).toarray()
X_dev_fbow_2 = (vectorizer_freq_2.transform(X_dev_2)).toarray()
X_test_fbow_2 = (vectorizer_freq_2.transform(X_test_2)).toarray()


# for 5 grained classification
vectorizer_5 = feature_extraction.text.CountVectorizer(ngram_range = (1,1), binary=True)
vectorizer_freq_5 = TfidfVectorizer(ngram_range = (1,1))
vectorizer_5.fit(X_train_5 + X_dev_5 + X_test_5)
vectorizer_freq_5.fit(X_train_5 + X_dev_5 + X_test_5)
#bag of words vectors
X_train_bow_5 = (vectorizer_5.transform(X_train_5)).toarray()
X_dev_bow_5 = (vectorizer_5.transform(X_dev_5)).toarray()
X_test_bow_5 = (vectorizer_5.transform(X_test_5)).toarray()

# frequency bag of words vectors
X_train_fbow_5 = (vectorizer_freq_5.transform(X_train_5)).toarray()
X_dev_fbow_5 = (vectorizer_freq_5.transform(X_dev_5)).toarray()
X_test_fbow_5 = (vectorizer_freq_5.transform(X_test_5)).toarray()


yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes


In [53]:
# 2 grain classification, BBOW
# bag of words
alpha_values = [5,3,1,0.7, 0.5, 0.3, 0.2,0.1]
bestAlpha = 0
bestValidError = 0
for alpha in alpha_values:
    print("alpha : " + str(alpha))
    clf = BernoulliNB(alpha=alpha)
    clf.fit(X_train_bow_2, y_train_2)
    y_pred = clf.predict(X_dev_bow_2)
    score = metrics.accuracy_score(y_dev_2, y_pred)
    print("accuracy")
    print(score)
    if score>bestValidError :
        bestValidError = score
        bestAlpha = alpha

clf = MultinomialNB(alpha=bestAlpha)
clf.fit(X_train_bow_2, y_train_2)
y_pred = clf.predict(X_test_bow_2)
score = metrics.accuracy_score(y_test_2, y_pred)
  
print(bestAlpha)
print(score)


alpha : 5
accuracy
0.7064220183486238
alpha : 3
accuracy
0.7637614678899083
alpha : 1
accuracy
0.801605504587156
alpha : 0.7
accuracy
0.8004587155963303
alpha : 0.5
accuracy
0.7947247706422018
alpha : 0.3
accuracy
0.7901376146788991
alpha : 0.2
accuracy
0.7878440366972477
alpha : 0.1
accuracy
0.7786697247706422
1
0.8171334431630972


In [None]:
# 5 grain classification, BBOW
# bag of words
alpha_values = [5,3,1,0.7, 0.5, 0.3, 0.1, 0.07, 0.05, 0.03, 0.02, 0.01, 0.05, 0.001]
bestAlpha = 0
bestValidError = 0
for alpha in alpha_values:
    print("alpha : " + str(alpha))
    clf = BernoulliNB(alpha=alpha)
    clf.fit(X_train_bow_5, y_train_5)
    y_pred = clf.predict(X_dev_bow_5)
    score = metrics.accuracy_score(y_dev_5, y_pred)
    print("accuracy")
    print(score)
    if score>bestValidError :
        bestValidError = score
        bestAlpha = alpha

clf = MultinomialNB(alpha=bestAlpha)
clf.fit(X_train_bow_5, y_train_5)
y_pred = clf.predict(X_test_bow_5)
score = metrics.accuracy_score(y_test_5, y_pred)
  
print(bestAlpha)
print(score)

alpha : 5
accuracy
0.30636363636363634
alpha : 3
accuracy
0.3472727272727273
alpha : 1
accuracy
0.36363636363636365
alpha : 0.7
accuracy
0.3609090909090909
alpha : 0.5


In [None]:
# 2 grain classification
#frequency bag of words
smoothing_values = [1, 0.1,0.05, 0.03, 0.01, 0.008, 0.005]
bestValue =0
best = 0
for value in smoothing_values:
    print("smoothing : " + str(value))
    clf = GaussianNB(var_smoothing=value)
    clf.fit(X_train_fbow_2, y_train_2)
    y_pred = clf.predict(X_dev_fbow_2)
    score = metrics.accuracy_score(y_dev_2, y_pred)
    print("accuracy : " + str(score))
    if score > best:
        best = score
        bestValue = value

clf = GaussianNB(var_smoothing=bestValue)
clf.fit(X_train_fbow_2, y_train_2)
y_pred = clf.predict(X_test_fbow_2)
score = metrics.accuracy_score(y_test_2, y_pred)
print("best test result for smoothing : " + str(bestValue))
print(score)

In [45]:
# 5 grain classification
#frequency bag of words
smoothing_values = [1, 0.1,0.05, 0.03, 0.01, 0.008, 0.005]
bestValue =0
best = 0
for value in smoothing_values:
    print("smoothing : " + str(value))
    clf = GaussianNB(var_smoothing=value)
    clf.fit(X_train_fbow_5, y_train_5)
    y_pred = clf.predict(X_dev_fbow_5)
    score = metrics.accuracy_score(y_dev_5, y_pred)
    print("accuracy : " + str(score))
    if score > best:
        best = score
        bestValue = value

clf = GaussianNB(var_smoothing=bestValue)
clf.fit(X_train_fbow_5, y_train_5)
y_pred = clf.predict(X_test_fbow_5)
score = metrics.accuracy_score(y_test_5, y_pred)
print("best test result for smoothing : " + str(bestValue))
print(score)

smoothing : 1
accuracy : 0.2863636363636364
smoothing : 0.1
accuracy : 0.2863636363636364
smoothing : 0.05
accuracy : 0.29454545454545455
smoothing : 0.03
accuracy : 0.3009090909090909
smoothing : 0.01
accuracy : 0.31
smoothing : 0.008
accuracy : 0.31363636363636366
smoothing : 0.005
accuracy : 0.3090909090909091
best test result for smoothing : 0.008
0.3054298642533937
