In [1]:
import pandas as pd 
import numpy as np
import math
from sklearn import feature_extraction
from sklearn import svm
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [2]:
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
def load_file(filename):

    X_2 = []
    X_5 = []
    y_2 = []
    y_5 = []

    with open(filename) as f:
        for i,line in enumerate(f):
            index = 0
            indices = []
            for char in line:
                if char == ',':
                    indices.append(index)
                    break
                index += 1
            value = line[0:index]
            tag = ''
            
            if float(value) > 0.8:
                tag = 4
            elif float(value) > 0.6:
                tag = 3
            elif float(value) > 0.4:
                tag = 2
            elif float(value) > 0.2:
                tag = 1
            else:
                tag = 0
            y_5.append(tag)
            words = tokenizer.tokenize(line[index+1:])
            for i in range(len(words)):
                words[i] = lemmatizer.lemmatize(words[i].lower())
            X_5.append(" ".join(words))
#             X_5.append(line[index+1:])
        
            if float(value) > 0.4 and float(value) <= 0.6:
                continue
            if float(value) > 0.6:
                tag = 1
            elif float(value) <= 0.4:
                tag = 0   
            words = tokenizer.tokenize(line[index+1:])
            for i in range(len(words)):
                words[i] = lemmatizer.lemmatize(words[i].lower())
            X_2.append(" ".join(words))
#             X_2.append(line[index+1:])
            y_2.append(tag)

            
    return X_2, X_5,y_2,y_5



In [20]:
X_train_2, X_train_5, y_train_2, y_train_5 = load_file('sst_train_sentences.csv')
X_dev_2, X_dev_5, y_dev_2, y_dev_5 = load_file('sst_dev.csv')
X_test_2, X_test_5, y_test_2, y_test_5 = load_file('sst_test.csv')

In [21]:
# 2 grain classification
vectorizer_2 = feature_extraction.text.CountVectorizer(ngram_range = (1,1), binary=True)
vectorizer_freq_2 = TfidfVectorizer(ngram_range = (1,2))
vectorizer_2.fit(X_train_2 + X_dev_2 + X_test_2)
vectorizer_freq_2.fit(X_train_2 + X_dev_2 + X_test_2)
#bag of words vectors
X_train_bow_2 = (vectorizer_2.transform(X_train_2))
X_dev_bow_2 = (vectorizer_2.transform(X_dev_2))
X_test_bow_2 = (vectorizer_2.transform(X_test_2))

# frequency bag of words vectors
X_train_fbow_2 = (vectorizer_freq_2.transform(X_train_2))
X_dev_fbow_2 = (vectorizer_freq_2.transform(X_dev_2))
X_test_fbow_2 = (vectorizer_freq_2.transform(X_test_2))


# for 5 grained classification
vectorizer_5 = feature_extraction.text.CountVectorizer(ngram_range = (1,1), binary=True)
vectorizer_freq_5 = TfidfVectorizer(ngram_range = (1,1))
vectorizer_5.fit(X_train_5 + X_dev_5 + X_test_5)
vectorizer_freq_5.fit(X_train_5 + X_dev_5 + X_test_5)
#bag of words vectors
X_train_bow_5 = (vectorizer_5.transform(X_train_5))
X_dev_bow_5 = (vectorizer_5.transform(X_dev_5))
X_test_bow_5 = (vectorizer_5.transform(X_test_5))

# frequency bag of words vectors
X_train_fbow_5 = (vectorizer_freq_5.transform(X_train_5))
X_dev_fbow_5 = (vectorizer_freq_5.transform(X_dev_5))
X_test_fbow_5 = (vectorizer_freq_5.transform(X_test_5))


In [22]:
# 2 grain classification, BBOW
# unigram bag of words
alpha_values = np.linspace(1e-5, 1, 2000)
bestAlpha = 0
bestValidError = 0
for alpha in alpha_values:
    clf = BernoulliNB(alpha=alpha)
    clf.fit(X_train_bow_2, y_train_2)
    y_pred = clf.predict(X_dev_bow_2)
    score = metrics.accuracy_score(y_dev_2, y_pred)
    if score>bestValidError :
        bestValidError = score
        bestAlpha = alpha

X_final = vectorizer_2.transform(X_train_2 + X_dev_2)
y_final = y_train_2 + y_dev_2
clf = BernoulliNB(alpha=bestAlpha)
clf.fit(X_final, y_final)
y_pred = clf.predict(X_test_bow_2)
score = metrics.accuracy_score(y_test_2, y_pred)
  
print(bestAlpha)
print(score)


0.8954487693846923
0.8165842943437671


In [129]:
# 5 grain classification, BBOW
# unigram bag of words
alpha_values = np.linspace(1e-5, 2, 2000)
bestAlpha = 0
bestValidError = 0
for alpha in alpha_values:
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X_train_bow_5, y_train_5)
    y_pred = clf.predict(X_dev_bow_5)
    score = metrics.accuracy_score(y_dev_5, y_pred)
    if score>bestValidError :
        bestValidError = score
        bestAlpha = alpha

X_final = vectorizer_5.transform(X_train_5 + X_dev_5)
y_final = y_train_5 + y_dev_5
clf = MultinomialNB(alpha=bestAlpha)
clf.fit(X_final, y_final)
y_pred = clf.predict(X_test_bow_5)
score = metrics.accuracy_score(y_test_5, y_pred)
  
print(bestAlpha)
print(score)

1.4237147373686845
0.4063348416289593


In [130]:
# # 2 grain classification
# #frequency unigram bag of words
# smoothing_values = np.linspace(1e-5, 1, 20)
# bestValue =0
# best = 0
# for value in smoothing_values:
#     #print("smoothing : " + str(value))
#     clf = GaussianNB(var_smoothing=value)
#     clf.fit(X_train_fbow_2.toarray(), y_train_2)
#     y_pred = clf.predict(X_dev_fbow_2.toarray())
#     score = metrics.accuracy_score(y_dev_2, y_pred)
#     print("accuracy : " + str(score))
#     if score > best:
#         best = score
#         bestValue = value

        
# X_final = vectorizer_freq_2.transform(X_train_2 + X_dev_2)
# y_final = y_train_2 + y_dev_2
# clf = GaussianNB(var_smoothing=bestValue)
# clf.fit(X_final.toarray(), y_final)
# y_pred = clf.predict(X_test_fbow_2.toarray())
# score = metrics.accuracy_score(y_test_2, y_pred)
# print("best test result for smoothing : " + str(bestValue))
# print(score)

In [131]:
# # 5 grain classification
# #frequency unigram bag of words
# smoothing_values = np.linspace(1e-5, 1, 30)
# bestValue =0
# best = 0
# for value in smoothing_values:
#     print("smoothing : " + str(value))
#     clf = GaussianNB(var_smoothing=value)
#     clf.fit(X_train_fbow_5.toarray(), y_train_5)
#     y_pred = clf.predict(X_dev_fbow_5.toarray())
#     score = metrics.accuracy_score(y_dev_5, y_pred)
#     print("accuracy : " + str(score))
#     if score > best:
#         best = score
#         bestValue = value

# X_final = vectorizer_freq_5.transform(X_train_5 + X_dev_5)
# y_final = y_train_5 + y_dev_5
# clf = GaussianNB(var_smoothing=bestValue)
# clf.fit(X_final.toarray(), y_final)
# y_pred = clf.predict(X_test_fbow_5.toarray())
# score = metrics.accuracy_score(y_test_5, y_pred)
# print("best test result for smoothing : " + str(bestValue))
# print(score)

In [132]:
# 2 grain classification
vectorizer_2 = feature_extraction.text.CountVectorizer(ngram_range = (1,2), binary=True)
vectorizer_freq_2 = TfidfVectorizer(ngram_range = (1,2))
vectorizer_2.fit(X_train_2 + X_dev_2 + X_test_2)
vectorizer_freq_2.fit(X_train_2 + X_dev_2 + X_test_2)
#bag of words vectors
X_train_bow_2 = (vectorizer_2.transform(X_train_2))
X_dev_bow_2 = (vectorizer_2.transform(X_dev_2))
X_test_bow_2 = (vectorizer_2.transform(X_test_2))

# frequency bag of words vectors
X_train_fbow_2 = (vectorizer_freq_2.transform(X_train_2))
X_dev_fbow_2 = (vectorizer_freq_2.transform(X_dev_2))
X_test_fbow_2 = (vectorizer_freq_2.transform(X_test_2))


# for 5 grained classification
vectorizer_5 = feature_extraction.text.CountVectorizer(ngram_range = (1,2), binary=True)
vectorizer_freq_5 = TfidfVectorizer(ngram_range = (1,1))
vectorizer_5.fit(X_train_5 + X_dev_5 + X_test_5)
vectorizer_freq_5.fit(X_train_5 + X_dev_5 + X_test_5)
#bag of words vectors
X_train_bow_5 = (vectorizer_5.transform(X_train_5))
X_dev_bow_5 = (vectorizer_5.transform(X_dev_5))
X_test_bow_5 = (vectorizer_5.transform(X_test_5))

# frequency bag of words vectors
X_train_fbow_5 = (vectorizer_freq_5.transform(X_train_5))
X_dev_fbow_5 = (vectorizer_freq_5.transform(X_dev_5))
X_test_fbow_5 = (vectorizer_freq_5.transform(X_test_5))

In [133]:
# 2 grain classification, BBOW
# bigram bag of words
alpha_values = np.linspace(1e-5, 1, 2000)
bestAlpha = 0
bestValidError = 0
for alpha in alpha_values:
    clf = BernoulliNB(alpha=alpha)
    clf.fit(X_train_bow_2, y_train_2)
    y_pred = clf.predict(X_dev_bow_2)
    score = metrics.accuracy_score(y_dev_2, y_pred)
    if score>bestValidError :
        bestValidError = score
        bestAlpha = alpha

X_final = vectorizer_2.transform(X_train_2 + X_dev_2)
y_final = y_train_2 + y_dev_2
clf = BernoulliNB(alpha=bestAlpha)
clf.fit(X_final, y_final)
y_pred = clf.predict(X_test_bow_2)
score = metrics.accuracy_score(y_test_2, y_pred)
  
print(bestAlpha)
print(score)

0.6163119909954977
0.8264689730917079


In [134]:
# 5 grain classification, BBOW
# bigram bag of words
alpha_values = np.linspace(1e-5, 2, 2000)
bestAlpha = 0
bestValidError = 0
for alpha in alpha_values:
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X_train_bow_5, y_train_5)
    y_pred = clf.predict(X_dev_bow_5)
    score = metrics.accuracy_score(y_dev_5, y_pred)
    if score>bestValidError :
        bestValidError = score
        bestAlpha = alpha

X_final = vectorizer_5.transform(X_train_5 + X_dev_5)
y_final = y_train_5 + y_dev_5
clf = MultinomialNB(alpha=bestAlpha)
clf.fit(X_final, y_final)
y_pred = clf.predict(X_test_bow_5)
score = metrics.accuracy_score(y_test_5, y_pred)
  
print(bestAlpha)
print(score)

0.42922246123061536
0.39683257918552034


In [135]:
# # 2 grain classification
# #frequency bigram bag of words
# smoothing_values = np.linspace(1e-5, 1, 20)
# bestValue =0
# best = 0
# for value in smoothing_values:
#     #print("smoothing : " + str(value))
#     clf = GaussianNB(var_smoothing=value)
#     clf.fit(X_train_fbow_2.toarray(), y_train_2)
#     y_pred = clf.predict(X_dev_fbow_2.toarray())
#     score = metrics.accuracy_score(y_dev_2, y_pred)
#     print("accuracy : " + str(score))
#     if score > best:
#         best = score
#         bestValue = value

        
# X_final = vectorizer_freq_2.transform(X_train_2 + X_dev_2)
# y_final = y_train_2 + y_dev_2
# clf = GaussianNB(var_smoothing=bestValue)
# clf.fit(X_final.toarray(), y_final)
# y_pred = clf.predict(X_test_fbow_2.toarray())
# score = metrics.accuracy_score(y_test_2, y_pred)
# print("best test result for smoothing : " + str(bestValue))
# print(score)

In [136]:
# # 5 grain classification
# #frequency bigram bag of words
# smoothing_values = np.linspace(1e-5, 1, 30)
# bestValue =0
# best = 0
# for value in smoothing_values:
#     print("smoothing : " + str(value))
#     clf = GaussianNB(var_smoothing=value)
#     clf.fit(X_train_fbow_5.toarray(), y_train_5)
#     y_pred = clf.predict(X_dev_fbow_5.toarray())
#     score = metrics.accuracy_score(y_dev_5, y_pred)
#     print("accuracy : " + str(score))
#     if score > best:
#         best = score
#         bestValue = value

# X_final = vectorizer_freq_5.transform(X_train_5 + X_dev_5)
# y_final = y_train_5 + y_dev_5
# clf = GaussianNB(var_smoothing=bestValue)
# clf.fit(X_final.toarray(), y_final)
# y_pred = clf.predict(X_test_fbow_5.toarray())
# score = metrics.accuracy_score(y_test_5, y_pred)
# print("best test result for smoothing : " + str(bestValue))
# print(score)