In [1]:
import pickle

In [2]:
with open('argument_words.pickle', 'rb') as handle:
    common_words = pickle.load(handle)

## finding the most_common words

In [3]:
from nltk import FreqDist

In [4]:
def freq_words(number, words):
    '''
    return the words which the frequency is higher than the given number
    number: the lowest value of frequency
    words: the cleared words list
    '''
    word_freq = FreqDist(words).most_common(1000)
    words_list = [key for (key, value) in word_freq if value > number]
    return words_list

In [5]:
input_words = common_words['all']
# word_list = freq_words(28, input_words)
word_list = freq_words(10, input_words)

## input the labeled sents 

In [6]:
with open('argument_words.pickle', 'rb') as handle:
    common_words = pickle.load(handle)

In [9]:
with open('labeled_essay_dics.pickle', 'rb') as handle:
    label_sents = pickle.load(handle)

In [10]:
# compute and store and the {sents: label} into one dic
all_label_sents = {}
for key in label_sents.keys():
    this_essay = label_sents[key]
    for s in this_essay.keys():
        all_label_sents[s] = this_essay[s]

## sents cleaning 

In [11]:
import re
import itertools
from collections import Counter

In [12]:
"""
Original taken from https://github.com/dennybritz/cnn-text-classification-tf
"""


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [13]:
def clear_split_str(s):
    strip_s = s.strip()
    clear_s = clean_str(strip_s)
    s_text = clear_s.split(" ")
    return s_text

## Buiding bag-of-words vector

In [14]:
def get_BOW(text):
    BOW = {}
    for word in text:
        BOW[word] = BOW.get(word,0) + 1
    return BOW

In [15]:
X = []
y = []
for key in all_label_sents.keys():
    new_key = []
    for w in clear_split_str(key):
        if w in word_list:
            new_key.append(w)
    X.append(get_BOW(new_key))
    y.append(all_label_sents[key])

In [16]:
print (y[1:10])

['Premise', 'Empty', 'Premise', 'Premise', 'Empty', 'Premise', 'Empty', 'Empty', 'MajorClaim']


## vectorize the data 

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction import DictVectorizer

In [18]:
vectorizer = DictVectorizer()

In [19]:
X_vector = vectorizer.fit_transform(X)

In [20]:
s = "i am ok"

## train test split

In [21]:
from sklearn.cross_validation import train_test_split



In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_vector, 
                                                   y,
                                                   test_size = 0.2
                                                   )

In [23]:
print ((X_train.shape))
print ((X_test.shape))
print (len(y_train))
print (len(y_test))

(5661, 1000)
(1416, 1000)
5661
1416


In [24]:
print (FreqDist(y_test).most_common(5))

[('Premise', 683), ('Empty', 316), ('Claim', 273), ('MajorClaim', 144)]


In [25]:
print (float(719/(719+302+276+119)))

0.507768361581921


# Models

## MultinomialNB

In [26]:
print ("the (alpha, accuracy) pairs in MultinomialNB:")
i = 0.1 
best_accuracy = 0.0
best_alpha = 0.0

for alpha_value in range(1,30):
    clf = MultinomialNB(alpha=alpha_value*0.1)
    clf.fit(X_train, y_train)
    prediciton = clf.predict(X_test)
    accuracy = accuracy_score(y_test,prediciton)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_alpha = alpha_value*0.1
    print (alpha_value*0.1, accuracy)

the (alpha, accuracy) pairs in MultinomialNB:
0.1 0.60593220339
0.2 0.608050847458
0.30000000000000004 0.608757062147
0.4 0.608757062147
0.5 0.608757062147
0.6000000000000001 0.605225988701
0.7000000000000001 0.608050847458
0.8 0.611581920904
0.9 0.612994350282
1.0 0.612994350282
1.1 0.61511299435
1.2000000000000002 0.613700564972
1.3 0.612994350282
1.4000000000000001 0.612288135593
1.5 0.610875706215
1.6 0.611581920904
1.7000000000000002 0.612288135593
1.8 0.610875706215
1.9000000000000001 0.608757062147
2.0 0.608757062147
2.1 0.610875706215
2.2 0.610875706215
2.3000000000000003 0.612288135593
2.4000000000000004 0.612994350282
2.5 0.613700564972
2.6 0.613700564972
2.7 0.61581920904
2.8000000000000003 0.614406779661
2.9000000000000004 0.611581920904


In [27]:
print (FreqDist(prediciton).most_common(5))

[('Premise', 836), ('Empty', 243), ('Claim', 192), ('MajorClaim', 145)]


## LogisticRegression

In [28]:
print ("the (alpha, accuracy) pairs in LogisticRegression:")
best_accuracy = 0.0
best_C = 0.0
for C_value in range(1,30):
    clf1 = LogisticRegression(C=C_value*0.1)
    clf1.fit(X_train, y_train)
    prediciton2 = clf1.predict(X_test)
    accuracy = accuracy_score(y_test,prediciton2)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_C = C_value*0.1
    print (C_value*0.1, accuracy)

the (alpha, accuracy) pairs in LogisticRegression:
0.1 0.635593220339
0.2 0.636299435028
0.30000000000000004 0.641242937853
0.4 0.637711864407
0.5 0.641949152542
0.6000000000000001 0.639830508475
0.7000000000000001 0.637005649718
0.8 0.638418079096
0.9 0.633474576271
1.0 0.63418079096
1.1 0.63488700565
1.2000000000000002 0.63418079096
1.3 0.635593220339
1.4000000000000001 0.635593220339
1.5 0.636299435028
1.6 0.63488700565
1.7000000000000002 0.63418079096
1.8 0.632062146893
1.9000000000000001 0.632768361582
2.0 0.629943502825
2.1 0.628531073446
2.2 0.627824858757
2.3000000000000003 0.626412429379
2.4000000000000004 0.626412429379
2.5 0.625
2.6 0.624293785311
2.7 0.623587570621
2.8000000000000003 0.622175141243
2.9000000000000004 0.621468926554


In [29]:
print (FreqDist(prediciton2).most_common(5))

[('Premise', 780), ('Empty', 290), ('Claim', 220), ('MajorClaim', 126)]


In [31]:
# with open('logisticRegression_model.pickle', 'wb') as handle:
#     pickle.dump([clf1, X_test, y_test], handle, protocol=pickle.HIGHEST_PROTOCOL)

## SVM 

In [32]:
from sklearn import svm

In [33]:
# clf = svm.SVC(verbose=True)
clf3 = svm.LinearSVC()

In [34]:
clf3.fit(X_train, y_train) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [35]:
prediciton3 = clf3.predict(X_test)

In [36]:
accuracy = accuracy_score(y_test,prediciton3)
print (accuracy)

0.60593220339


In [37]:
print (FreqDist(prediciton3).most_common(5))

[('Premise', 755), ('Empty', 299), ('Claim', 221), ('MajorClaim', 141)]


In [38]:
import pandas as pd
comment_path = 'comment_sent.csv'
label_comments_data = pd.read_csv(comment_path,encoding = "ISO-8859-1")
label_sents = label_comments_data['sentence']
label_components = label_comments_data['label']

In [39]:
# print (label_components)

In [40]:
sents_vector = [clear_split_str(item) for item in label_sents]    

In [41]:
new_X = []
for key in sents_vector:
    new_key = []
    for w in key:
        if w in word_list:
            new_key.append(w)
    new_X.append(get_BOW(new_key))

In [42]:
# print (new_X)

In [43]:
new_X_vector_label = vectorizer.fit_transform(new_X)

In [44]:

new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X_vector_label, 
                                                   label_components,
                                                   test_size = 0.2
                                                   )

In [45]:
clf3.fit(new_X_train, new_y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [46]:
print (new_X_train.shape, new_X_test.shape)

(79, 345) (20, 345)


In [None]:
accuracy = accuracy_score(y_test,prediciton3)
print (accuracy)