In [1]:
import numpy as np
import pandas as pd
# Read the tab-separated train/test splits into DataFrames.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('./input/train.tsv', './input/test.tsv')
)

In [2]:
# Show the first ten rows of the training frame as a quick sanity check.
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [3]:
from nltk.stem import WordNetLemmatizer
import re
lemmatizer = WordNetLemmatizer()

# Matches a run of whitespace / ASCII punctuation, or a run of full-width
# punctuation. Written as a raw string: the previous non-raw literal relied on
# invalid escape sequences ("\s", "\.", "\!", "\/"), which Python reports as
# DeprecationWarning / SyntaxWarning. The compiled regex is unchanged.
_PUNCTUATION_RUN = re.compile(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")


def clean_data(text):
    """Normalize one phrase for n-gram extraction.

    Steps, in order:
      1. replace every run of the punctuation/whitespace characters matched by
         ``_PUNCTUATION_RUN`` with a single space;
      2. replace hyphens with spaces;
      3. lower-case and split on whitespace;
      4. lemmatize each token with ``lemmatizer.lemmatize`` (no POS tag given).

    Args:
        text: the raw phrase (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = _PUNCTUATION_RUN.sub(" ", text)
    text = text.replace("-", " ")
    tokens = text.lower().split()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)


train_data = train['Phrase'].apply(clean_data)
test_data = test['Phrase'].apply(clean_data)

In [4]:
# coding:utf-8
import numpy as np

class Ngram():
    """Binary bag-of-n-grams vectorizer for whitespace-tokenized sentences.

    ``fit`` builds ``vocabulary_`` (n-gram string -> column index) and
    ``transform`` turns sentences into a dense 0/1 presence matrix.

    Args:
        max_features: keep only this many most frequent n-grams; a falsy value
            (``None`` or ``0``) keeps every n-gram seen during ``fit``.
        ngram: the n-gram order (1 = unigrams, 2 = bigrams, ...).

    Raises:
        ValueError: if ``ngram`` is smaller than 1.
    """

    def __init__(self, max_features=None, ngram=1):
        if ngram < 1:
            raise ValueError("ngram must be >= 1, got %r" % (ngram,))
        self.max_features = max_features  # maximum number of features
        self.ngram = ngram

    def getngrams(self, text, n):
        """Count every n-gram of order ``n`` in an iterable of sentences.

        Args:
            text: iterable of whitespace-separated strings.
            n: n-gram order.

        Returns:
            dict mapping each n-gram (tokens joined by one space) to the
            number of times it occurs. Sentences with fewer than ``n`` tokens
            contribute nothing.
        """
        output = {}
        for sentence in text:
            tokens = sentence.split()
            for start in range(len(tokens) + 1 - n):  # slide a window of n tokens
                gram = " ".join(tokens[start:start + n])
                # First sighting counts as 1 (it used to be recorded as 0).
                output[gram] = output.get(gram, 0) + 1
        return output

    def fit(self, data):
        """Build ``self.vocabulary_`` from an iterable of sentences.

        When ``max_features`` is set, only the ``max_features`` most frequent
        n-grams are kept; ties are resolved in favour of the n-gram seen first
        (the sort is stable). Column indices are assigned in first-occurrence
        order of the kept n-grams.
        """
        counts = self.getngrams(data, self.ngram)
        kept = counts
        if self.max_features:
            # Rank by frequency and keep the top max_features n-grams.
            ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
            kept = {gram for gram, _ in ranked[:self.max_features]}
        # Encode: walk n-grams in first-occurrence order and number the kept ones.
        self.vocabulary_ = {}
        for gram in counts:
            if gram in kept:
                self.vocabulary_[gram] = len(self.vocabulary_)

    def transform(self, data):
        """Encode sentences as a dense presence matrix.

        Args:
            data: sized iterable of whitespace-separated strings (a list or a
                pandas Series; rows are taken in iteration order, so the
                Series index does not matter).

        Returns:
            ``numpy`` float array of shape ``(len(data), len(vocabulary_))``
            where entry ``[r, c]`` is 1 if n-gram ``c`` occurs in sentence
            ``r`` and 0 otherwise.
        """
        array = np.zeros([len(data), len(self.vocabulary_)])
        for row, sentence in enumerate(data):
            tokens = sentence.split()
            for start in range(len(tokens) + 1 - self.ngram):  # slide a window of n tokens
                column = self.vocabulary_.get(" ".join(tokens[start:start + self.ngram]))
                if column is not None:
                    array[row, column] = 1

        return array

In [5]:
# Feature matrices: three consecutive 1000-column groups holding the
# 1-gram, 2-gram and 3-gram presence features respectively.
# NOTE(review): the vocabulary is fit on train + test phrases together.
train_text = np.zeros((len(train_data), 3000))
test_text = np.zeros((len(test_data), 3000))
all_phrases = list(train_data) + list(test_data)
for order in (1, 2, 3):
    group_columns = slice(1000 * (order - 1), 1000 * order)
    vec = Ngram(1000, order)
    vec.fit(all_phrases)
    train_text[:, group_columns] = vec.transform(train_data)
    test_text[:, group_columns] = vec.transform(test_data)

In [6]:
import random
# Hold out a random validation split from the training feature matrix.
VAL_SIZE = 16060   # number of rows moved to the validation set
SPLIT_SEED = 42    # fixed seed so the split is identical on every run

# This cell overwrites train_text, so running it twice would carve a second
# validation set out of the already-reduced matrix. Fail loudly instead.
if train_text.shape[0] != len(train_data):
    raise RuntimeError(
        "train_text no longer has one row per training phrase; "
        "re-run the feature-extraction cell before splitting again"
    )

# `mask` is a list of row positions (not a boolean mask) chosen without replacement.
mask = random.Random(SPLIT_SEED).sample(range(len(train_data)), VAL_SIZE)
val_text = train_text[mask]
train_text = np.delete(train_text, mask, axis=0)
y = np.array(train['Sentiment'])
y_val = y[mask]
y_train = np.delete(y, mask, axis=0)

## Softmax (cross-entropy loss function)

In [7]:
from softmax import Softmax
# Grid search over learning rate x feature subset for the Softmax classifier.
results = {}                 # (learning_rate, feature_set) -> (train accuracy, validation accuracy)
best_val = -1
best_softmax = None
best_softmax_config = None   # (learning_rate, feature_set) that produced best_softmax
learning_rates = [1e-1, 3e-2, 1e-2, 3e-3, 1e-3]
features = [0,1,2,3]         # 0 = use all three n-gram groups; k = drop the k-gram group


for i in features:
    if i == 0:
        # No up-front copy needed: np.hstack below already builds new arrays.
        X_train, X_val = train_text, val_text
    else:
        dropped_columns = list(range(1000*(i-1), 1000*i))
        X_train = np.delete(train_text, dropped_columns, axis=1)
        X_val = np.delete(val_text, dropped_columns, axis=1)

    # Append a column of ones (bias term).
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    for lr in learning_rates:
        # NOTE(review): no random seed is set here; if Softmax.train samples
        # minibatches randomly, accuracies will vary between runs - confirm.
        model = Softmax()
        model.train(X_train, y_train, learning_rate = lr, num_iters = 4000)

        y_pred_train = model.predict(X_train)
        acc_train = np.mean(y_pred_train == y_train)

        y_pred_val = model.predict(X_val)
        acc_val = np.mean(y_pred_val == y_val)
        results[(lr,i)] = (acc_train, acc_val)

        # Strict '>' keeps the first configuration that reaches the best score.
        if acc_val > best_val:
            best_val = acc_val
            best_softmax = model
            best_softmax_config = (lr, i)

iteration 0 / 4000: loss 1.608666
iteration 100 / 4000: loss 1.271652
iteration 200 / 4000: loss 1.166203
iteration 300 / 4000: loss 1.237849
iteration 400 / 4000: loss 1.193581
iteration 500 / 4000: loss 1.241890
iteration 600 / 4000: loss 1.151648
iteration 700 / 4000: loss 1.185860
iteration 800 / 4000: loss 1.299731
iteration 900 / 4000: loss 1.239409
iteration 1000 / 4000: loss 1.167822
iteration 1100 / 4000: loss 1.157637
iteration 1200 / 4000: loss 1.092439
iteration 1300 / 4000: loss 1.222840
iteration 1400 / 4000: loss 1.213583
iteration 1500 / 4000: loss 1.208075
iteration 1600 / 4000: loss 1.166120
iteration 1700 / 4000: loss 1.167774
iteration 1800 / 4000: loss 1.125052
iteration 1900 / 4000: loss 1.188323
iteration 2000 / 4000: loss 1.205866
iteration 2100 / 4000: loss 1.214412
iteration 2200 / 4000: loss 1.082307
iteration 2300 / 4000: loss 1.231721
iteration 2400 / 4000: loss 1.181714
iteration 2500 / 4000: loss 1.128529
iteration 2600 / 4000: loss 1.150695
iteration 270

iteration 2400 / 4000: loss 1.240170
iteration 2500 / 4000: loss 1.284151
iteration 2600 / 4000: loss 1.323291
iteration 2700 / 4000: loss 1.275909
iteration 2800 / 4000: loss 1.245520
iteration 2900 / 4000: loss 1.322413
iteration 3000 / 4000: loss 1.316556
iteration 3100 / 4000: loss 1.302858
iteration 3200 / 4000: loss 1.256686
iteration 3300 / 4000: loss 1.310374
iteration 3400 / 4000: loss 1.327240
iteration 3500 / 4000: loss 1.304642
iteration 3600 / 4000: loss 1.209307
iteration 3700 / 4000: loss 1.243529
iteration 3800 / 4000: loss 1.190704
iteration 3900 / 4000: loss 1.204877
iteration 0 / 4000: loss 1.610005
iteration 100 / 4000: loss 1.365530
iteration 200 / 4000: loss 1.328890
iteration 300 / 4000: loss 1.249497
iteration 400 / 4000: loss 1.250387
iteration 500 / 4000: loss 1.287183
iteration 600 / 4000: loss 1.282721
iteration 700 / 4000: loss 1.303103
iteration 800 / 4000: loss 1.240164
iteration 900 / 4000: loss 1.259386
iteration 1000 / 4000: loss 1.337995
iteration 110

iteration 800 / 4000: loss 1.196317
iteration 900 / 4000: loss 1.247164
iteration 1000 / 4000: loss 1.240033
iteration 1100 / 4000: loss 1.213128
iteration 1200 / 4000: loss 1.298701
iteration 1300 / 4000: loss 1.288392
iteration 1400 / 4000: loss 1.259814
iteration 1500 / 4000: loss 1.222061
iteration 1600 / 4000: loss 1.185449
iteration 1700 / 4000: loss 1.212591
iteration 1800 / 4000: loss 1.216536
iteration 1900 / 4000: loss 1.279856
iteration 2000 / 4000: loss 1.168647
iteration 2100 / 4000: loss 1.210583
iteration 2200 / 4000: loss 1.203901
iteration 2300 / 4000: loss 1.200515
iteration 2400 / 4000: loss 1.243652
iteration 2500 / 4000: loss 1.201535
iteration 2600 / 4000: loss 1.266821
iteration 2700 / 4000: loss 1.159557
iteration 2800 / 4000: loss 1.215411
iteration 2900 / 4000: loss 1.164414
iteration 3000 / 4000: loss 1.254312
iteration 3100 / 4000: loss 1.256467
iteration 3200 / 4000: loss 1.267816
iteration 3300 / 4000: loss 1.143886
iteration 3400 / 4000: loss 1.189507
ite

iteration 3200 / 4000: loss 1.203861
iteration 3300 / 4000: loss 1.185988
iteration 3400 / 4000: loss 1.158934
iteration 3500 / 4000: loss 1.237348
iteration 3600 / 4000: loss 1.159091
iteration 3700 / 4000: loss 1.182445
iteration 3800 / 4000: loss 1.204650
iteration 3900 / 4000: loss 1.169337
iteration 0 / 4000: loss 1.609801
iteration 100 / 4000: loss 1.490317
iteration 200 / 4000: loss 1.401683
iteration 300 / 4000: loss 1.405342
iteration 400 / 4000: loss 1.347737
iteration 500 / 4000: loss 1.334167
iteration 600 / 4000: loss 1.367468
iteration 700 / 4000: loss 1.267551
iteration 800 / 4000: loss 1.263371
iteration 900 / 4000: loss 1.205979
iteration 1000 / 4000: loss 1.340742
iteration 1100 / 4000: loss 1.231666
iteration 1200 / 4000: loss 1.265848
iteration 1300 / 4000: loss 1.248696
iteration 1400 / 4000: loss 1.325587
iteration 1500 / 4000: loss 1.233899
iteration 1600 / 4000: loss 1.272338
iteration 1700 / 4000: loss 1.211354
iteration 1800 / 4000: loss 1.260487
iteration 190

In [9]:
import pandas as pd
# Validation accuracy per (feature subset, learning rate) for the Softmax runs.
softmax_df = pd.DataFrame(np.zeros((4,5)),index = ['N1,N2,N3','N2,N3','N1,N3','N1,N2'],columns = learning_rates)
# Feature-set code used in `results` -> row label (which n-gram groups were kept).
feature_dict = {
    0:'N1,N2,N3',
    1:'N2,N3',
    2:'N1,N3',
    3:'N1,N2'
}
for lr, gram in sorted(results):
    train_accuracy, val_accuracy = results[(lr, gram)]
    # Single .loc assignment: chained `df[col][row] = value` may write to a
    # temporary copy and leave the frame unchanged.
    softmax_df.loc[feature_dict[gram], lr] = val_accuracy

print('best validation accuracy achieved during validation: %f' % best_val)

best validation accuracy achieved during validation: 0.546887


In [10]:
# Display the Softmax validation-accuracy grid (rows: feature subsets, columns: learning rates).
softmax_df

Unnamed: 0,0.1,0.03,0.01,0.003,0.001
"N1,N2,N3",0.546887,0.53269,0.522354,0.511395,0.510212
"N2,N3",0.519738,0.512267,0.510212,0.510212,0.510212
"N1,N3",0.54589,0.530262,0.520486,0.511083,0.510212
"N1,N2",0.545455,0.532379,0.52279,0.511208,0.510212


In [11]:
# Rebuild the test matrix with the same columns the best Softmax model was
# trained on. `results` still holds the Softmax grid at this point; max()
# returns the first configuration reaching the top validation accuracy, which
# is the same one the search loop stored in best_softmax.
_, best_feature_set = max(results, key=lambda config: results[config][1])
if best_feature_set == 0:
    X_test = test_text
else:
    # Feature set k means the k-gram column group was dropped during training.
    X_test = np.delete(
        test_text,
        list(range(1000*(best_feature_set-1), 1000*best_feature_set)),
        axis=1,
    )
# Append the bias column, exactly as done for the training matrix.
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
test_predictions = best_softmax.predict(X_test)

In [12]:
# Assemble the submission table and write it out without the index column.
submission_columns = {}
submission_columns['PhraseId'] = test['PhraseId']
submission_columns['Sentiment'] = test_predictions
output = pd.DataFrame(submission_columns)

output.to_csv('softmax_submission.csv', index=False)

上传至 Kaggle 后，得分为 0.55050

## SVM (hinge loss function)

In [13]:
from svm import SVM
# Grid search over learning rate x feature subset for the SVM classifier.
results = {}             # (learning_rate, feature_set) -> (train accuracy, validation accuracy)
best_val = -1
best_svm = None
best_svm_config = None   # (learning_rate, feature_set) that produced best_svm
learning_rates = [1e-1, 3e-2, 1e-2, 3e-3, 1e-3]
features = [0,1,2,3]     # 0 = use all three n-gram groups; k = drop the k-gram group


for i in features:
    if i == 0:
        # No up-front copy needed: np.hstack below already builds new arrays.
        X_train, X_val = train_text, val_text
    else:
        dropped_columns = list(range(1000*(i-1), 1000*i))
        X_train = np.delete(train_text, dropped_columns, axis=1)
        X_val = np.delete(val_text, dropped_columns, axis=1)

    # Append a column of ones (bias term).
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    for lr in learning_rates:
        # NOTE(review): no random seed is set here; if SVM.train samples
        # minibatches randomly, accuracies will vary between runs - confirm.
        model = SVM()

        model.train(X_train, y_train, learning_rate = lr, num_iters = 4000)

        y_pred_train = model.predict(X_train)
        acc_train = np.mean(y_pred_train == y_train)

        y_pred_val = model.predict(X_val)
        acc_val = np.mean(y_pred_val == y_val)
        results[(lr, i)] = (acc_train, acc_val)

        # Strict '>' keeps the first configuration that reaches the best score.
        if acc_val > best_val:
            best_val = acc_val
            best_svm = model
            best_svm_config = (lr, i)

iteration 0 / 4000: loss 4.003538
iteration 100 / 4000: loss 2.111532
iteration 200 / 4000: loss 1.938536
iteration 300 / 4000: loss 1.876652
iteration 400 / 4000: loss 1.649497
iteration 500 / 4000: loss 1.846131
iteration 600 / 4000: loss 1.688485
iteration 700 / 4000: loss 1.648000
iteration 800 / 4000: loss 1.756324
iteration 900 / 4000: loss 1.853997
iteration 1000 / 4000: loss 1.789751
iteration 1100 / 4000: loss 1.668364
iteration 1200 / 4000: loss 1.586392
iteration 1300 / 4000: loss 1.892216
iteration 1400 / 4000: loss 1.684077
iteration 1500 / 4000: loss 1.649654
iteration 1600 / 4000: loss 1.729871
iteration 1700 / 4000: loss 1.459974
iteration 1800 / 4000: loss 1.643731
iteration 1900 / 4000: loss 1.866728
iteration 2000 / 4000: loss 1.686098
iteration 2100 / 4000: loss 1.659622
iteration 2200 / 4000: loss 1.632813
iteration 2300 / 4000: loss 1.791769
iteration 2400 / 4000: loss 1.708221
iteration 2500 / 4000: loss 1.448448
iteration 2600 / 4000: loss 1.493874
iteration 270

iteration 2400 / 4000: loss 1.859196
iteration 2500 / 4000: loss 2.215275
iteration 2600 / 4000: loss 2.077889
iteration 2700 / 4000: loss 2.003380
iteration 2800 / 4000: loss 1.917269
iteration 2900 / 4000: loss 1.984747
iteration 3000 / 4000: loss 1.592428
iteration 3100 / 4000: loss 1.927647
iteration 3200 / 4000: loss 1.942808
iteration 3300 / 4000: loss 1.910688
iteration 3400 / 4000: loss 1.893014
iteration 3500 / 4000: loss 1.630606
iteration 3600 / 4000: loss 1.614956
iteration 3700 / 4000: loss 2.196311
iteration 3800 / 4000: loss 2.169881
iteration 3900 / 4000: loss 1.853733
iteration 0 / 4000: loss 3.999825
iteration 100 / 4000: loss 1.770160
iteration 200 / 4000: loss 2.045438
iteration 300 / 4000: loss 1.696206
iteration 400 / 4000: loss 1.991379
iteration 500 / 4000: loss 1.854892
iteration 600 / 4000: loss 2.017542
iteration 700 / 4000: loss 2.250896
iteration 800 / 4000: loss 2.081184
iteration 900 / 4000: loss 1.908149
iteration 1000 / 4000: loss 1.976040
iteration 110

iteration 800 / 4000: loss 2.128423
iteration 900 / 4000: loss 1.713053
iteration 1000 / 4000: loss 1.796730
iteration 1100 / 4000: loss 1.869494
iteration 1200 / 4000: loss 1.542984
iteration 1300 / 4000: loss 1.741992
iteration 1400 / 4000: loss 1.752513
iteration 1500 / 4000: loss 1.749299
iteration 1600 / 4000: loss 1.694660
iteration 1700 / 4000: loss 2.012653
iteration 1800 / 4000: loss 1.845588
iteration 1900 / 4000: loss 2.028444
iteration 2000 / 4000: loss 1.800285
iteration 2100 / 4000: loss 2.108757
iteration 2200 / 4000: loss 1.672702
iteration 2300 / 4000: loss 1.715146
iteration 2400 / 4000: loss 2.008560
iteration 2500 / 4000: loss 1.636850
iteration 2600 / 4000: loss 1.598573
iteration 2700 / 4000: loss 1.573128
iteration 2800 / 4000: loss 1.659121
iteration 2900 / 4000: loss 1.533163
iteration 3000 / 4000: loss 1.810785
iteration 3100 / 4000: loss 1.753007
iteration 3200 / 4000: loss 1.827065
iteration 3300 / 4000: loss 1.672335
iteration 3400 / 4000: loss 1.825791
ite

iteration 3200 / 4000: loss 2.084035
iteration 3300 / 4000: loss 1.622719
iteration 3400 / 4000: loss 1.508672
iteration 3500 / 4000: loss 1.628266
iteration 3600 / 4000: loss 1.798742
iteration 3700 / 4000: loss 2.025379
iteration 3800 / 4000: loss 1.691906
iteration 3900 / 4000: loss 1.707239
iteration 0 / 4000: loss 4.002244
iteration 100 / 4000: loss 2.129398
iteration 200 / 4000: loss 2.088004
iteration 300 / 4000: loss 2.123487
iteration 400 / 4000: loss 2.078074
iteration 500 / 4000: loss 1.865843
iteration 600 / 4000: loss 1.737386
iteration 700 / 4000: loss 2.060316
iteration 800 / 4000: loss 2.038479
iteration 900 / 4000: loss 2.032590
iteration 1000 / 4000: loss 1.945288
iteration 1100 / 4000: loss 2.131102
iteration 1200 / 4000: loss 1.831093
iteration 1300 / 4000: loss 1.994791
iteration 1400 / 4000: loss 2.086016
iteration 1500 / 4000: loss 2.167119
iteration 1600 / 4000: loss 1.798636
iteration 1700 / 4000: loss 1.923999
iteration 1800 / 4000: loss 1.891363
iteration 190

In [14]:
# Validation accuracy per (feature subset, learning rate) for the SVM runs.
svm_df = pd.DataFrame(np.zeros((4,5)),index = ['N1,N2,N3','N2,N3','N1,N3','N1,N2'],columns = learning_rates)
# Feature-set code used in `results` -> row label (which n-gram groups were kept).
feature_dict = {
    0:'N1,N2,N3',
    1:'N2,N3',
    2:'N1,N3',
    3:'N1,N2'
}
for lr, gram in sorted(results):
    train_accuracy, val_accuracy = results[(lr, gram)]
    # Single .loc assignment: chained `df[col][row] = value` may write to a
    # temporary copy and leave the frame unchanged.
    svm_df.loc[feature_dict[gram], lr] = val_accuracy

print('best validation accuracy achieved during validation: %f' % best_val)

best validation accuracy achieved during validation: 0.567746


In [15]:
# Display the SVM validation-accuracy grid (rows: feature subsets, columns: learning rates).
svm_df

Unnamed: 0,0.1,0.03,0.01,0.003,0.001
"N1,N2,N3",0.567746,0.54396,0.526588,0.510523,0.510212
"N2,N3",0.52528,0.514633,0.510149,0.510212,0.510212
"N1,N3",0.563823,0.541096,0.523412,0.510212,0.510212
"N1,N2",0.565131,0.541594,0.526961,0.510274,0.510212


In [16]:
# Rebuild the test matrix with the same columns the best SVM model was trained
# on. `results` holds the SVM grid at this point; max() returns the first
# configuration reaching the top validation accuracy, which is the same one
# the search loop stored in best_svm.
_, best_feature_set = max(results, key=lambda config: results[config][1])
if best_feature_set == 0:
    X_test = test_text
else:
    # Feature set k means the k-gram column group was dropped during training.
    X_test = np.delete(
        test_text,
        list(range(1000*(best_feature_set-1), 1000*best_feature_set)),
        axis=1,
    )
# Append the bias column, exactly as done for the training matrix.
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
test_predictions = best_svm.predict(X_test)

In [17]:
# Assemble the submission table and write it out without the index column.
submission_columns = {}
submission_columns['PhraseId'] = test['PhraseId']
submission_columns['Sentiment'] = test_predictions
output = pd.DataFrame(submission_columns)

output.to_csv('svm_submission.csv', index=False)

上传至 Kaggle 后，得分为 0.56533