In [1]:
from pathlib import Path
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score

#import our_metrics

TRAIN_FILE = Path("data/train/english_combine_train.txt")
TEST_FILE = Path("data/test/english_combine_test.txt")
#TEST_FILE = Path("data/test.tsv")

LABELS = [0,1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19]


In [2]:
def load_data_file(data_file):
    print("Loading from {} ...".format(data_file.name), end="")
    
    data = {}
    data["sentence"] = []
    data["label"] = []
    
    with open(data_file,"r",encoding="utf8") as f:
        lines=f.read().split('\n')
        for id in range(len(lines) -1):
            sent = lines[id].split('\t')[0]
            lab = lines[id].split('\t')[1]
            data["sentence"].append(sent)
            data["label"].append(lab)
    return data["sentence"], data["label"]

In [3]:
train_X,train_y = load_data_file(TRAIN_FILE)
test_X,test_y = load_data_file(TEST_FILE)
print(len(test_X))

Loading from english_combine_train.txt ...Loading from english_combine_test.txt ...10000


In [4]:
nb_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("bayes", ComplementNB()),],
    )

logistic_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("logistic", LogisticRegression()),],
    )

SVM_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("SVM", SVC()),],
    )

Forest_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("Forest", RandomForestClassifier(n_estimators=200)),],
    )


In [5]:
for name, pipeline in (
    ["NB",nb_pipeline,],
    ["LR",logistic_pipeline,],
    #["SVM",SVM_pipeline,],
    ["RandomForest",Forest_pipeline,],
    ):
    pipeline.fit(train_X,train_y)
    train_pred = pipeline.predict(train_X)
    test_pred = pipeline.predict(test_X)
    print(name,"\n","=" * len(name))
    
    for split, y, pred in [
        ("train", train_y, train_pred),
        ("test", test_y, test_pred),
    ]:
        print("For {} data".format(split))
        print(classification_report(y,pred))
        print()
    
    with open('data/test/english_result_'+str(name)+'.prediction',mode='wt',encoding='utf-8') as myfile:
        myfile.write('\n'.join([str(x) for x in test_pred]))

NB 
 ==
For train data
              precision    recall  f1-score   support

           0       0.64      0.82      0.72     19378
           1       0.77      0.66      0.71      9108
          10       0.90      0.76      0.82      2659
          11       0.66      0.85      0.74      3106
          12       0.58      0.79      0.67      2466
          13       0.91      0.53      0.67      2238
          14       0.93      0.58      0.72      2390
          15       0.89      0.59      0.71      2238
          16       0.93      0.58      0.72      2224
          17       0.64      0.91      0.75      2538
          18       0.83      0.77      0.80      3251
          19       0.93      0.57      0.71      2023
           2       0.61      0.86      0.71      8887
           3       0.83      0.50      0.63      4978
           4       0.71      0.84      0.77      5504
           5       0.89      0.53      0.66      3621
           6       0.85      0.64      0.73      3653
    

In [6]:
!python scorer_semeval18.py data/test/english_test.labels data/test/english_result_NB.prediction
!python scorer_semeval18.py data/test/english_test.labels data/test/english_result_LR.prediction
!python scorer_semeval18.py data/test/english_test.labels data/test/english_result_RandomForest.prediction

Macro F-Score (official): 20.753
-----
Micro F-Score: 31.15
Precision: 31.15
Recall: 31.15
