In [19]:
from pathlib import Path
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score

#import our_metrics

TRAIN_FILE = Path("data/train/spanish_with_extra_english.txt")
TEST_FILE = Path("data/test/spanish_combine_test.txt")
#TEST_FILE = Path("data/test.tsv")

LABELS = [0,1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19]


In [20]:
def load_data_file(data_file):
    print("Loading from {} ...".format(data_file.name), end="")
    
    data = {}
    data["sentence"] = []
    data["label"] = []
    
    with open(data_file,"r",encoding="utf8") as f:
        lines=f.read().split('\n')
        for id in range(len(lines) -1):
            sent = lines[id].split('\t')[0]
            lab = lines[id].split('\t')[1]
            data["sentence"].append(sent)
            data["label"].append(lab)
    return data["sentence"], data["label"]

In [21]:
train_X,train_y = load_data_file(TRAIN_FILE)
test_X,test_y = load_data_file(TEST_FILE)
print(len(test_X))

Loading from spanish_with_extra_english.txt ...Loading from spanish_combine_test.txt ...1000


In [22]:
nb_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("bayes", ComplementNB()),],
    )

logistic_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("logistic", LogisticRegression()),],
    )


# Forest_pipeline = Pipeline(
#         steps = [("ngram",CountVectorizer()) , ("Forest", RandomForestClassifier(n_estimators=200)),],
#     )


In [23]:
for name, pipeline in (
    ["NB",nb_pipeline,],
   
    ):
    pipeline.fit(train_X,train_y)
    train_pred = pipeline.predict(train_X)
    test_pred = pipeline.predict(test_X)
    print(name,"\n","=" * len(name))
    
    for split, y, pred in [
        ("train", train_y, train_pred),
        ("test", test_y, test_pred),
    ]:
        print("For {} data".format(split))
        print(classification_report(y,pred))
        print()
    
    with open('data/test/spanish_combined_result_'+str(name)+'.prediction',mode='wt',encoding='utf-8') as myfile:
        myfile.write('\n'.join([str(x) for x in test_pred]))

NB 
 ==
For train data
              precision    recall  f1-score   support

           0       0.64      0.84      0.73     25523
           1       0.79      0.68      0.73     12744
          10       0.81      0.68      0.74      4617
          11       0.85      0.55      0.67      3972
          12       0.91      0.54      0.68      2978
          13       0.94      0.60      0.73      2738
          14       0.71      0.75      0.73       370
          15       0.72      0.69      0.70      5252
          16       0.55      0.90      0.68       455
          17       0.74      0.77      0.75       385
          18       0.92      0.61      0.74      2901
           2       0.66      0.88      0.76     12087
           3       0.85      0.52      0.65      6513
           4       0.88      0.58      0.70      5048
           5       0.89      0.57      0.69      3618
           6       0.53      0.94      0.67       652
           7       0.93      0.62      0.74      3476
    

In [25]:
!python scorer_semeval18.py data/test/spanish_test.labels data/test/spanish_combined_result_NB.prediction
#!python scorer_semeval18.py data/test/spanish_test.labels data/test/spanish_result_LR.prediction


Macro F-Score (official): 12.242
-----
Micro F-Score: 22.8
Precision: 22.8
Recall: 22.8
