In [13]:
from pathlib import Path
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score

#import our_metrics

TRAIN_FILE = Path("data/train/spanish_combine_train.txt")
TEST_FILE = Path("data/test/spanish_combine_test.txt")
#TEST_FILE = Path("data/test.tsv")

LABELS = [0,1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19]


In [14]:
def load_data_file(data_file):
    print("Loading from {} ...".format(data_file.name), end="")
    
    data = {}
    data["sentence"] = []
    data["label"] = []
    
    with open(data_file,"r",encoding="utf8") as f:
        lines=f.read().split('\n')
        for id in range(len(lines) -1):
            sent = lines[id].split('\t')[0]
            lab = lines[id].split('\t')[1]
            data["sentence"].append(sent)
            data["label"].append(lab)
    return data["sentence"], data["label"]

In [15]:
train_X,train_y = load_data_file(TRAIN_FILE)
test_X,test_y = load_data_file(TEST_FILE)
print(len(test_X))

Loading from spanish_combine_train.txt ...Loading from spanish_combine_test.txt ...1000


In [16]:
nb_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("bayes", ComplementNB()),],
    )

logistic_pipeline = Pipeline(
        steps = [("ngram",CountVectorizer()) , ("logistic", LogisticRegression()),],
    )


# Forest_pipeline = Pipeline(
#         steps = [("ngram",CountVectorizer()) , ("Forest", RandomForestClassifier(n_estimators=200)),],
#     )


In [17]:
for name, pipeline in (
    ["NB",nb_pipeline,],
    ["LR",logistic_pipeline]
    ):
    pipeline.fit(train_X,train_y)
    train_pred = pipeline.predict(train_X)
    test_pred = pipeline.predict(test_X)
    print(name,"\n","=" * len(name))
    
    for split, y, pred in [
        ("train", train_y, train_pred),
        ("test", test_y, test_pred),
    ]:
        print("For {} data".format(split))
        print(classification_report(y,pred))
        print()
    
    with open('data/test/spanish_result_'+str(name)+'.prediction',mode='wt',encoding='utf-8') as myfile:
        myfile.write('\n'.join([str(x) for x in test_pred]))

NB 
 ==
For train data
              precision    recall  f1-score   support

           0       0.71      0.90      0.80      3965
           1       0.87      0.81      0.84      2635
          10       0.93      0.80      0.86       572
          11       0.94      0.68      0.79       627
          12       0.94      0.66      0.78       518
          13       0.95      0.79      0.86       514
          14       0.95      0.65      0.77       370
          15       0.90      0.76      0.83       609
          16       0.91      0.87      0.89       455
          17       0.92      0.64      0.76       385
          18       0.95      0.77      0.85       442
           2       0.80      0.94      0.87      2312
           3       0.90      0.69      0.78      1023
           4       0.90      0.78      0.83      1118
           5       0.91      0.72      0.81       772
           6       0.88      0.89      0.89       652
           7       0.94      0.82      0.88       793
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR 
 ==
For train data
              precision    recall  f1-score   support

           0       0.66      0.96      0.78      3965
           1       0.83      0.87      0.85      2635
          10       0.96      0.76      0.85       572
          11       0.97      0.63      0.76       627
          12       0.98      0.61      0.75       518
          13       0.98      0.74      0.84       514
          14       0.98      0.59      0.73       370
          15       0.94      0.76      0.84       609
          16       0.96      0.84      0.90       455
          17       0.98      0.52      0.68       385
          18       0.99      0.76      0.86       442
           2       0.86      0.95      0.90      2312
           3       0.92      0.69      0.79      1023
           4       0.91      0.79      0.85      1118
           5       0.91      0.71      0.80       772
           6       0.95      0.87      0.91       652
           7       0.96      0.82      0.89       793
    

In [18]:
!python scorer_semeval18.py data/test/spanish_test.labels data/test/spanish_result_NB.prediction
!python scorer_semeval18.py data/test/spanish_test.labels data/test/spanish_result_LR.prediction


Macro F-Score (official): 12.763
-----
Micro F-Score: 26.6
Precision: 26.6
Recall: 26.6
Macro F-Score (official): 12.548
-----
Micro F-Score: 30.5
Precision: 30.5
Recall: 30.5
