In [2]:
import pandas as pd
import os
import io
import numpy as np
import numpy as np
import re
import time
import pandas as pd
import string
from collections import Counter, defaultdict
from sklearn import metrics
import xml.etree.ElementTree as ET
from sklearn.svm import LinearSVC
import nltk
from nltk.stem.snowball import FrenchStemmer 
from nltk.corpus import stopwords
from sklearn.calibration import CalibratedClassifierCV
from unicodedata import normalize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textblob import TextBlob
import pickle

In [9]:
def getvalueofnode(node):
    """ return node text or None """
    return node.text if node is not None else None

def read_w2v(language='english'):
    if(language == 'french'):
        path = 'wiki.multi.fr.vec'
    elif(language == 'english'):
        path = 'wiki.multi.en.vec'
    elif(language == 'german'):
        path = 'wiki.multi.de.vec'
    t0 = time.time()
    w2v = {}
    count = 0
    
    with open(path, "r", encoding="utf8") as lines:
        for line in lines:
            lineArr = line.split()
            if(count!=0):
                x = []
                for value in lineArr[len(lineArr)-300:]:
                        x.append(float(value))
                w2v[' '.join(lineArr[0:len(lineArr)-300])]=  np.array(x)
            count+=1
    print(count)
    print(time.time()-t0)
    return w2v

def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

negation = {'arent','isnt','wasnt','werent','cant','couldnt','mustnt','shouldnt','wont','wouldnt','didnt','doesnt','dont','hasnt','havent','hadnt'}

def tokenize(doc):    
    doc = doc.lower()
    token_list = doc.split()
    tokenized_list=[]
    for token in token_list:
        new_token=''
        for i in range(0,len(token)):
            if(token[i] not in string.punctuation):
                new_token+=token[i]
        tokenized_list.append(new_token)
    return tokenized_list

def cleaningTextTokenizing(line, lang = 'english'):
    if (lang == 'english'):
        re_print = re.compile('[^%s]' % re.escape(string.printable))
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        line = tokenize(line)
        line = [re_print.sub('', w) for w in line]
        line = [word for word in line if word.isalpha()]
    else:
        line = tokenize(line)
        line = [word for word in line if word.isalpha()]
    return line

# def data_parse(parsed_xml_data, language):
#     X_data = []
#     y_data = []
# #     X_refer = []
#     for node in parsed_xml_data.getroot():
#         try:
#             summary = node.find('summary')
#             rating = node.find('rating')
#             text = node.find('text')
#             tokens_summary = getvalueofnode(summary)
#             tokens_text = getvalueofnode(text)
#             if(tokens_summary == None and tokens_text == None ):
#                 tokens = []
#             elif(tokens_text == None):
#                 tokens = cleaningTextTokenizing(tokens_summary,language)
#             elif(tokens_summary == None):
#                 tokens = cleaningTextTokenizing(tokens_text,language)
#             else:
#                 tokens = [*cleaningTextTokenizing(tokens_summary,language), *cleaningTextTokenizing(tokens_text,language)]

#             lemmaStr = get_lemmatized_text(tokens)
#             X_data.append(lemmaStr)
#     #     X_refer.append(tokens_summary + " " + tokens_text)
#             if(float(getvalueofnode(rating))>3):
#                 y_data.append('positive')
#             else:
#                 y_data.append('negative')
#         except:
#             print('translation error')
#     return X_data, y_data

# Stop word removal of tokenized input data
def get_lemmatized_text(tokenized_review):
    lemmatizer = WordNetLemmatizer()
    lemmatizedStr = []
    for word in tokenized_review:
        lemmatizedStr.append(lemmatizer.lemmatize(word))
    return lemmatizedStr

def data_parse(parsed_xml_data, language):
    X_data = []
    y_data = []
#     X_refer = []
    for node in parsed_xml_data.getroot():
        
        summary = node.find('summary')
        rating = node.find('rating')
        text = node.find('text')
        tokens_summary = getvalueofnode(summary)
        tokens_text = getvalueofnode(text)
        if(tokens_summary == None and tokens_text == None ):
            tokens = []
        elif(tokens_text == None):
            tokens = cleaningTextTokenizing(tokens_summary,language)
        elif(tokens_summary == None):
            tokens = cleaningTextTokenizing(tokens_text,language)
        else:
            tokens = [*cleaningTextTokenizing(tokens_summary,language), *cleaningTextTokenizing(tokens_text,language)]
        
        lemmaStr = get_lemmatized_text(tokens)
        X_data.append(lemmaStr)
#     X_refer.append(tokens_summary + " " + tokens_text)
        if(float(getvalueofnode(rating))>3):
            y_data.append('positive')
        else:
            y_data.append('negative')
        
    return X_data, y_data

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x, min_df = 0.0005)
        tfidf.fit(X)
#         print ('Vocab Size' , len(tfidf.vocabulary_))
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self,tfidf.vocabulary_.items()

    def transform(self, X):
        np_ar = np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])
        return np_ar
    
class CountEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X):
        count_vect = CountVectorizer(analyzer=lambda x: x, min_df = 0.0005)
        count_vect.fit(X)
        print ('Vocab Size' , len(count_vect.vocabulary_))
        return self,count_vect.vocabulary_.items()

#     def transform(self, X):
#         doc2vec = []
#         for words in X:
#             vec2mean = []
#             for w in set(words):
#                 if w in self.word2vec:
#                     weighted_word = self.word2vec[w]
#                 else:
#                     weighted_word = np.zeros(self.dim)
#                 vec2mean.append(weighted_word)
#             doc2vec.append(np.mean(vec2mean,axis=0))
#         return np.array(doc2vec)

    def transform(self, X):
        np_ar = np.array([
                np.mean([self.word2vec[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])
        return np_ar


def words_not_w2vec(vocab, w2v):
    new_words = []
    percent_not_words = 0.0
    for word,i in list(vocab):
        if(word not in w2v):
            new_words.append(word)
    percent_not_words = len(new_words) / len(vocab) * 100
    return new_words, percent_not_words

In [10]:
x = ['this','this']
set(x)

{'this'}

In [11]:
x_path = 'cls-acl10-unprocessed/'
# x_path = '/Users/vjstark/Downloads/ADM stuff/cls-acl10-unprocessed/'
#ENGLISH
parsed_xml = ET.parse(x_path+'en/books/train.review')
parsed_xml_test = ET.parse(x_path+'en/books/test.review')
parsed_xml_unlabeled = ET.parse(x_path+'en/books/unlabeled.review')
#FRENCH
parsed_xml_train_fr = ET.parse(x_path+'fr/books/train.review')
parsed_xml_test_fr = ET.parse(x_path+'fr/books/test.review')
parsed_xml_unlabeled_fr = ET.parse(x_path+'fr/books/unlabeled.review')
#GERMAN
parsed_xml_train_de = ET.parse(x_path+'de/books/train.review')
parsed_xml_test_de = ET.parse(x_path+'de/books/test.review')
parsed_xml_unlabeled_de = ET.parse(x_path+'de/books/unlabeled.review')

In [12]:
w2v = read_w2v('english')

200001
53.79565095901489


In [13]:
w2v_fr = read_w2v('french')

200001
54.02349257469177


In [14]:
w2v_de = read_w2v('german')

200001
57.280484676361084


In [15]:
#English
X, y = data_parse(parsed_xml,'english')
X_test, y_test = data_parse(parsed_xml_test,'english')
X_unlabeled, y_unlabeled = data_parse(parsed_xml_unlabeled,'english')

In [16]:
#French
# X_train_fr, y_train_fr = data_parse(parsed_xml_train_fr,'french')
X_test_fr, y_test_fr = data_parse(parsed_xml_test_fr,'french')
X_unlabeled_fr, y_unlabeled_fr = data_parse(parsed_xml_unlabeled_fr, 'french')

In [17]:
#German
# X_train_de, y_train_de = data_parse(parsed_xml_train_de,'german')
X_test_de, y_test_de = data_parse(parsed_xml_test_de,'german')
X_unlabeled_de, y_unlabeled_de = data_parse(parsed_xml_unlabeled_de, 'german')

In [18]:
print(f'EN-ul: {len(X_unlabeled)}, FR-ul: {len(X_unlabeled_fr)}, DE-ul: {len(X_unlabeled_de)}')

EN-ul: 50000, FR-ul: 32870, DE-ul: 165470


In [19]:
#len(X_unlabeled)
len(X_test_fr)
# X_test_fr = pickle.load(open('X_test_fr.pkl', 'rb')) 
# y_test_fr = pickle.load(open('y_test_fr.pkl', 'rb')) 

2000

In [20]:
cpos = 0
cneg = 0
for i in y_test_fr:
    if i == 'positive':
        cpos += 1
    else:
        cneg += 1
        
print(cpos, cneg)

1000 1000


In [21]:
def vectorize_predict(w2v_lang, X_data, y_data,clf):
    t0 = time.time()
    vectorizer_data = CountEmbeddingVectorizer(w2v_lang)
    X ,vocab = vectorizer_data.fit(X_data)
    X_vect_data = vectorizer_data.transform(X_data)
    result = clf.predict(X_vect_data)
    print ('Accuracy Score ', metrics.accuracy_score(y_data, result))
    print ('F1 Score ',metrics.f1_score(y_data, result,average="binary", pos_label="negative"))
    print ('Precision Score ',metrics.precision_score(y_data, result,average="binary", pos_label="negative"))
    print ('Recall Score ',metrics.recall_score(y_data, result,average="binary", pos_label="negative"))
    print ('Confusion matrix ',metrics.confusion_matrix(y_data, result))
    print(time.time()-t0)
    return words_not_w2vec(vocab,w2v_lang)

### En-En, En-Fr, En-De

In [22]:
from sklearn.pipeline import Pipeline

vectorizer = CountEmbeddingVectorizer(w2v)
vect,vocab = vectorizer.fit(X_unlabeled)
X_t = vectorizer.transform(X_unlabeled)
clfLSVC = LinearSVC()
LSVC = CalibratedClassifierCV(clfLSVC)
unknown_words_list_en, percentage = words_not_w2vec(vocab,w2v)
print(len(unknown_words_list_en),percentage)

Vocab Size 10867
175 1.6103800496917273


In [23]:
LSVC.fit(X_t, y_unlabeled)



CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv='warn', method='sigmoid')

In [24]:
unknown_words_test_en = vectorize_predict(w2v, X_test,y_test, LSVC)
len(unknown_words_test_en)

# Vocab Size 6996
# Accuracy Score  0.81882
# F1 Score  0.8223063494242953
# Precision Score  0.8067818790654709
# Recall Score  0.83844
# 31.686428785324097
#60

Vocab Size 20857
Accuracy Score  0.855
F1 Score  0.8570019723865878
Precision Score  0.8453307392996109
Recall Score  0.869
Confusion matrix  [[869 131]
 [159 841]]
1.2595529556274414


2

In [25]:
print(X_test_fr[1])

['super', 'recettes', 'faciles', 'à', 'réaliser', 'recettes', 'appréciées', 'de', 'toute', 'la', 'famille', 'petits', 'et', 'grandsde', 'plus', 'on', 'peut', 'faire', 'son', 'régime', 'en', 'ayant', 'de', 'invités', 'il', 'ny', 'voient', 'que', 'du', 'feupour', 'la', 'vinaigrette', 'il', 'ne', 'faut', 'surtout', 'pa', 'dire', 'quelle', 'est', 'faite', 'avec', 'de', 'lhuile', 'de', 'parafine', 'alors', 'elle', 'est', 'excellente', 'sinonle', 'régime', 'est', 'super', 'efficace', 'il', 'ne', 'fatigue', 'pa', 'du', 'toutjencourage', 'ceux', 'qui', 'ont', 'de', 'kilo', 'en', 'tropà', 'le', 'faireil', 'ne', 'faut', 'pa', 'beaucoup', 'de', 'volonté', 'car', 'on', 'mange', 'toujours', 'à', 'sa', 'faim']


In [26]:
unknown_words_test_fr = vectorize_predict(w2v_fr, X_test_fr,y_test_fr, LSVC)
len(unknown_words_test_fr)
# Vocab Size 6261
# Accuracy Score  0.7598722239123821
# F1 Score  0.7549594858899132
# Precision Score  0.7707276876267748
# Recall Score  0.7398235473075753
# 21.79134178161621
# 534

Vocab Size 22160
Accuracy Score  0.5305
F1 Score  0.12160898035547242
Precision Score  0.9420289855072463
Recall Score  0.065
Confusion matrix  [[ 65 935]
 [  4 996]]
0.8013656139373779


2

In [27]:
unknown_words_test_de = vectorize_predict(w2v_de, X_test_de,y_test_de, LSVC)
len(unknown_words_test_de)

# Vocab Size 7575
# Accuracy Score  0.7439596301444371
# F1 Score  0.7789966771516356
# Precision Score  0.6852286909918508
# Recall Score  0.9024959207107028
# 93.78028178215027
# 100

Vocab Size 32891
Accuracy Score  0.741
F1 Score  0.7825356842989084
Precision Score  0.6743849493487699
Recall Score  0.932
Confusion matrix  [[932  68]
 [450 550]]
1.2705869674682617


2

### Fr-Fr, Fr-En, Fr-De

In [28]:
vectorizer = CountEmbeddingVectorizer(w2v_fr)
vect,vocab = vectorizer.fit(X_unlabeled_fr)
X_t = vectorizer.transform(X_unlabeled_fr)
clfLSVC = LinearSVC()
LSVC = CalibratedClassifierCV(clfLSVC)
unknown_words_list_fr = words_not_w2vec(vocab,w2v_fr)
print(len(unknown_words_list_fr))

Vocab Size 10414
2


In [29]:
LSVC.fit(X_t, y_unlabeled_fr)



CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv='warn', method='sigmoid')

In [30]:
unknown_words_test_en = vectorize_predict(w2v, X_test,y_test, LSVC)
len(unknown_words_test_en)

# Vocab Size 6996
# Accuracy Score  0.51658
# F1 Score  0.5918922113225387
# Precision Score  0.5121103222602039
# Recall Score  0.70112
# 37.17140984535217
# 60

Vocab Size 20857
Accuracy Score  0.6605
F1 Score  0.7421192556019749
Precision Score  0.5982853643600735
Recall Score  0.977
Confusion matrix  [[977  23]
 [656 344]]
1.2004690170288086


2

In [31]:
unknown_words_test_fr = vectorize_predict(w2v_fr, X_test_fr,y_test_fr, LSVC)
len(unknown_words_test_fr)

# Vocab Size 6261
# Accuracy Score  0.52299969577122
# F1 Score  0.6233635206226429
# Precision Score  0.5150035722791141
# Recall Score  0.7894736842105263
# 14.635821104049683
# 534

Vocab Size 22160
Accuracy Score  0.8415
F1 Score  0.8443789887088856
Precision Score  0.8293153326904532
Recall Score  0.86
Confusion matrix  [[860 140]
 [177 823]]
0.8081769943237305


2

In [32]:
unknown_words_test_de = vectorize_predict(w2v_de, X_test_de,y_test_de, LSVC)
len(unknown_words_test_de)

# Vocab Size 7575
# Accuracy Score  0.46508732700791683
# F1 Score  0.46738554854859676
# Precision Score  0.46538604417069107
# Recall Score  0.46940230857557264
# 105.32542681694031
# 100

Vocab Size 32891
Accuracy Score  0.5965
F1 Score  0.7100251527128997
Precision Score  0.554122265844083
Recall Score  0.988
Confusion matrix  [[988  12]
 [795 205]]
1.1001012325286865


2

### De-De, De-En, De-Fr

In [33]:
vectorizer = CountEmbeddingVectorizer(w2v_de)
vect,vocab = vectorizer.fit(X_unlabeled_de)
X_t = vectorizer.transform(X_unlabeled_de)
clfLSVC = LinearSVC()
LSVC = CalibratedClassifierCV(clfLSVC)
unknown_words_list_de = words_not_w2vec(vocab,w2v_de)
print(len(unknown_words_list_de))

Vocab Size 12785
2


In [34]:
LSVC.fit(X_t, y_unlabeled_de)



CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv='warn', method='sigmoid')

In [35]:
unknown_words_test_en = vectorize_predict(w2v, X_test,y_test, LSVC)
len(unknown_words_test_en)

# Vocab Size 6996
# Accuracy Score  0.7691
# F1 Score  0.7481072589618833
# Precision Score  0.8229251668026688
# Recall Score  0.68576
# 35.73703718185425
# 60

Vocab Size 20857
Accuracy Score  0.669
F1 Score  0.5284900284900285
Precision Score  0.9183168316831684
Recall Score  0.371
Confusion matrix  [[371 629]
 [ 33 967]]
1.314728021621704


2

In [36]:
unknown_words_test_fr = vectorize_predict(w2v_fr, X_test_fr,y_test_fr, LSVC)
len(unknown_words_test_fr)

# Vocab Size 6261
# Accuracy Score  0.7205962884088835
# F1 Score  0.7501904036557501
# Precision Score  0.6783412858478036
# Recall Score  0.8390629753574688
# 14.586955070495605
# 534

Vocab Size 22160
Accuracy Score  0.624
F1 Score  0.4413075780089153
Precision Score  0.8583815028901735
Recall Score  0.297
Confusion matrix  [[297 703]
 [ 49 951]]
0.8092331886291504


2

In [37]:
unknown_words_test_de = vectorize_predict(w2v_de, X_test_de,y_test_de, LSVC)
len(unknown_words_test_de)

# Vocab Size 7575
# Accuracy Score  0.81372454221309
# F1 Score  0.8170492114650665
# Precision Score  0.8027221198479159
# Recall Score  0.8318970206079652
# 97.7156879901886
# 100

Vocab Size 32891
Accuracy Score  0.848
F1 Score  0.8472361809045227
Precision Score  0.8515151515151516
Recall Score  0.843
Confusion matrix  [[843 157]
 [147 853]]
1.1001272201538086


2

In [38]:
w2v_fr["arrêter"]

array([-6.62844e-02, -5.95874e-02, -4.77496e-02,  9.80034e-03,
       -1.16180e-01, -6.10507e-02,  3.44817e-03, -5.22143e-04,
       -2.17907e-02,  8.00969e-02, -4.13871e-02,  4.40307e-02,
       -5.94686e-02,  5.27718e-02,  2.43733e-02,  1.48229e-02,
       -6.33436e-02, -4.37056e-02, -1.10108e-01,  6.81111e-02,
       -4.49096e-02,  1.12009e-02, -1.33384e-01, -6.53590e-02,
       -5.25449e-03, -5.62300e-02, -1.68066e-02, -1.21397e-02,
       -3.22654e-04,  4.80870e-02,  3.34291e-03,  1.09416e-01,
       -9.66857e-02, -1.74348e-02, -5.12598e-03, -7.13200e-02,
        4.03671e-02, -6.44001e-02,  3.86578e-02,  2.84616e-02,
        2.31704e-02,  2.75342e-02, -8.73379e-03,  3.39149e-02,
       -7.58863e-03,  5.24873e-02,  2.68174e-02,  3.49733e-02,
       -6.78260e-02, -4.19187e-02, -7.92524e-03, -5.71953e-02,
       -2.19931e-02, -4.17515e-02, -1.00213e-01,  1.85027e-02,
        3.74559e-02,  3.81762e-02, -5.96340e-03, -3.39235e-02,
       -1.20289e-01, -8.86334e-02,  9.12358e-02, -1.908