# Language classifier for single words based on n-grams at a character level

In [None]:
import re
!pip install git+https://github.com/jimmycallin/plainstream.git --quiet
import plainstream
import numpy as np
import random

# Preparation for the training

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Model training

from scipy.sparse import lil_matrix, csr_matrix 
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import  MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import lil_matrix, csr_matrix 

# drive

from google.colab import drive
drive.mount("/content/drive/")

# others

from collections import Counter
import time
import pickle
from pprint import pprint

In [None]:
# setting the directory 

%cd '/content/drive/MyDrive/MACHINE_LEARNING/MID_PROJ_FILES'

/content/drive/MyDrive/MACHINE_LEARNING/MID_PROJ_FILES


# Getting data for training

### Functions to clean the data and build the corpora

In [None]:
def tokenizer(text):

  """Split raw texts into clean, lower-cased tokens.

  Every string in `text` is split on punctuation, quotes, underscores,
  whitespace, digits and the literal '+'. In each fragment, anything from an
  ASCII capital letter up to the end of the following run of word characters
  is removed (a crude way to discard proper names and sentence-initial
  words). What remains is stripped, lower-cased and kept only if non-empty.

  Args:
    text: iterable of strings (e.g. one string per article).

  Returns:
    A flat list of tokens in reading order; an empty list when `text` is
    empty.
  """

  tokens = [ ]

  for chunk in text:
    for fragment in re.split(r"[\'\(\)\-\"!?:.,;«»_“”‘’\s\d+]", chunk):
      # Remove capitalised material, then normalise what is left.
      cleaned = re.sub(r"\s*[A-Z]\w*\s*", " ", fragment).strip().lower()
      if cleaned:   # skip the empty strings produced by consecutive separators
        tokens.append(cleaned)

  return tokens


def start_end_tags(word_list):

  """Wrap every word with a start marker '^' and an end marker '$'.

  Args:
    word_list: iterable of strings.

  Returns:
    A list with one tagged string per input word, in the same order
    (e.g. 'ciao' -> '^ciao$'). Words of any length are handled, including
    one-character words ('a' -> '^a$') and the empty string ('' -> '^$').
  """

  # Concatenating around the whole word avoids indexing word[0] / word[-1],
  # which duplicated the character of one-letter words and failed on ''.
  return ['^' + word + '$' for word in word_list]


def corpus_min_tag(corpus, min, len_corp):

  """Build a tagged, sorted random sample of the distinct words of a corpus.

  Args:
    corpus: iterable of words (duplicates are collapsed).
    min: length threshold; only words strictly longer than `min` are kept.
    len_corp: number of words to draw with random.sample.

  Returns:
    A list of `len_corp` start/end-tagged words in ascending order.
    random.sample raises ValueError when fewer than `len_corp` words
    survive the length filter.
  """

  unique_words = set(corpus)
  tagged_words = start_end_tags([w for w in unique_words if len(w) > min])

  return sorted(random.sample(tagged_words, len_corp))

### Corpora

Wikipedia articles sampled through Plainstream

In [None]:
# For every language the same steps are applied:
#   1. download raw Wikipedia text through plainstream and tokenize it;
#   2. discard tokens containing a character outside the language alphabet;
#   3. keep the word types that occur at least 10 times;
#   4. draw a tagged random sample of words longer than 4 characters.
# The non_*_words collections are sets, so the "not in" tests are hash
# lookups instead of scans of a long list.

# ENGLISH

en_alphabet = set('abcdefghijklmnopqrstuvwxyz')

en_raw = [i for i in plainstream.get_text('en', max_words=500000, tokenize=False)] # 121 articles
en_clean = tokenizer(en_raw)
non_en_words = {word for word in set(en_clean) if not en_alphabet.issuperset(word)}
en_corpus = [word for word in en_clean if word not in non_en_words]  # len 505849

counter =  Counter(en_corpus)
en_filtered = [k for k, c in counter.items() if c >= 10] # words occurring at least 10 times in the whole corpus

EN_corpus = corpus_min_tag(en_filtered, 4, 3500)

# ITALIAN

it_alphabet = set('abcdefghijklmnopqrstuvwxyzàèéìòù')

it_raw = [i for i in plainstream.get_text('it', max_words=500000, tokenize=False)] # 444 articles

it_clean = tokenizer(it_raw)      # tot tokens 519712
non_it_words = {word for word in set(it_clean) if not it_alphabet.issuperset(word)}
# Prefixes used as an extra filter: tokens starting with any of them are dropped.
non_it_start = ('aa', 'ee', 'ii', 'uu', 'é', 'è', 'ì', 'à', 'ò', 'ù')
it_corpus = [word for word in it_clean if word not in non_it_words and not word.startswith(non_it_start)] # len 510058

counter =  Counter(it_corpus)
it_filtered = [k for k, c in counter.items() if c >= 10]

IT_corpus = corpus_min_tag(it_filtered, 4, 3500)

# GERMAN

de_alphabet = set('abcdefghijklmnopqrstuvwxyzäöüß')

de_raw = [i for i in plainstream.get_text('de', max_words=750000, tokenize=False)]

de_clean = tokenizer(de_raw)
non_de_words = {word for word in set(de_clean) if not de_alphabet.issuperset(word)}
de_corpus = [word for word in de_clean if word not in non_de_words]

counter =  Counter(de_corpus)
de_filtered = [k for k, c in counter.items() if c >= 10]

DE_corpus = corpus_min_tag(de_filtered, 4, 2800)  # less tokens but longer words

# FRENCH

fr_alphabet = set('abcçdefghijklmnopqrstuvwxyzâæàèéêëîïôœùûüÿ')

fr_raw = [i for i in plainstream.get_text('fr', max_words=500000, tokenize=False)]

fr_clean = tokenizer(fr_raw)
non_fr_words = {word for word in set(fr_clean) if not fr_alphabet.issuperset(word)}
fr_corpus = [word for word in fr_clean if word not in non_fr_words]

counter =  Counter(fr_corpus)
fr_filtered = [k for k, c in counter.items() if c >= 10]

FR_corpus = corpus_min_tag(fr_filtered, 4, 3500)

# SPANISH

es_alphabet = set('abcdefghijklmñnopqrstuüvwxyz')

es_raw = [i for i in plainstream.get_text('es', max_words=500000, tokenize=False)]

es_clean = tokenizer(es_raw)
# len(es_clean)
non_es_words = {word for word in set(es_clean) if not es_alphabet.issuperset(word)}
es_corpus = [word for word in es_clean if word not in non_es_words]
counter =  Counter(es_corpus)
es_filtered = [k for k, c in counter.items() if c >= 10]

# NOTE(review): corpus_min_tag raises ValueError if fewer than 3500 candidates
# remain; the following cell samples only 3000 Spanish words -- confirm the size.
ES_corpus = corpus_min_tag(es_filtered, 4, 3500)

In [None]:
# Draw the final per-language samples from the frequency-filtered word lists:
# words longer than 4 characters, tagged with '^' and '$'.
# Sample sizes: 3000 for Spanish, 2800 for German, 3500 for the others.
# The draw is random, so re-running this cell yields different corpora.
EN_corpus = corpus_min_tag(en_filtered, 4, 3500)
ES_corpus = corpus_min_tag(es_filtered, 4, 3000)
FR_corpus = corpus_min_tag(fr_filtered, 4, 3500)
DE_corpus = corpus_min_tag(de_filtered, 4, 2800) 
IT_corpus = corpus_min_tag(it_filtered, 4, 3500)


In [None]:
print(len(EN_corpus), len(IT_corpus), len(DE_corpus), len(FR_corpus) ,len(ES_corpus))

3500 3500 2800 3500 3000


In [None]:
# # SAVING DATA INTO FILES TO SAVE TIME

# with open('EN_corpus', "wb") as fp:   #Pickling
#   pickle.dump(EN_corpus, fp)

# with open('IT_corpus', "wb") as fp:   #Pickling
#   pickle.dump(IT_corpus, fp)

# with open('DE_corpus', "wb") as fp:   #Pickling
#   pickle.dump(DE_corpus, fp)

# with open('FR_corpus', "wb") as fp:   #Pickling
#   pickle.dump(FR_corpus, fp)

# with open('ES_corpus', "wb") as fp:   #Pickling
#   pickle.dump(ES_corpus, fp)

In [None]:
# Reload the five tagged corpora from pickle files in the current directory
# (the commented-out cell above shows how they were written).
# Set the right path to the files first, e.g.:
# %cd /...
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.

with open("EN_corpus", "rb") as fp:   # Unpickling
  EN_corpus = pickle.load(fp)

with open("IT_corpus", "rb") as fp:   # Unpickling
  IT_corpus = pickle.load(fp)

with open("DE_corpus", "rb") as fp:   # Unpickling
  DE_corpus = pickle.load(fp)

with open("FR_corpus", "rb") as fp:   # Unpickling
  FR_corpus = pickle.load(fp)

with open("ES_corpus", "rb") as fp:   # Unpickling
  ES_corpus = pickle.load(fp)

# Training and statistics

### Functions to prepare the data for training and to train the models

In [None]:
# BIGRAMS

def get_bigrams_in_words(words):

  """Turn every word into the list of its character bigrams.

  Args:
    words: iterable of strings.

  Returns:
    One list per word, holding the (char, next_char) tuples in order.
    Words shorter than two characters give an empty list.
  """

  return [list(zip(word, word[1:])) for word in words]

# TRIGRAMS

def get_trigrams_in_words(words):

  """Turn every word into the list of its character trigrams.

  Args:
    words: iterable of strings.

  Returns:
    One list per word, holding the (char, next, next_next) tuples in order.
    Words shorter than three characters give an empty list.
  """

  return [list(zip(word, word[1:], word[2:])) for word in words]


# N-GRAMS AS FEATURES

def Ngrams_features(list_word_ngram):

  """Collect the distinct n-grams used by a list of words.

  Args:
    list_word_ngram: list of words, each given as a list of n-grams.

  Returns:
    A list with every distinct n-gram once. The order is that of a Python
    set, i.e. arbitrary.
  """

  distinct_ngrams = set()
  for word_ngrams in list_word_ngram:
    distinct_ngrams.update(word_ngrams)
  return list(distinct_ngrams)


# CREATES CLASS LIST - works if the corpus is ordered

def language_labels(corpus, *labels):

  """Generate one language label per word of a balanced, ordered corpus.

  The corpus is assumed to hold the same number of words for every language,
  laid out language by language (all words of the first label, then all
  words of the second, ...).

  Args:
    corpus: sequence of words; only its length is used.
    *labels: language labels, in the order the languages appear in `corpus`.

  Returns:
    A list in which each label is repeated len(corpus)/len(labels) times
    (rounded down), in the order given.
  """

  words_per_language = int(len(corpus)/len(labels))
  return [label for label in labels for _ in range(words_per_language)]

# COUNT MATRIX - counts features presence for each row

def gen_count_matrix(all_Ngrams, words_Ngrams):  # takes list of n-grams (features) and words as n-grams

  """Build a dense presence matrix of n-grams per word.

  Despite the name, each cell records presence, not a count: it is 1.0 if
  the feature occurs at least once in the word and 0.0 otherwise.

  Args:
    all_Ngrams: list of features (hashable n-grams); column j corresponds to
      all_Ngrams[j].
    words_Ngrams: list of words, each given as a list of n-grams; row i
      corresponds to words_Ngrams[i]. N-grams that are not features are
      ignored.

  Returns:
    A float numpy array of shape (len(words_Ngrams), len(all_Ngrams)).
  """

  # Index the columns once so that every word is visited a single time,
  # instead of scanning every word for every feature. A feature listed more
  # than once keeps all of its columns.
  columns_of = { }
  for column, ngram in enumerate(all_Ngrams):
    columns_of.setdefault(ngram, [ ]).append(column)

  features_matrix = np.zeros((len(words_Ngrams), len(all_Ngrams)))
  for row, word_Ngrams in enumerate(words_Ngrams):
    for ngram in set(word_Ngrams):
      for column in columns_of.get(ngram, ()):
        features_matrix[row, column] = 1

  return features_matrix

# CSR MATRIX - counts features presence for each row

def gen_csr_matrix(all_Ngrams, words_Ngrams): 

  """Build a sparse (CSR) count matrix of n-grams per word.

  Recommended when the number of features is very high.

  Args:
    all_Ngrams: list of features (hashable, mutually orderable n-grams).
      Duplicates are collapsed.
    words_Ngrams: list of words, each given as a list of n-grams.

  Returns:
    A scipy.sparse.csr_matrix with one row per word (words without n-grams
    give an all-zero row, so rows stay aligned with the input) and one column
    per distinct feature. Columns follow the SORTED order of the features,
    not the order of `all_Ngrams`. Each cell holds the number of occurrences
    of the feature in the word.

  Raises:
    KeyError: if a word contains an n-gram that is not in `all_Ngrams`.
  """

  column_of = {ngram: col for col, ngram in enumerate(sorted(set(all_Ngrams)))}

  # Only the non-zero cells are collected, so no dense row is ever built.
  row_idx, col_idx, counts = [ ], [ ], [ ]
  for row, word in enumerate(words_Ngrams):
    for ngram, freq in Counter(word).items():
      row_idx.append(row)
      col_idx.append(column_of[ngram])
      counts.append(freq)

  my_csr_matrix = csr_matrix((counts, (row_idx, col_idx)),
                             shape=(len(words_Ngrams), len(column_of)),
                             dtype=int)

  return my_csr_matrix

def gen_tfidfd_matrix(all_Ngrams, words_Ngrams): 

  """Build a dense tf-idf matrix of n-grams per word.

  tf is the raw number of occurrences of the n-gram in the word and
  idf = log(N / (df + 1)), where N is the number of words and df the number
  of words containing the n-gram. With this formula the idf is negative for
  an n-gram that occurs in every word.

  Args:
    all_Ngrams: list of features (hashable n-grams); columns follow their
      order of first appearance, duplicates are collapsed.
    words_Ngrams: list of words, each given as a list of n-grams.

  Returns:
    A float numpy array of shape (number of words, number of distinct
    features); shape (0, number of distinct features) when there are no words.

  Raises:
    KeyError: if a word contains an n-gram that is not in `all_Ngrams`.
  """

  column_of = {term: col for col, term in enumerate(dict.fromkeys(all_Ngrams))}
  ntexts = len(words_Ngrams)
  if ntexts == 0:
    return np.zeros((0, len(column_of)))

  ###### TF and document frequencies ####
  counts = np.zeros((ntexts, len(column_of)))
  doc_freq = np.zeros(len(column_of))
  for row, word in enumerate(words_Ngrams):
    for term in word:
      counts[row, column_of[term]] += 1
    for term in set(word):   # each word counts once per term
      doc_freq[column_of[term]] += 1

  ## IDF ##
  idf = np.log(ntexts/(doc_freq+1))

  # Weight only the cells that are actually populated; empty cells stay 0.
  tfidf_matrix = np.where(counts != 0, counts*idf, 0.0)

  return tfidf_matrix


def model_trainer_tester(clf, matrix, labels):

  """Train a classifier, cross-validate it and evaluate it on held-out data.

  The data are split 80/20 (shuffled, random_state=42). The classifier is
  fitted on the training part, scored with 5-fold cross validation on that
  same training part and finally evaluated on the test part.

  Args:
    clf: an unfitted scikit-learn classifier.
    matrix: feature matrix, one row per word.
    labels: list of language labels, one per row of `matrix`.

  Returns:
    A tuple (training, score_mean, clf_test_ok_guess):
    - training: the fitted classifier, as returned by clf.fit;
    - score_mean: mean accuracy over the 5 cross-validation folds;
    - clf_test_ok_guess: percentage of correct predictions on the test
      part, rounded to two decimals.
  """

  X_train, X_test, y_train, y_test = train_test_split(matrix, labels, test_size=0.2, random_state=42, shuffle=True) # data partition
  training = clf.fit(X_train, y_train)
  cross_score = cross_val_score(clf, X_train, y_train, cv=5)  # cross validation
  score_mean = cross_score.mean()
  clf_test = clf.predict(X_test)
  wrong_pct = (np.asarray(y_test) != clf_test).sum()*100/X_test.shape[0]
  # Round after the subtraction, otherwise values such as 78.78999999999999
  # are produced.
  clf_test_ok_guess = round(100-wrong_pct, 2)
  return training, score_mean, clf_test_ok_guess

#### Sequence in bigrams matrix

Approach:

1. See how many features there are in common among the languages involved
2. Count frequency of these features
3. For the most frequent common features, vectorize their neighbors so that the resulting vector can be used as a weight applied to the count matrix
4. Create multilingual corpora, convert them into matrix
5. Train and test models


In [None]:
def lang_feat_info(corpus):

  """Describe a corpus in terms of its character bigrams.

  Args:
    corpus: list of words.

  Returns:
    A tuple (word_feat, feat, most_common):
    - word_feat: the words as lists of bigrams;
    - feat: the distinct bigrams found in the corpus (arbitrary order);
    - most_common: the 20 most frequent bigrams as (bigram, count) pairs.
  """

  word_feat = get_bigrams_in_words(corpus)
  every_bigram = [bigram for word in word_feat for bigram in word]
  most_common = Counter(every_bigram).most_common(20) # change the value when applying this to the whole corpus
  feat = list(set(every_bigram))

  return word_feat, feat, most_common


def dict_neighbor_ngrams_frequency(word_in_ngrams, common_feat):

  """Count, for each selected n-gram, which n-grams follow it.

  Args:
    word_in_ngrams: list of words, each given as a list of n-gram tuples.
    common_feat: the n-grams whose followers are counted.

  Returns:
    A dictionary with one entry per n-gram of `common_feat`, mapping it to a
    dictionary {following n-gram: number of times it follows}. N-grams that
    are never followed by anything map to an empty dictionary.
  """

  followers = {ngram: Counter() for ngram in common_feat}

  for word in word_in_ngrams:
    # Pair every n-gram with its successor; the last n-gram of a word has none.
    for current, following in zip(word, word[1:]):
      if current in followers:
        followers[current][following] += 1

  return {ngram: dict(tally) for ngram, tally in followers.items()}

In [None]:
def rel_freq(list_tuples_freq):

  """Compute the relative frequencies of two labelled counts.

  Args:
    list_tuples_freq: sequence whose first two items are (label, count)
      pairs.

  Returns:
    A tuple (share_first, share_second, (label_first, label_second)) where
    each share is the count divided by the sum of the two counts.
  """

  first, second = list_tuples_freq[0], list_tuples_freq[1]
  total = first[1]+second[1]

  return first[1]/total, second[1]/total, (first[0], second[0])


def compare_freq_neigh_bi(dict1, dict2):

  """Compare, n-gram by n-gram, how often shared neighbors occur in two languages.

  Args:
    dict1: {n-gram: {neighbor: count}} for the first language.
    dict2: same structure for the second language; it must contain every
      key of `dict1`.

  Returns:
    A list, sorted by n-gram, of single-key dictionaries
    {n-gram: [{neighbor: (share_in_first, share_in_second)}, ...]} covering
    only the neighbors present in both languages. The two shares are the
    relative frequencies computed by rel_freq.
  """

  # For every n-gram, keep the neighbor counts of both languages side by side.
  paired = {ngram: (dict1[ngram], dict2[ngram]) for ngram in dict1}

  common_neigh_for_bi = [ ]
  for ngram, (first_occ, second_occ) in sorted(paired.items()):
    shared = [
      {neigh: rel_freq([('it', first_occ[neigh]), ('en', second_occ[neigh])])[0:2]}
      for neigh in first_occ if neigh in second_occ
    ]
    common_neigh_for_bi.append({ngram: shared})

  return common_neigh_for_bi

def get_feat_val_for_neighbors_M(list_of_nested_dict):

  """Flatten the neighbor-frequency comparison into three parallel lists.

  Args:
    list_of_nested_dict: list of dictionaries shaped like the output of
      compare_freq_neigh_bi, i.e.
      {n-gram: [{neighbor: (value_l1, value_l2)}, ...]}.

  Returns:
    A tuple (features, values_l1, values_l2) of equally long lists:
    - features: one name per (n-gram, neighbor) pair, built as
      str(n-gram) + '_' + str(neighbor) -- to be used as matrix features;
    - values_l1 / values_l2: the first / second element of the tuple stored
      for that pair.
  """

  featredred = [ ]
  val_l1 = [ ]
  val_l2 = [ ]

  for el in list_of_nested_dict:
    for key, val in el.items():
      featkey = str(key)+'_'
      for dictf in val:
        for kn, val_tup in dictf.items():
          # Append once per neighbor: every item of a multi-key dictionary
          # is kept and nothing is appended for an empty one.
          featredred.append(featkey+str(kn))
          val_l1.append(val_tup[0])
          val_l2.append(val_tup[1])

  return featredred, val_l1, val_l2

def word_as_ngram_seq(word_in_ngrams):

  """Rewrite every word as a sequence of "n-gram followed by n-gram" names.

  Args:
    word_in_ngrams: list of words, each given as a list of n-gram tuples.

  Returns:
    One list per word with a string str(ngram) + '_' + str(next_ngram) for
    every pair of adjacent n-grams. Words with fewer than two n-grams give
    an empty list.
  """

  return [
    [str(current)+'_'+str(following) for current, following in zip(word, word[1:])]
    for word in word_in_ngrams
  ]

def weights_sum(matrix):

  """Sum the values of every row of a weight matrix.

  Args:
    matrix: iterable of rows, each an iterable of numbers.

  Returns:
    A list with one single-element list [row_sum] per row, i.e. the sums
    laid out as a column that can be appended to a matrix.
  """

  return [[sum(row)] for row in matrix]

def gen_seq_weigh_matrix(corpus1, corpus2):

  """Build the sequence matrix, extended with per-language weight sums, for two corpora.

  Steps:
  - extract the bigrams of both corpora and keep the ones they share;
  - for each shared bigram, count which bigrams follow it in each language;
  - turn every (bigram, following bigram) pair found in both languages into a
    feature and get its relative frequency in each language;
  - build a presence matrix with the words of corpus1 followed by those of
    corpus2 as rows and these features as columns;
  - weight the matrix with the relative frequencies of each language and sum
    the weights of every row.

  Args:
    corpus1: list of words of the first language.
    corpus2: list of words of the second language.

  Returns:
    A numpy array: the presence matrix plus two extra columns, the row-wise
    weight sum for the first language and then for the second.
  """

  # words as bigrams and distinct bigrams of each language
  first_words, first_feats, _ = lang_feat_info(corpus1)
  second_words, second_feats, _ = lang_feat_info(corpus2)
  shared_feats = list(set(second_feats).intersection(first_feats))

  # followers of the shared bigrams, per language
  first_neigh = dict_neighbor_ngrams_frequency(first_words, shared_feats)
  second_neigh = dict_neighbor_ngrams_frequency(second_words, shared_feats)

  comparison = compare_freq_neigh_bi(first_neigh, second_neigh)
  seq_features, first_values, second_values = get_feat_val_for_neighbors_M(comparison)

  # row 0: relative frequencies in the first language, row 1: in the second
  weights = np.array([first_values, second_values])

  word_sequences = word_as_ngram_seq(first_words+second_words)
  presence = gen_count_matrix(seq_features, word_sequences)

  first_column = weights_sum(presence * weights[0])
  second_column = weights_sum(presence * weights[1])

  sequence_weighted_matrix = np.concatenate((presence, first_column, second_column), axis = 1)

  return sequence_weighted_matrix


### Attempt on a sub corpus

In [None]:
# Small random sub-corpora (100 words per language) to try the pipeline
# quickly. No seed is set, so the selection changes at every run.
RED_IT = random.sample(IT_corpus, 100)
RED_EN = random.sample(EN_corpus, 100)

In [None]:
# GETTING INFO ABOUT FEATURES OF THE TWO CORPORA

# list of lists with tuples
# list of unique features
# Count of most common uique features

redIT_bi, redIT_feat, redIT_most_freq = lang_feat_info(RED_IT)
redEN_bi, redEN_feat, redEN_most_freq = lang_feat_info(RED_EN)

In [None]:
# COMMON FEATURES BETWEEN LANGUAGES

redcommon_IT_EN = list(set(redEN_feat).intersection(redIT_feat))
redcommon_IT_EN # 149

In [None]:
# UNIQUE FEATURES OF EACH LANGUAGE

redunique_EN = [feat for feat in redEN_feat if feat not in redcommon_IT_EN] # 66
redunique_IT = [feat for feat in redIT_feat if feat not in redcommon_IT_EN] # 52

In [None]:
redIT_bi_neigh = dict_neighbor_ngrams_frequency(redIT_bi, redcommon_IT_EN)
redEN_bi_neigh = dict_neighbor_ngrams_frequency(redEN_bi, redcommon_IT_EN)

[k for k in redIT_bi_neigh.keys()] == [k for k in redEN_bi_neigh.keys()]  # True

True

In [None]:
red_comparison_freq_ne = compare_freq_neigh_bi(redIT_bi_neigh, redEN_bi_neigh)

### Working on the whole corpus

##### For each sub corpus (corresponding to each language) we are getting:

- words in bigrams
- bigrams (features)
- most frequent bigrams for language (this info is not used)


In [None]:
# working on the whole corpus (two sub-corpora)

IT_bi, IT_feat, IT_most_freq = lang_feat_info(IT_corpus)
EN_bi, EN_feat, EN_most_freq = lang_feat_info(EN_corpus)

In [None]:
# list of common features to which we want to assign a weight

common_IT_EN = list(set(EN_feat).intersection(IT_feat)) #296


In [None]:
unique_EN = [feat for feat in EN_feat if feat not in common_IT_EN] # 143
unique_IT = [feat for feat in IT_feat if feat not in common_IT_EN] # 38

In [None]:
# I want a dictionary with the frequency of the bigram neighbors for each bigram in common
# to calculate the corresponding weights

IT_bi_neigh = dict_neighbor_ngrams_frequency(IT_bi, common_IT_EN) # dizionario intero
EN_bi_neigh = dict_neighbor_ngrams_frequency(EN_bi, common_IT_EN)

check = [k for k in IT_bi_neigh.keys()] == [k for k in EN_bi_neigh.keys()]

In [None]:
# To know in which language a certain n-gram neighbor is more/less frequent 

IT_EN_n_freq_comparison = compare_freq_neigh_bi(IT_bi_neigh, EN_bi_neigh) # 296 items, corrispondono ai bigrammi in comune alle due lingue
pprint(IT_EN_n_freq_comparison[:2])                                                                       


### Weights calculation based on the occurrence of a sequence of bigrams

- (1) from the frequency of certain sequences of n-grams in each language we get a value per language and store it in a matrix. Rows hold the values for each language and the features (columns) are the sequences of n-grams that the languages have in common

- (2) we build a count matrix in which the words, expressed as n-gram sequences, are the rows and the sequences of n-grams common to the languages (as in the previous matrix) are the columns

- (3) we multiply each row of the matrix in (2) by each row of the matrix in (1) separately and store the sum of the weights in a variable.

- (4) we add two more columns to our matrix (2) in which we store the sum of the weights for each language (obtained in (3)) for each word

The matrix in (4) is implemented in the training of the classifier.


##### test on reduced corpus

In [None]:
redred_fqn = IT_EN_n_freq_comparison[:3] 

In [None]:
redred_fqn

In [None]:
ffftt, v_it, v_en = get_feat_val_for_neighbors_M(redred_fqn)

In [None]:
# Display-only matrix: row 0 holds the feature names, rows 1 and 2 the
# Italian and English values. Mixing names and numbers makes NumPy store
# every cell as a string, so this array is not meant for computation.
red_C_relFmatrix = np.array([ffftt, v_it, v_en])

In [None]:
# little visualization of the matrix

for line in red_C_relFmatrix:
    print('  '.join(map(str, line)))

## Test on the whole corpus - step by step

-- the function `gen_seq_weigh_matrix`, defined above, performs this whole process in a single call

In [None]:
seq_as_ft, values_it, values_en = get_feat_val_for_neighbors_M(IT_EN_n_freq_comparison)

In [None]:
# SEQUENCY VALUES MATRIX

# Display-only variant with the feature names as first row; NumPy stores
# every cell of it as a string.
ITENseq_val_MFEAT = np.array([seq_as_ft, values_it, values_en]) # with features as title

# Numeric weights: row 0 = Italian values, row 1 = English values,
# one column per sequence feature.
ITENseq_val_M = np.array([values_it, values_en])
ITENseq_val_M.shape



(2, 1573)

In [None]:
### CORPUS DATA MAPPED IN CM LIKE WEIGHT MATRIX

#(1) words as sequency of prev_next bi instead of normal bigrams

ITEN_seq = word_as_ngram_seq(IT_bi+EN_bi) # colonna words as sequence of bigrams
ITEN_CM_seq = gen_count_matrix(seq_as_ft, ITEN_seq)  #COUNT MATRIX DI BI PER SEQUENZA

In [None]:
ITEN_CM_seq.shape

(7000, 1573)

### Generate the weighted matrix

In [None]:
ITEN_CM_seq # MAT count words
ITENseq_val_M # MAT weights DI PARTENZA

In [None]:
# Weight the count matrix with the values of each language.
# Each weight row (one value per feature) is broadcast over all the words,
# so every column of the count matrix is scaled by the value of its feature:
# row 0 holds the Italian values, row 1 the English ones.

weighted_M_it = np.multiply(ITEN_CM_seq, ITENseq_val_M[0][:])
weighted_M_en = np.multiply(ITEN_CM_seq, ITENseq_val_M[1][:])





In [None]:
it_w_col = weights_sum(weighted_M_it)
en_w_col = weights_sum(weighted_M_en) # now i have to add these two columns to the matrix that i will feed to the classifier

In [None]:
prova_matrix_seq = np.append(ITEN_CM_seq, it_w_col, axis = 1)
prova_matrix_seq_complete = np.append(prova_matrix_seq, en_w_col, axis = 1)

In [None]:
prova_matrix_seq_complete.shape

(7000, 1575)

In [None]:
print(prova_matrix_seq_complete[ :2][:]) 

[[1.         0.         0.         ... 0.         6.27827235 2.72172765]
 [1.         0.         0.         ... 0.         4.1951128  1.8048872 ]]


# Let's try some training and test!

## IT-EN corpus

In [None]:
# getting sequence matrix

ITEN_weighted = gen_seq_weigh_matrix(IT_corpus, EN_corpus)



In [None]:
# One label per word: the first half of the concatenated corpus is Italian,
# the second half English (both corpora must have the same size).
ITENlabels = language_labels(IT_corpus+EN_corpus, 'it', 'en')
# Peek at the boundary between the two languages.
ITENlabels[3498:3502]

['it', 'it', 'en', 'en']

### Sequence matrix 

In [None]:
# NAIVE BAYES

seq_NB_ITEN_train, seq_NB_ITEN_score_mean, seq_NB_ITEN_ok_guess = model_trainer_tester(GaussianNB(), ITEN_weighted, ITENlabels)
print('accuracy score:', seq_NB_ITEN_score_mean, 'correct guess in test:', seq_NB_ITEN_ok_guess)

accuracy score: 0.78375 correct guess in test: 78.78999999999999


In [None]:
# MULTINOMIAL BAYES

seq_MNB_ITEN_train, seq_MNB_ITEN_score_mean, seq_MNB_ITEN_ok_guess = model_trainer_tester(MultinomialNB(), ITEN_weighted, ITENlabels)
print('accuracy score:', seq_MNB_ITEN_score_mean, 'correct guess in test:', seq_MNB_ITEN_ok_guess)

accuracy score: 0.8482142857142858 correct guess in test: 84.93


In [None]:
# RANDOM FOREST

seq_RF_ITEN_train, seq_RF_ITEN_score_mean, seq_RF_ITEN_ok_guess = model_trainer_tester(RandomForestClassifier(n_estimators=100), ITEN_weighted, ITENlabels)
print('accuracy score:', seq_RF_ITEN_score_mean, 'correct guess in test:', seq_RF_ITEN_ok_guess)

accuracy score: 0.8880357142857143 correct guess in test: 88.78999999999999


In [None]:
# LOGISTIC REGRESSION

seq_LR_ITEN_train, seq_LR_ITEN_score_mean, seq_LR_ITEN_ok_guess = model_trainer_tester(LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr'),ITEN_weighted, ITENlabels)
print('accuracy score:', seq_LR_ITEN_score_mean, 'correct guess in test:', seq_LR_ITEN_ok_guess)

accuracy score: 0.8917857142857143 correct guess in test: 89.21000000000001


In [None]:
# SVM (RBF kernel) on the IT-EN sequence-weighted matrix

seq_SMV_ITEN_train, seq_SMV_ITEN_score_mean, seq_SMV_ITEN_ok_guess = model_trainer_tester(SVC(kernel='rbf'), ITEN_weighted, ITENlabels)
print('accuracy score:', seq_SMV_ITEN_score_mean, 'correct guess in test:', seq_SMV_ITEN_ok_guess)

accuracy score: 0.8910714285714286 correct guess in test: 88.78999999999999


In [None]:
with open('seq_trainingITEN.pkl', 'wb') as f:  
    pickle.dump([seq_NB_ITEN_score_mean, seq_NB_ITEN_ok_guess, 
                 seq_MNB_ITEN_score_mean, seq_MNB_ITEN_ok_guess,
                 seq_RF_ITEN_score_mean, seq_RF_ITEN_ok_guess,
                 seq_LR_ITEN_score_mean, seq_LR_ITEN_ok_guess,
                 seq_SMV_ITEN_score_mean, seq_SMV_ITEN_ok_guess], f)

# # Getting back the objects:
# with open('objs.pkl') as f:  # Python 3: open(..., 'rb')
#     obj0, obj1, obj2 = pickle.load(f)

### Count matrix

In [None]:
# Plain bigram baseline for the IT-EN pair: Italian words first, then
# English, in the same order used to build ITENlabels.
ITEN_corpus = IT_corpus + EN_corpus

ITEN_corpus_bigrams =  get_bigrams_in_words(ITEN_corpus)# words as lists of bigrams
ITEN_bi_feat = Ngrams_features(ITEN_corpus_bigrams)# distinct bigrams, used as features
CM_ITEN = gen_count_matrix(ITEN_bi_feat, ITEN_corpus_bigrams) # matrix: rows = words, columns = bigrams

In [None]:
# NAIVE BAYES

CM_NB_ITEN_train, CM_NB_ITEN_score_mean, CM_NB_ITEN_ok_guess = model_trainer_tester(GaussianNB(), CM_ITEN, ITENlabels)
print('accuracy score:', CM_NB_ITEN_score_mean, 'correct guess in test:', CM_NB_ITEN_ok_guess)

accuracy score: 0.785357142857143 correct guess in test: 76.86


In [None]:
# MULTINOMIAL NB

CM_MNB_ITEN_train, CM_MNB_ITEN_score_mean, CM_MNB_ITEN_ok_guess = model_trainer_tester(MultinomialNB(), CM_ITEN, ITENlabels)
print('accuracy score:', CM_MNB_ITEN_score_mean, 'correct guess in test:', CM_MNB_ITEN_ok_guess)

accuracy score: 0.9355357142857142 correct guess in test: 94.36


In [None]:
# RANDOM FOREST

CM_RF_ITEN_train, CM_RF_ITEN_score_mean, CM_RF_ITEN_ok_guess = model_trainer_tester(RandomForestClassifier(n_estimators=100), CM_ITEN, ITENlabels)
print('accuracy score:', CM_RF_ITEN_score_mean, 'correct guess in test:', CM_RF_ITEN_ok_guess)

accuracy score: 0.9430357142857144 correct guess in test: 94.86


In [None]:
# LOGISTIC REGRESSION (liblinear, one-vs-rest) on the IT-EN count matrix

CM_LR_ITEN_train, CM_LR_ITEN_score_mean, CM_LR_ITEN_ok_guess = model_trainer_tester(LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr'), CM_ITEN, ITENlabels)
print('accuracy score:', CM_LR_ITEN_score_mean, 'correct guess in test:', CM_LR_ITEN_ok_guess)

accuracy score: 0.9471428571428572 correct guess in test: 94.86


In [None]:
# SVM RBF

CM_SMV_ITEN_train, CM_SMV_ITEN_score_mean, CM_SMV_ITEN_ok_guess = model_trainer_tester(SVC(kernel='rbf'), CM_ITEN, ITENlabels)
print('accuracy score:', CM_SMV_ITEN_score_mean, 'correct guess in test:', CM_SMV_ITEN_ok_guess)

accuracy score: 0.9469642857142857 correct guess in test: 94.5


In [None]:
with open('CM_trainingITEN.pkl', 'wb') as f:  
    pickle.dump([CM_NB_ITEN_score_mean, CM_NB_ITEN_ok_guess, 
                 CM_MNB_ITEN_score_mean, CM_MNB_ITEN_ok_guess,
                 CM_RF_ITEN_score_mean, CM_RF_ITEN_ok_guess,
                 CM_LR_ITEN_score_mean, CM_LR_ITEN_ok_guess,
                 CM_SMV_ITEN_score_mean, CM_SMV_ITEN_ok_guess], f)

# # Getting back the objects:
# with open('objs.pkl') as f:  # Python 3: open(..., 'rb')
#     obj0, obj1, obj2 = pickle.load(f)

### Tf-idf

In [None]:
tfidf_ITEN = gen_tfidfd_matrix(ITEN_bi_feat, ITEN_corpus_bigrams) # ci mette un botto

In [None]:
with open('tfidf_matrixITEN.pkl', 'wb') as f:  
    pickle.dump(tfidf_ITEN, f)

In [None]:
# NAIVE BAYES

tfidf_NB_ITEN_train, tfidf_NB_ITEN_score_mean, tfidf_NB_ITEN_ok_guess = model_trainer_tester(GaussianNB(), tfidf_ITEN, ITENlabels)
print('accuracy score:', tfidf_NB_ITEN_score_mean, 'correct guess in test:', tfidf_NB_ITEN_ok_guess)

accuracy score: 0.7844642857142856 correct guess in test: 76.71000000000001


In [None]:
# MULTINOMIAL NB

tfidf_MNB_ITEN_train, tfidf_MNB_ITEN_score_mean, tfidf_MNB_ITEN_ok_guess = model_trainer_tester(MultinomialNB(), tfidf_ITEN, ITENlabels)
print('accuracy score:', tfidf_MNB_ITEN_score_mean, 'correct guess in test:', tfidf_MNB_ITEN_ok_guess)

accuracy score: 0.9357142857142857 correct guess in test: 94.57


In [None]:
# RANDOM FOREST

tfidf_RF_ITEN_train, tfidf_RF_ITEN_score_mean, tfidf_RF_ITEN_ok_guess = model_trainer_tester(RandomForestClassifier(n_estimators=100), tfidf_ITEN, ITENlabels)
print('accuracy score:', tfidf_RF_ITEN_score_mean, 'correct guess in test:', tfidf_RF_ITEN_ok_guess)

accuracy score: 0.9433928571428571 correct guess in test: 94.79


In [None]:
# LOGISTIC REGRESSION (liblinear, one-vs-rest) on the IT-EN tf-idf matrix

tfidf_LR_ITEN_train, tfidf_LR_ITEN_score_mean, tfidf_LR_ITEN_ok_guess = model_trainer_tester(LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr'), tfidf_ITEN, ITENlabels)
print('accuracy score:', tfidf_LR_ITEN_score_mean, 'correct guess in test:', tfidf_LR_ITEN_ok_guess)

accuracy score: 0.9419642857142856 correct guess in test: 94.93


In [None]:
# SVM RBF

tfidf_SMV_ITEN_train, tfidf_SMV_ITEN_score_mean, tfidf_SMV_ITEN_ok_guess = model_trainer_tester(SVC(kernel='rbf'), tfidf_ITEN, ITENlabels)
print('accuracy score:', tfidf_SMV_ITEN_score_mean, 'correct guess in test:', tfidf_SMV_ITEN_ok_guess)

accuracy score: 0.9487500000000001 correct guess in test: 95.21


In [None]:
with open('tfidf_trainingITEN.pkl', 'wb') as f:  
    pickle.dump([tfidf_NB_ITEN_score_mean, tfidf_NB_ITEN_ok_guess, 
                 tfidf_MNB_ITEN_score_mean, tfidf_MNB_ITEN_ok_guess,
                 tfidf_RF_ITEN_score_mean, tfidf_RF_ITEN_ok_guess,
                 tfidf_LR_ITEN_score_mean, tfidf_LR_ITEN_ok_guess,
                 tfidf_SMV_ITEN_score_mean, tfidf_SMV_ITEN_ok_guess], f)

## IT_DE corpus

In this case we have to reduce the IT corpus to 2800 tokens to be balanced with the DE one.

In [None]:
# Random subset of 2800 Italian words. No seed is set, so the subset
# changes at every run.
IT_red_corpus = random.sample(IT_corpus, 2800)

### Sequence matrix

In [None]:
ITDE_weighted = gen_seq_weigh_matrix(IT_red_corpus, DE_corpus)
ITDElabels = language_labels(IT_red_corpus+DE_corpus, 'it', 'de')
ITDElabels[2798:2803]

In [None]:
# NAIVE BAYES

seq_NB_ITDE_train, seq_NB_ITDE_score_mean, seq_NB_ITDE_ok_guess = model_trainer_tester(GaussianNB(), ITDE_weighted, ITDElabels)
print('accuracy score:', seq_NB_ITDE_score_mean, 'correct guess in test:', seq_NB_ITDE_ok_guess)

accuracy score: 0.8252232142857142 correct guess in test: 82.5


In [None]:
# MULTINOMIAL NB

# Multinomial naive Bayes on the IT-DE sequence-weighted matrix.
(seq_MNB_ITDE_train,
 seq_MNB_ITDE_score_mean,
 seq_MNB_ITDE_ok_guess) = model_trainer_tester(
    MultinomialNB(),
    ITDE_weighted,
    ITDElabels,
)
print(
    'accuracy score:', seq_MNB_ITDE_score_mean,
    'correct guess in test:', seq_MNB_ITDE_ok_guess,
)

accuracy score: 0.8948660714285716 correct guess in test: 89.55


In [None]:
# RANDOM FOREST

# 100-tree random forest on the IT-DE sequence-weighted matrix.
(seq_RF_ITDE_train,
 seq_RF_ITDE_score_mean,
 seq_RF_ITDE_ok_guess) = model_trainer_tester(
    RandomForestClassifier(n_estimators=100),
    ITDE_weighted,
    ITDElabels,
)
print(
    'accuracy score:', seq_RF_ITDE_score_mean,
    'correct guess in test:', seq_RF_ITDE_ok_guess,
)

accuracy score: 0.9207589285714286 correct guess in test: 92.77


In [None]:
# LOGISTIC REGRESSION

# `multi_class='ovr'` is intentionally not passed: the parameter is deprecated
# in recent scikit-learn releases, and with the liblinear solver the default
# already resolves to one-vs-rest, so the fitted model is unchanged.
(seq_LR_ITDE_train,
 seq_LR_ITDE_score_mean,
 seq_LR_ITDE_ok_guess) = model_trainer_tester(
    LogisticRegression(random_state=0, solver='liblinear'),
    ITDE_weighted,
    ITDElabels,
)
print('accuracy score:', seq_LR_ITDE_score_mean, 'correct guess in test:', seq_LR_ITDE_ok_guess)

accuracy score: 0.9227678571428571 correct guess in test: 93.04


In [None]:
# SVM RBF

# RBF-kernel support vector classifier on the IT-DE sequence-weighted matrix.
(seq_SMV_ITDE_train,
 seq_SMV_ITDE_score_mean,
 seq_SMV_ITDE_ok_guess) = model_trainer_tester(
    SVC(kernel='rbf'),
    ITDE_weighted,
    ITDElabels,
)
print(
    'accuracy score:', seq_SMV_ITDE_score_mean,
    'correct guess in test:', seq_SMV_ITDE_ok_guess,
)

accuracy score: 0.9245535714285713 correct guess in test: 93.21


In [None]:
# Persist the IT-DE sequence-matrix results as one flat list: the two reported
# values per classifier, in the order NB, MNB, RF, LR, SVM.
with open('seq_trainingITDE.pkl', 'wb') as f:
    pickle.dump(
        [
            seq_NB_ITDE_score_mean,
            seq_NB_ITDE_ok_guess,
            seq_MNB_ITDE_score_mean,
            seq_MNB_ITDE_ok_guess,
            seq_RF_ITDE_score_mean,
            seq_RF_ITDE_ok_guess,
            seq_LR_ITDE_score_mean,
            seq_LR_ITDE_ok_guess,
            seq_SMV_ITDE_score_mean,
            seq_SMV_ITDE_ok_guess,
        ],
        f,
    )

### Count matrix

In [None]:
# Build the bigram count matrix for the combined (reduced IT + DE) corpus.
ITDE_corpus = [*IT_red_corpus, *DE_corpus]
ITDE_corpus_bigrams = get_bigrams_in_words(ITDE_corpus)        # bigrams
ITDE_bi_feat = Ngrams_features(ITDE_corpus_bigrams)            # features
CM_ITDE = gen_count_matrix(ITDE_bi_feat, ITDE_corpus_bigrams)  # matrix

In [None]:
# NAIVE BAYES

# Gaussian naive Bayes on the IT-DE count matrix.
(CM_NB_ITDE_train,
 CM_NB_ITDE_score_mean,
 CM_NB_ITDE_ok_guess) = model_trainer_tester(
    GaussianNB(),
    CM_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', CM_NB_ITDE_score_mean,
    'correct guess in test:', CM_NB_ITDE_ok_guess,
)

accuracy score: 0.8205357142857144 correct guess in test: 84.28999999999999


In [None]:
# MULTINOMIAL NB

# Multinomial naive Bayes on the IT-DE count matrix.
(CM_MNB_ITDE_train,
 CM_MNB_ITDE_score_mean,
 CM_MNB_ITDE_ok_guess) = model_trainer_tester(
    MultinomialNB(),
    CM_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', CM_MNB_ITDE_score_mean,
    'correct guess in test:', CM_MNB_ITDE_ok_guess,
)

accuracy score: 0.9573660714285713 correct guess in test: 97.05


In [None]:
# RANDOM FOREST

# 100-tree random forest on the IT-DE count matrix.
(CM_RF_ITDE_train,
 CM_RF_ITDE_score_mean,
 CM_RF_ITDE_ok_guess) = model_trainer_tester(
    RandomForestClassifier(n_estimators=100),
    CM_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', CM_RF_ITDE_score_mean,
    'correct guess in test:', CM_RF_ITDE_ok_guess,
)

accuracy score: 0.9671875 correct guess in test: 97.86


In [None]:
# LOGISTIC REGRESSION

# `multi_class='ovr'` is intentionally not passed: the parameter is deprecated
# in recent scikit-learn releases, and with the liblinear solver the default
# already resolves to one-vs-rest, so the fitted model is unchanged.
(CM_LR_ITDE_train,
 CM_LR_ITDE_score_mean,
 CM_LR_ITDE_ok_guess) = model_trainer_tester(
    LogisticRegression(random_state=0, solver='liblinear'),
    CM_ITDE,
    ITDElabels,
)
print('accuracy score:', CM_LR_ITDE_score_mean, 'correct guess in test:', CM_LR_ITDE_ok_guess)

accuracy score: 0.9694196428571429 correct guess in test: 97.5


In [None]:
# SVM RBF

# RBF-kernel support vector classifier on the IT-DE count matrix.
(CM_SMV_ITDE_train,
 CM_SMV_ITDE_score_mean,
 CM_SMV_ITDE_ok_guess) = model_trainer_tester(
    SVC(kernel='rbf'),
    CM_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', CM_SMV_ITDE_score_mean,
    'correct guess in test:', CM_SMV_ITDE_ok_guess,
)

accuracy score: 0.9723214285714287 correct guess in test: 98.12


In [None]:
# Persist the IT-DE count-matrix results as one flat list: the two reported
# values per classifier, in the order NB, MNB, RF, LR, SVM.
with open('CM_trainingITDE.pkl', 'wb') as f:
    pickle.dump(
        [
            CM_NB_ITDE_score_mean,
            CM_NB_ITDE_ok_guess,
            CM_MNB_ITDE_score_mean,
            CM_MNB_ITDE_ok_guess,
            CM_RF_ITDE_score_mean,
            CM_RF_ITDE_ok_guess,
            CM_LR_ITDE_score_mean,
            CM_LR_ITDE_ok_guess,
            CM_SMV_ITDE_score_mean,
            CM_SMV_ITDE_ok_guess,
        ],
        f,
    )

# # Getting back the objects (pickle files must be opened in binary mode):
# with open('objs.pkl', 'rb') as f:
#     obj0, obj1, obj2 = pickle.load(f)

### Tf-idf

In [None]:
tfidf_ITDE = gen_tfidfd_matrix(ITDE_bi_feat, ITDE_corpus_bigrams) # takes a very long time to run

In [None]:
# Cache the IT-DE tf-idf matrix on disk so it can be reloaded instead of rebuilt.
with open('tfidf_matrixITDE.pkl', 'wb') as f:  
    pickle.dump(tfidf_ITDE, f)

In [None]:
# NAIVE BAYES

# Gaussian naive Bayes on the IT-DE tf-idf matrix.
(tfidf_NB_ITDE_train,
 tfidf_NB_ITDE_score_mean,
 tfidf_NB_ITDE_ok_guess) = model_trainer_tester(
    GaussianNB(),
    tfidf_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', tfidf_NB_ITDE_score_mean,
    'correct guess in test:', tfidf_NB_ITDE_ok_guess,
)

accuracy score: 0.81875 correct guess in test: 83.93


In [None]:
# MULTINOMIAL NB

# Multinomial naive Bayes on the IT-DE tf-idf matrix.
(tfidf_MNB_ITDE_train,
 tfidf_MNB_ITDE_score_mean,
 tfidf_MNB_ITDE_ok_guess) = model_trainer_tester(
    MultinomialNB(),
    tfidf_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', tfidf_MNB_ITDE_score_mean,
    'correct guess in test:', tfidf_MNB_ITDE_ok_guess,
)

accuracy score: 0.9551339285714286 correct guess in test: 96.52


In [None]:
# RANDOM FOREST

# 100-tree random forest on the IT-DE tf-idf matrix.
(tfidf_RF_ITDE_train,
 tfidf_RF_ITDE_score_mean,
 tfidf_RF_ITDE_ok_guess) = model_trainer_tester(
    RandomForestClassifier(n_estimators=100),
    tfidf_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', tfidf_RF_ITDE_score_mean,
    'correct guess in test:', tfidf_RF_ITDE_ok_guess,
)

accuracy score: 0.9678571428571429 correct guess in test: 97.77


In [None]:
# LOGISTIC REGRESSION

# `multi_class='ovr'` is intentionally not passed: the parameter is deprecated
# in recent scikit-learn releases, and with the liblinear solver the default
# already resolves to one-vs-rest, so the fitted model is unchanged.
(tfidf_LR_ITDE_train,
 tfidf_LR_ITDE_score_mean,
 tfidf_LR_ITDE_ok_guess) = model_trainer_tester(
    LogisticRegression(random_state=0, solver='liblinear'),
    tfidf_ITDE,
    ITDElabels,
)
print('accuracy score:', tfidf_LR_ITDE_score_mean, 'correct guess in test:', tfidf_LR_ITDE_ok_guess)

accuracy score: 0.9696428571428571 correct guess in test: 96.79


In [None]:
# SVM RBF

# RBF-kernel support vector classifier on the IT-DE tf-idf matrix.
(tfidf_SMV_ITDE_train,
 tfidf_SMV_ITDE_score_mean,
 tfidf_SMV_ITDE_ok_guess) = model_trainer_tester(
    SVC(kernel='rbf'),
    tfidf_ITDE,
    ITDElabels,
)
print(
    'accuracy score:', tfidf_SMV_ITDE_score_mean,
    'correct guess in test:', tfidf_SMV_ITDE_ok_guess,
)

accuracy score: 0.9774553571428571 correct guess in test: 98.21


In [None]:
# Persist the IT-DE tf-idf results as one flat list: the two reported values
# per classifier, in the order NB, MNB, RF, LR, SVM.
with open('tfidf_trainingITDE.pkl', 'wb') as f:
    pickle.dump(
        [
            tfidf_NB_ITDE_score_mean,
            tfidf_NB_ITDE_ok_guess,
            tfidf_MNB_ITDE_score_mean,
            tfidf_MNB_ITDE_ok_guess,
            tfidf_RF_ITDE_score_mean,
            tfidf_RF_ITDE_ok_guess,
            tfidf_LR_ITDE_score_mean,
            tfidf_LR_ITDE_ok_guess,
            tfidf_SMV_ITDE_score_mean,
            tfidf_SMV_ITDE_ok_guess,
        ],
        f,
    )

# Table of results: different languages, matrices, classifiers

In [None]:
# One row per experiment: [model, matrix type, n-gram size, score, test result, language pair].

# Count matrix, it-en
m1 = ['Naive Bayesian', 'CM', 'bigrams', CM_NB_ITEN_score_mean, CM_NB_ITEN_ok_guess, 'it-en']
m2 = ['Multinomial NB', 'CM', 'bigrams', CM_MNB_ITEN_score_mean, CM_MNB_ITEN_ok_guess, 'it-en']
m3 = ['Random Forest', 'CM', 'bigrams', CM_RF_ITEN_score_mean, CM_RF_ITEN_ok_guess, 'it-en']
m4 = ['Logistic Regression', 'CM', 'bigrams', CM_LR_ITEN_score_mean, CM_LR_ITEN_ok_guess, 'it-en']
m5 = ['SVM_RBF', 'CM', 'bigrams', CM_SMV_ITEN_score_mean, CM_SMV_ITEN_ok_guess, 'it-en']

# Sequence matrix, it-en
m6 = ['Naive Bayesian', 'SEQ', 'bigrams', seq_NB_ITEN_score_mean, seq_NB_ITEN_ok_guess, 'it-en']
m7 = ['Multinomial NB', 'SEQ', 'bigrams', seq_MNB_ITEN_score_mean, seq_MNB_ITEN_ok_guess, 'it-en']
m8 = ['Random Forest', 'SEQ', 'bigrams', seq_RF_ITEN_score_mean, seq_RF_ITEN_ok_guess, 'it-en']
m9 = ['Logistic Regression', 'SEQ', 'bigrams', seq_LR_ITEN_score_mean, seq_LR_ITEN_ok_guess, 'it-en']
m10 = ['SVM_RBF', 'SEQ', 'bigrams', seq_SMV_ITEN_score_mean, seq_SMV_ITEN_ok_guess, 'it-en']

# Count matrix, it-de
m11 = ['Naive Bayesian', 'CM', 'bigrams', CM_NB_ITDE_score_mean, CM_NB_ITDE_ok_guess, 'it-de']
m12 = ['Multinomial NB', 'CM', 'bigrams', CM_MNB_ITDE_score_mean, CM_MNB_ITDE_ok_guess, 'it-de']
m13 = ['Random Forest', 'CM', 'bigrams', CM_RF_ITDE_score_mean, CM_RF_ITDE_ok_guess, 'it-de']
m14 = ['Logistic Regression', 'CM', 'bigrams', CM_LR_ITDE_score_mean, CM_LR_ITDE_ok_guess, 'it-de']
m15 = ['SVM_RBF', 'CM', 'bigrams', CM_SMV_ITDE_score_mean, CM_SMV_ITDE_ok_guess, 'it-de']

# Sequence matrix, it-de
m16 = ['Naive Bayesian', 'SEQ', 'bigrams', seq_NB_ITDE_score_mean, seq_NB_ITDE_ok_guess, 'it-de']
m17 = ['Multinomial NB', 'SEQ', 'bigrams', seq_MNB_ITDE_score_mean, seq_MNB_ITDE_ok_guess, 'it-de']
m18 = ['Random Forest', 'SEQ', 'bigrams', seq_RF_ITDE_score_mean, seq_RF_ITDE_ok_guess, 'it-de']
m19 = ['Logistic Regression', 'SEQ', 'bigrams', seq_LR_ITDE_score_mean, seq_LR_ITDE_ok_guess, 'it-de']
m20 = ['SVM_RBF', 'SEQ', 'bigrams', seq_SMV_ITDE_score_mean, seq_SMV_ITDE_ok_guess, 'it-de']

# Tf-idf matrix, it-en
m21 = ['Naive Bayesian', 'tfidf', 'bigrams', tfidf_NB_ITEN_score_mean, tfidf_NB_ITEN_ok_guess, 'it-en']
m22 = ['Multinomial NB', 'tfidf', 'bigrams', tfidf_MNB_ITEN_score_mean, tfidf_MNB_ITEN_ok_guess, 'it-en']
m23 = ['Random Forest', 'tfidf', 'bigrams', tfidf_RF_ITEN_score_mean, tfidf_RF_ITEN_ok_guess, 'it-en']
m24 = ['Logistic Regression', 'tfidf', 'bigrams', tfidf_LR_ITEN_score_mean, tfidf_LR_ITEN_ok_guess, 'it-en']
m25 = ['SVM_RBF', 'tfidf', 'bigrams', tfidf_SMV_ITEN_score_mean, tfidf_SMV_ITEN_ok_guess, 'it-en']

# Tf-idf matrix, it-de
m26 = ['Naive Bayesian', 'tfidf', 'bigrams', tfidf_NB_ITDE_score_mean, tfidf_NB_ITDE_ok_guess, 'it-de']
m27 = ['Multinomial NB', 'tfidf', 'bigrams', tfidf_MNB_ITDE_score_mean, tfidf_MNB_ITDE_ok_guess, 'it-de']
m28 = ['Random Forest', 'tfidf', 'bigrams', tfidf_RF_ITDE_score_mean, tfidf_RF_ITDE_ok_guess, 'it-de']
m29 = ['Logistic Regression', 'tfidf', 'bigrams', tfidf_LR_ITDE_score_mean, tfidf_LR_ITDE_ok_guess, 'it-de']
m30 = ['SVM_RBF', 'tfidf', 'bigrams', tfidf_SMV_ITDE_score_mean, tfidf_SMV_ITDE_ok_guess, 'it-de']

# Rows in table order (m1 .. m30).
models_to_compare = [
    m1, m2, m3, m4, m5, m6, m7, m8, m9, m10,
    m11, m12, m13, m14, m15, m16, m17, m18, m19, m20,
    m21, m22, m23, m24, m25, m26, m27, m28, m29, m30,
]

CM_SEQ_tfidf_itende_comparisontable = pd.DataFrame(
    models_to_compare,
    columns=['MODEL', 'MATRIX', 'N-GRAMS', 'ACCURACY', 'TEST', 'LANGUAGES'],
)
CM_SEQ_tfidf_itende_comparisontable

Unnamed: 0,MODEL,MATRIX,N-GRAMS,ACCURACY,TEST,LANGUAGES
0,Naive Bayesian,CM,bigrams,0.785357,76.86,it-en
1,Multinomial NB,CM,bigrams,0.935536,94.36,it-en
2,Random Forest,CM,bigrams,0.943036,94.86,it-en
3,Logistic Regression,CM,bigrams,0.947143,94.86,it-en
4,SVM_RBF,CM,bigrams,0.946964,94.5,it-en
5,Naive Bayesian,SEQ,bigrams,0.78375,78.79,it-en
6,Multinomial NB,SEQ,bigrams,0.848214,84.93,it-en
7,Random Forest,SEQ,bigrams,0.888036,88.79,it-en
8,Logistic Regression,SEQ,bigrams,0.891786,89.21,it-en
9,SVM_RBF,SEQ,bigrams,0.891071,88.79,it-en


# Test on Italian neologisms

In [None]:
# Install Selenium plus a Chromium driver in the notebook VM (shell commands).
!pip install selenium 
!apt-get update 
!apt install chromium-chromedriver 
# Copy the chromedriver binary into /usr/bin.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin  
import sys 
# NOTE(review): this puts the chromedriver path on Python's *module* search
# path (sys.path); presumably meant to help Selenium find the driver -- confirm
# whether it is actually needed.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')  
from selenium import webdriver  
# Command-line flags handed to Chrome: headless mode, sandbox disabled,
# /dev/shm usage disabled.
chrome_options = webdriver.ChromeOptions() 
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox') 
chrome_options.add_argument('--disable-dev-shm-usage') 
from bs4 import BeautifulSoup

Let's do some scraping to retrieve a list of neologisms from Treccani and Accademia della Crusca

In [None]:
# Scrape the Treccani neologism index: first collect the links found in the
# page's navigation lists, then harvest every lemma listed on each linked page.

# `options=` replaces the deprecated `chrome_options=` keyword, and the driver
# binary is looked up on PATH instead of being passed positionally (the
# positional executable path is no longer accepted by recent Selenium).
driver = webdriver.Chrome(options=chrome_options)

url0 = "https://www.treccani.it"
url1 = "https://www.treccani.it/magazine/lingua_italiana/neologismi/searchNeologismi.jsp?lettera=A&catResult=1072"

driver.get(url1)
source = driver.page_source
# An explicit parser avoids bs4's "no parser was explicitly specified" warning
# and keeps the parse independent of which optional parsers are installed.
soup = BeautifulSoup(source, 'html.parser')
all_pages = soup.find_all('ul', class_='liv3-nav')

# href of every anchor inside the navigation lists; anchors that carry no
# href are skipped so the URL concatenation below cannot fail on None.
pages_list = [
  link
  for nav in all_pages
  for link in (anchor.get('href') for anchor in nav.find_all("a"))
  if link
]

# list with all the links in which to scrape neologisms on the Treccani website
list_links = [str(url0 + chunk) for chunk in pages_list]

neolog_list_tr = [ ]

for url in list_links:
  driver.get(url)
  source = driver.page_source                  # get the page source
  soup = BeautifulSoup(source, 'html.parser')  # get the soup

  for page in soup.find_all('div', class_='cont-lemmi'):
    for col in page.find_all('div', class_='col-lemmi'):
      for lemma_raw in col.find_all("dd"):
        anchor = lemma_raw.find("a")
        if anchor is None:   # <dd> entry without a link: nothing to read
          continue
        # Clean the word: every whitespace character is removed, so
        # multi-word entries are joined into a single token.
        neolog_list_tr.append(re.sub(r'\s+', '', anchor.get_text()))

print(neolog_list_tr)

  """Entry point for launching an IPython kernel.




In [None]:
# Scrape the "parole nuove" page of the Accademia della Crusca and collect the
# text of the link found in each entry's heading.
url = "https://accademiadellacrusca.it/it/lingua-italiana/parole-nuove/"

neolog_list_cr = [ ]

driver.get(url)
source = driver.page_source                  # get the page source
# An explicit parser avoids bs4's "no parser was explicitly specified" warning.
soup = BeautifulSoup(source, 'html.parser')  # get the soup

all_lemmi = soup.find_all('div', class_='row mb-30')

for page in all_lemmi:
  # Rows that lack the expected <h2><a>...</a></h2> structure are skipped
  # instead of raising AttributeError on None.
  heading = page.find('h2', class_='mb-0 fs-18')
  anchor = heading.find('a') if heading is not None else None
  if anchor is None:
    continue
  neolog_list_cr.append(anchor.get_text())

print(neolog_list_cr)

['abbattere', 'abilismo', 'ageismo', 'algocrazia', 'algoretica', 'anagrafare', 'anagrafatura', 'audismo', 'badante', 'bioeticista', 'bioterrorismo', 'blastare', 'boomer', 'bralette', 'brassare', 'bufu', 'bullizzare', 'burger', 'camperizzare', 'cartolarizzazione', 'catcalling', 'cica crema', 'coding', 'contact tracer', 'contact tracing', 'coronavirus', 'cringe', 'cuociriso', 'demedicalizzazione', 'didattica a distanza (DAD)', 'disiscrivere, disiscriversi', 'disiscrizione', 'dissare', 'distanziamento sociale', 'docciarsi', 'domotizzare', 'domotizzazione', 'doomscrolling', 'dressare', 'droplet', 'enogastronomo', 'ergodico', 'esitanza vaccinale', 'eskere', 'esodato', 'eurocent', 'eurolandia', 'eurozona', 'fasarsi', 'FOMO (fear of missing out)', 'foodie', 'freezare', 'friendzonare', 'ghosting', 'gialloverde', 'girotondo', 'green pass', 'hashtag', 'hater', 'hipster', 'identità digitale', 'impiattare', 'impigiamare', 'influencer', 'infodemia', 'ipermedicalizzazione', 'lievitista', 'link epide

Let's merge the two corpora

In [None]:
# Concatenate the Treccani and Crusca lists, then drop duplicates.
all_neol = [*neolog_list_tr, *neolog_list_cr]
it_neol = list(set(all_neol))
len(it_neol)  # number of distinct neologisms

13300

In [None]:
# Draw a random sample of 20 neologisms and lower-case them; the models are
# tested on this sample.
sample_it_neologisms = [word.lower() for word in random.sample(it_neol, 20)]


In [None]:
# Fixed, hand-picked sample of neologisms (rebinds `sample_it_neologisms`).
sample_it_neologisms = [
    'salvaenergia',
    'singleattitudiner',
    'obamizzare',
    'familybanker',
    'dissare',
    'furbettodelpieno',
    'blogging',
]

## Testing the models on the neologisms

Functions 

In [None]:
# makes the results of the tests readable

def results_labels(test, test_words):
  """Pair every tested word with the label predicted for it.

  `test` holds the predicted labels and `test_words` the words they belong to,
  in the same order. Returns a list of `[word, label]` lists; pairing stops at
  the end of the shorter input.
  """
  predicted = list(test)
  return [list(pair) for pair in zip(test_words, predicted)]

Test

In [None]:
# Transform each sampled neologism into a list of its bigrams.
it_neologisms_list = get_bigrams_in_words(sample_it_neologisms)
# Matrix for the neologisms, built with the IT-EN bigram features
# (presumably tf-idf weighted, going by the helper's name).
it_neol_matrix = gen_tfidfd_matrix(ITEN_bi_feat, it_neologisms_list)  
# NOTE(review): this call receives the IT and EN training corpora, not the
# neologisms, so `it_neolSEQ_matrix` looks like a rebuilt training matrix
# rather than a test matrix for the sample -- confirm and fix the input.
it_neolSEQ_matrix = gen_seq_weigh_matrix(IT_corpus, EN_corpus)


In [None]:
# Fit an RBF-kernel SVM on the IT-EN tf-idf matrix, then label the neologisms.
clf = SVC(kernel='rbf')
trainingSVM = clf.fit(
    tfidf_ITEN,
    ITENlabels,
)
SVM = clf.predict(it_neol_matrix)

In [None]:
SVM

array(['it', 'en', 'it', 'en', 'it', 'it', 'en'], dtype='<U2')

In [None]:
# GaussianNB lives in the `sklearn.naive_bayes` submodule; importing it from
# the top-level `sklearn` package raises ImportError.
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit Gaussian naive Bayes on the IT-EN sequence-weighted matrix and predict
# one label per row of `it_neolSEQ_matrix`.
GNB = GaussianNB()
seqfNB = GNB.fit(
    ITEN_weighted,
    ITENlabels,
)
worst = GNB.predict(it_neolSEQ_matrix)

In [None]:
worst

array(['it', 'it', 'en', ..., 'en', 'en', 'en'], dtype='<U2')

Results

In [None]:
from pprint import pprint

In [None]:
# Pair the SVM predictions with the sampled neologisms and pretty-print them.
B1_test_neo = results_labels(
    test=SVM,
    test_words=sample_it_neologisms,
)
pprint(B1_test_neo)

[['salvaenergia', 'it'],
 ['singleattitudiner', 'en'],
 ['obamizzare', 'it'],
 ['familybanker', 'en'],
 ['dissare', 'it'],
 ['furbettodelpieno', 'it'],
 ['blogging', 'en']]


In [None]:
# Pair the `RF_1_results_on_it_neol` predictions with the sampled neologisms
# and pretty-print them.
RF_1_test_neo = results_labels(
    test=RF_1_results_on_it_neol,
    test_words=sample_it_neologisms,
)
pprint(RF_1_test_neo)

[['xbox', 'en'],
 ['sindacaleggiare', 'it'],
 ['yahooizzarsi', 'en'],
 ['furbettodell’Imu', 'it'],
 ['bio-citta', 'it'],
 ['sartoriano', 'it'],
 ['blacktie', 'en'],
 ['flexinsecurity', 'en'],
 ['L-arginina', 'it'],
 ['panarabista', 'it'],
 ['lobbistico', 'en'],
 ['cattonevrosi', 'it'],
 ['antistronzista', 'it'],
 ['twittare', 'it'],
 ['Paperonopoli', 'it'],
 ['furbettodelcalcetto', 'it'],
 ['cornerfoodhotel', 'en'],
 ['bond-people', 'en'],
 ['inchiestismo', 'en'],
 ['Quirinarie', 'en']]


In [None]:
# Pair the `Bayes_2_tfidf_results_on_it_neol` predictions with the sampled neologisms.
Bayes_2_tfidf_test_neo = results_labels(
    test=Bayes_2_tfidf_results_on_it_neol,
    test_words=sample_it_neologisms,
)

In [None]:
# Pair the `RF_2_tfidf_results_on_it_neol` predictions with the sampled neologisms.
RF_2_tfidf_test_neo = results_labels(
    test=RF_2_tfidf_results_on_it_neol,
    test_words=sample_it_neologisms,
)