# 1. Pre-processing

## 1.1. Installation and imports

In [None]:
# Corpus 

import re
!pip install git+https://github.com/jimmycallin/plainstream.git --quiet
import plainstream
import numpy as np
import random

# Preparation for the training

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Model training

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import  MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

# drive

from google.colab import drive
drive.mount("/content/drive/")

import time
import pickle
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  Building wheel for plainstream (setup.py) ... [?25l[?25hdone
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%cd '/content/drive/MyDrive/MACHINE_LEARNING/MID_PROJ_FILES'

/content/drive/MyDrive/MACHINE_LEARNING/MID_PROJ_FILES


# DATA


- getting data for each language through Plainstream
- cleaning the data
- creation of different corpora according to variables

### Functions to clean and tokenize data

In [None]:
# tokenizer

def tokenizer(text):
  """Split an iterable of raw strings into one flat list of lower-cased tokens.

  Args:
    text: iterable of strings (e.g. the text chunks returned by plainstream).

  Returns:
    A list of non-empty, lower-cased tokens in their original order.
    An empty input yields an empty list.
  """
  # Separators: quotes, brackets, hyphen, punctuation, underscore, whitespace
  # and digits. Inside a character class "+" is a literal plus sign, so "\d+"
  # means "a digit or a plus", not "one or more digits".
  splitter = re.compile(r"[\'\(\)\-\"!?:.,;«»_“”‘’\s\d+]")

  tokens = []
  for chunk in text:
    # extend() keeps the work linear; empty pieces produced by consecutive
    # separators are dropped.
    tokens.extend(piece.lower() for piece in splitter.split(chunk) if piece)

  return tokens


# appends start/end label to characters in each word 

def start_end_tags(word_list):
  """Wrap every word with a start marker '^' and an end marker '$'.

  Args:
    word_list: iterable of strings.

  Returns:
    A list with one tagged string per input word, in the same order
    ('hello' -> '^hello$').
  """
  # Concatenating around the whole word also handles one-character words
  # ('a' -> '^a$', previously '^aa$') and empty strings ('' -> '^$',
  # previously an IndexError).
  return ['^' + word + '$' for word in word_list]

def corpus_min_tag(corpus, min, len_corp):
  """Build a tagged, sorted random sample of the unique long words of a corpus.

  Args:
    corpus: list of word strings (duplicates allowed).
    min: only words strictly longer than this many characters are kept.
      (The name shadows the builtin `min` inside this function; it is kept
      for backward compatibility with existing callers.)
    len_corp: number of words to sample.

  Returns:
    A sorted list of `len_corp` words wrapped by `start_end_tags`.

  Raises:
    ValueError: raised by `random.sample` when fewer than `len_corp` words
      survive the length filter.
  """
  # Sort the unique words first: the iteration order of a set of strings can
  # change between interpreter runs, which would make the sample
  # irreproducible even with a fixed random seed.
  unique_words = sorted(set(corpus))
  long_words = [word for word in unique_words if len(word) > min]
  tagged_words = start_end_tags(long_words)

  # Use the requested size instead of the global EN_corpus.
  sampled_words = random.sample(tagged_words, len_corp)
  sampled_words.sort()

  return sampled_words

### Getting data from Plainstream

❌ This could be improved by defining a download function.
It is important to assign a name to the output.

In [None]:
# Raw text per language, streamed through plainstream (at most 500000 words
# each, tokenize=False) and materialised into a list.

# italian
it_raw = list(plainstream.get_text('it', max_words=500000, tokenize=False))  # 444 articles

# english
en_raw = list(plainstream.get_text('en', max_words=500000, tokenize=False))  # 121 articles

# french
fr_raw = list(plainstream.get_text('fr', max_words=500000, tokenize=False))

# german
de_raw = list(plainstream.get_text('de', max_words=500000, tokenize=False))

# spanish
es_raw = list(plainstream.get_text('es', max_words=500000, tokenize=False))

# polish

# pl_raw = [i for i in plainstream.get_text('pl', max_words=500000, tokenize=False)] 

## WORD LENGTH > 4 corpora

### English corpus

Since the data consists of words taken from Wikipedia articles, further cleaning is needed to remove all the words that include non-English characters.

In [None]:
en_clean = tokenizer(en_raw)     # tot tokens 509484

# Words containing at least one character outside the plain English alphabet.
# They are stored in a set so the filter below is a constant-time lookup
# instead of a scan over a (very long) list for every token.
en_alphabet = set('abcdefghijklmnopqrstuvwxyz')
non_en_words = {word for word in en_clean if not en_alphabet.issuperset(word)}
en_corpus = [word for word in en_clean if word not in non_en_words]  # len 505849

en_corpus_set = list(set(en_corpus)) # 32694 unique words

# selecting words with more than 4 characters (i.e. at least 5)

en_corpus_min4 = [word for word in en_corpus_set if len(word) > 4] # it becomes 29276

# adding the start-end tag

tag_en_corpus = start_end_tags(en_corpus_min4)

# sorting the corpus

tag_en_corpus.sort(reverse= False)

# NOTE(review): this keeps the alphabetically first 25000 words, whereas the
# other languages are randomly sampled by corpus_min_tag — confirm the
# asymmetry is intended.
EN_corpus = tag_en_corpus[:25000]

In [None]:
len(EN_corpus)

25000

### Italian corpus

In [None]:
it_clean = tokenizer(it_raw)      # tot tokens 519712

# Words with at least one character outside the Italian alphabet (a set makes
# the membership test below constant-time).
it_alphabet = set('abcdefghijklmnopqrstuvwxyzàèéìòù')
non_it_words = {word for word in it_clean if not it_alphabet.issuperset(word)}
# Word beginnings treated as non-Italian and filtered out.
non_it_start = ('aa', 'ee', 'ii', 'uu', 'é', 'è', 'ì', 'à', 'ò', 'ù')
it_corpus = [word for word in it_clean
             if word not in non_it_words and not word.startswith(non_it_start)] # len 510058

# Random sample of tagged unique words longer than 4 characters, same size as the English corpus.
IT_corpus = corpus_min_tag(it_corpus, 4, len(EN_corpus))

In [None]:
len(IT_corpus)

25000

### French corpus

In [None]:
fr_clean = tokenizer(fr_raw)

# Words with at least one character outside the French alphabet (a set makes
# the membership test below constant-time).
fr_alphabet = set('abcçdefghijklmnopqrstuvwxyzâæàèéêëîïôœùûüÿ')
non_fr_words = {word for word in fr_clean if not fr_alphabet.issuperset(word)}
fr_corpus = [word for word in fr_clean if word not in non_fr_words]

# Random sample of tagged unique words longer than 4 characters, same size as the English corpus.
FR_corpus = corpus_min_tag(fr_corpus, 4, len(EN_corpus))

### German corpus

In [None]:
de_clean = tokenizer(de_raw)

# Words with at least one character outside the German alphabet (a set makes
# the membership test below constant-time).
de_alphabet = set('abcdefghijklmnopqrstuvwxyzäöüß')
non_de_words = {word for word in de_clean if not de_alphabet.issuperset(word)}
de_corpus = [word for word in de_clean if word not in non_de_words]

# Random sample of tagged unique words longer than 4 characters, same size as the English corpus.
DE_corpus = corpus_min_tag(de_corpus, 4, len(EN_corpus))

### Spanish corpus


In [None]:
es_clean = tokenizer(es_raw)

# Words with at least one character outside the Spanish alphabet (a set makes
# the membership test below constant-time). The accented vowels á é í ó ú are
# part of the alphabet here; without them every accented Spanish word
# (e.g. "también") was discarded as foreign.
es_alphabet = set('abcdefghijklmnñopqrstuvwxyzáéíóúü')
non_es_words = {word for word in es_clean if not es_alphabet.issuperset(word)}
es_corpus = [word for word in es_clean if word not in non_es_words]

# Random sample of tagged unique words longer than 4 characters, same size as the English corpus.
ES_corpus = corpus_min_tag(es_corpus, 4, len(EN_corpus))

In [None]:
# # SAVING DATA INTO FILE TO SAVE TIME

# import pickle

# with open('EN_corpus', "wb") as fp:   #Pickling
#   pickle.dump(EN_corpus, fp)

# with open('IT_corpus', "wb") as fp:   #Pickling
#   pickle.dump(IT_corpus, fp)

# with open('DE_corpus', "wb") as fp:   #Pickling
#   pickle.dump(DE_corpus, fp)

# with open('FR_corpus', "wb") as fp:   #Pickling
#   pickle.dump(FR_corpus, fp)

# with open('ES_corpus', "wb") as fp:   #Pickling
#   pickle.dump(ES_corpus, fp)

#Corpus already downloaded

In [None]:
# REOPENING THE CORPORA FILES
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
import pickle

with open("EN_corpus", "rb") as corpus_file:   # Unpickling
  EN_corpus = pickle.load(corpus_file)

with open("IT_corpus", "rb") as corpus_file:   # Unpickling
  IT_corpus = pickle.load(corpus_file)

with open("DE_corpus", "rb") as corpus_file:   # Unpickling
  DE_corpus = pickle.load(corpus_file)

with open("FR_corpus", "rb") as corpus_file:   # Unpickling
  FR_corpus = pickle.load(corpus_file)

with open("ES_corpus", "rb") as corpus_file:   # Unpickling
  ES_corpus = pickle.load(corpus_file)

## FUNCTIONS TO PREPARE DATA FOR TRAINING

In [None]:
# BIGRAMS

def get_bigrams_in_words(words):
  """Return, for each word, the list of its adjacent character pairs.

  Args:
    words: iterable of strings.

  Returns:
    A list with one entry per word; each entry is a list of 2-tuples of
    consecutive characters (empty for words shorter than two characters).
  """
  return [list(zip(word, word[1:])) for word in words]

# TRIGRAMS

def get_trigrams_in_words(words):
  """Return, for each word, the list of its consecutive character triples.

  Args:
    words: iterable of strings.

  Returns:
    A list with one entry per word; each entry is a list of 3-tuples of
    consecutive characters (empty for words shorter than three characters).
  """
  return [list(zip(word, word[1:], word[2:])) for word in words]


# N-GRAMS AS FEATURES

def Ngrams_features(list_word_ngram):
  """Collect the unique n-grams of a corpus, to be used as feature columns.

  Args:
    list_word_ngram: list with one list of n-grams (tuples) per word.

  Returns:
    A sorted list of the distinct n-grams found across all words.
  """
  unique_ngrams = {ngram for word_ngrams in list_word_ngram for ngram in word_ngrams}

  # Sorting fixes the column order. The iteration order of a set of string
  # tuples can differ between interpreter runs, so an unsorted feature list
  # would not line up with a matrix built (or pickled) in another session.
  return sorted(unique_ngrams)


# CREATES CLASS LIST - works if the corpus is ordered

def language_labels(corpus, *labels):
  """Build the class-label list for a corpus made of equal-sized language blocks.

  The corpus is assumed to be the concatenation of one block per label, in
  the same order as `labels`; each label is repeated len(corpus) // len(labels)
  times.

  Args:
    corpus: the concatenated corpus (only its length is used).
    *labels: one label per language block.

  Returns:
    A list of labels, one block of repeated values per language.
  """
  words_per_language = len(corpus) // len(labels)

  return [label for label in labels for _ in range(words_per_language)]

# COUNT MATRIX - counts features presence for each row

def count_matrix(all_Ngrams, words_Ngrams):
  """Build a binary presence matrix of n-gram features per word.

  Despite the name, cells hold 1 when the n-gram occurs in the word at least
  once and 0 otherwise (occurrences are not counted).

  Args:
    all_Ngrams: list of n-grams used as feature columns (hashable, e.g. tuples).
    words_Ngrams: list with one list of n-grams per word (matrix rows).

  Returns:
    A float numpy array of shape (len(words_Ngrams), len(all_Ngrams)).
  """
  features_matrix = np.zeros((len(words_Ngrams), len(all_Ngrams)))

  # Map every feature to its column(s) once, so each n-gram of each word is a
  # dictionary lookup instead of a scan of the word for every feature.
  columns_by_ngram = {}
  for column, ngram in enumerate(all_Ngrams):
    columns_by_ngram.setdefault(ngram, []).append(column)

  for row, word_ngrams in enumerate(words_Ngrams):
    for ngram in word_ngrams:
      columns = columns_by_ngram.get(ngram)
      if columns is not None:   # n-grams that are not features are ignored
        features_matrix[row, columns] = 1

  return features_matrix


# FUNCTION FOR GENERAL TRAINING - picking classifier, matrix and labels

In [None]:


def model_trainer_tester(clf, X_tr, y_tr, X_test, y_test):
  """Fit a classifier, cross-validate it on the training set and test it.

  Args:
    clf: an unfitted scikit-learn style classifier.
    X_tr, y_tr: training matrix and labels.
    X_test, y_test: held-out matrix and labels.

  Returns:
    A 4-tuple (training, score_mean, clf_test_ok_guess, ex_time):
      training: the value returned by clf.fit (the fitted classifier).
      score_mean: mean accuracy of a 5-fold cross validation on the training set.
      clf_test_ok_guess: percentage of correct predictions on the test set.
      ex_time: elapsed seconds for fit + cross validation + prediction.
  """
  start_time = time.time()

  training = clf.fit(X_tr, y_tr)
  cross_score = cross_val_score(clf, X_tr, y_tr, cv=5)
  score_mean = cross_score.mean()

  clf_test = clf.predict(X_test)
  # 100 minus the percentage of misclassified test rows.
  clf_test_ok_guess = 100-round((y_test != clf_test).sum()*100/X_test.shape[0], 2)

  ex_time = time.time() - start_time
  # Return the measured duration (the `time` module itself was returned before).
  return training, score_mean, clf_test_ok_guess, ex_time



def model_trainer_tester2(clf, matrix, labels):
  """Split the data, fit a classifier, cross-validate it and test it.

  The data is split 80/20 (random_state=42, shuffled); the classifier is
  fitted on the training part, scored with a 5-fold cross validation on that
  same part and finally evaluated on the held-out part.

  Args:
    clf: an unfitted scikit-learn style classifier.
    matrix: feature matrix, one row per sample.
    labels: class labels, one per row of `matrix`.

  Returns:
    A 4-tuple (training, score_mean, clf_test_ok_guess, ex_time):
      training: the value returned by clf.fit (the fitted classifier).
      score_mean: mean cross-validation accuracy on the training part.
      clf_test_ok_guess: percentage of correct predictions on the test part.
      ex_time: elapsed seconds for fit + cross validation + prediction
        (the split itself is not timed).
  """
  X_train, X_test, y_train, y_test = train_test_split(matrix, labels, test_size=0.2, random_state=42, shuffle=True) # data partition

  start_time = time.time()

  training = clf.fit(X_train, y_train)
  cross_score = cross_val_score(clf, X_train, y_train, cv=5)  # cross validation
  score_mean = cross_score.mean()

  clf_test = clf.predict(X_test)
  # 100 minus the percentage of misclassified test rows.
  clf_test_ok_guess = 100-round((y_test != clf_test).sum()*100/X_test.shape[0], 2)

  ex_time = time.time() - start_time
  # Return the measured duration (the `time` module itself was returned before).
  return training, score_mean, clf_test_ok_guess, ex_time

# CORPORA 

- languages
- ngrams
- features
- matrices


In [None]:
# FUNCTION TO GET MATRIX

def ngrams_features_count_matrix(corpus, max_features=10000):
  """Build the bigram and trigram presence matrices of a corpus.

  Args:
    corpus: list of words (strings).
    max_features: a matrix is only built when the number of distinct n-grams
      is strictly below this limit (default 10000, the previous hard-coded
      value).

  Returns:
    A tuple (CM_bi, CM_tri). Each element is the matrix returned by
    count_matrix, or None (after printing a message) when the feature limit
    is reached.
  """
  bigram_corpus = get_bigrams_in_words(corpus)
  trigram_corpus = get_trigrams_in_words(corpus)

  # unique n-grams, used as feature columns
  feat_bi_corpus = Ngrams_features(bigram_corpus)
  feat_tri_corpus = Ngrams_features(trigram_corpus)

  if len(feat_bi_corpus) < max_features:
    CM_bi = count_matrix(feat_bi_corpus, bigram_corpus)
  else:
    CM_bi = None
    print('Too many features to build the CM_bi matrix, select some or reduce dimensionality')

  if len(feat_tri_corpus) < max_features:
    CM_tri = count_matrix(feat_tri_corpus, trigram_corpus)
  else:
    CM_tri = None
    print('Too many features, to build the CM_tri matrix, select some or reduce dimensionality')

  return CM_bi, CM_tri

In [None]:
# FUNCTION TO GET ALL POSSIBLE VARIABLES OF MATRIX STARTING FROM A CORPUS 

def ngrams_features_matrix1(corpus):
  """Build every matrix variant (presence, TF-IDF, reduced) for a corpus.

  Args:
    corpus: list of words (strings).

  Returns:
    A 6-tuple (CM_bi, CM_tri, tf_idf_mat_bi, tf_idf_mat_tri, mat_bi_PCA,
    mat_tri_PCA):
      CM_bi, CM_tri: bigram / trigram presence matrices from count_matrix,
        or None when there are 10000 or more distinct n-grams.
      tf_idf_mat_bi, tf_idf_mat_tri: dense TF-IDF matrices of character
        bigrams / trigrams (analyzer 'char_wb').
      mat_bi_PCA, mat_tri_PCA: 50-component TruncatedSVD reductions of the
        presence matrices, or None when the matching matrix was not built.
  """

  # creates bigrams and trigrams from words

  bigram_corpus = get_bigrams_in_words(corpus)
  trigram_corpus = get_trigrams_in_words(corpus)

  # bigrams and trigrams turned into unique features

  feat_bi_corpus  = Ngrams_features(bigram_corpus)
  feat_tri_corpus  = Ngrams_features(trigram_corpus)

  # COUNT MATRIX

  if len(feat_bi_corpus) < 10000:
    CM_bi = count_matrix(feat_bi_corpus, bigram_corpus)
  else:
    CM_bi = None
    print('Too many features to build the CM_bi matrix, select some or reduce dimensionality')
  if len(feat_tri_corpus) < 10000:
    CM_tri = count_matrix(feat_tri_corpus, trigram_corpus)
  else:
    CM_tri = None
    print('Too many features, to build the CM_tri matrix, select some or reduce dimensionality')

  # TF-IDF

  # NOTE(review): .todense() returns np.matrix, which recent scikit-learn
  # estimators may reject; consider .toarray() once callers are checked.
  vectorizer_bi = TfidfVectorizer(analyzer='char_wb',ngram_range= (2,2))  # takes in a list and transforms it into a matrix
  tf_idf_mat_bi = vectorizer_bi.fit_transform(corpus).todense()
  vectorizer_tri = TfidfVectorizer(analyzer='char_wb',ngram_range= (3,3))  # takes in a list and transforms it into a matrix
  tf_idf_mat_tri = vectorizer_tri.fit_transform(corpus).todense()

  # Dimensionality reduction (TruncatedSVD). TODO: apply it to the TF-IDF matrices as well.

  tr_svd = TruncatedSVD(n_components=50)

  # Default to None so the return statement never hits an unbound name, and
  # compare with `is not None`: `array != None` is element-wise and cannot be
  # used as an `if` condition.
  mat_bi_PCA = None
  mat_tri_PCA = None

  if CM_bi is not None:
    mat_bi_PCA = tr_svd.fit_transform(CM_bi)
  else:
    print('It was not possible to perform PCA on the bigrams matrix')
  if CM_tri is not None:
    mat_tri_PCA = tr_svd.fit_transform(CM_tri)
  else:
    print('It was not possible to perform PCA on the trigrams matrix')


  return CM_bi, CM_tri, tf_idf_mat_bi, tf_idf_mat_tri, mat_bi_PCA, mat_tri_PCA

In [None]:
len(IT_corpus) == len(EN_corpus) == len(DE_corpus) == len(FR_corpus) == len(ES_corpus)

True

# Load saved corpora

## IT - EN

The Italian and English corpora are joined.

In [None]:
IT_EN_corpus = IT_corpus + EN_corpus

len(IT_EN_corpus)


50000

In [None]:
# lANGUAGE LABELS LIST

IT_EN_labels = language_labels(IT_EN_corpus, 'it', 'en')   # same len as corpus


In [None]:
o, p = ngrams_features_count_matrix(IT_EN_corpus)

Since there are many features, we must apply dimensionality reduction.

In [None]:
# with open('IT_EN_bigrams_CM', "wb") as fp:   #Pickling
#   pickle.dump(IT_EN_bigrams_CM, fp)

# with open('IT_EN_trigrams_CM', "wb") as fp:   #Pickling
#   pickle.dump(IT_EN_trigrams_CM, fp)

In [None]:
# Reload the previously pickled IT-EN bigram and trigram matrices.
with open("IT_EN_bigrams_CM", "rb") as matrix_file:   # Unpickling
  IT_EN_bigrams_CM = pickle.load(matrix_file)

with open("IT_EN_trigrams_CM", "rb") as matrix_file:   # Unpickling
  IT_EN_trigrams_CM = pickle.load(matrix_file)

In [None]:
training1, score_mean1, clf_test_ok_guess1, time1 = model_trainer_tester2(MultinomialNB(), IT_EN_trigrams_CM, IT_EN_labels)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# ATTEMPT WITH PIPELINE

# Split of the bigram count matrix. It is not used by the pipelines below but
# is kept because later cells reuse X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(IT_EN_bigrams_CM, IT_EN_labels, test_size=0.2, random_state=42, shuffle=True)

# TfidfVectorizer expects raw strings, not an already-built numeric matrix
# (feeding it the count matrix is presumably what raised the AttributeError
# recorded for this cell), so the pipelines are trained on the words.
words_train, words_test, words_y_train, words_y_test = train_test_split(IT_EN_corpus, IT_EN_labels, test_size=0.2, random_state=42, shuffle=True)

# Pipeline & Gridsearch setup
# Every sample is a single word, so character n-grams are extracted
# (analyzer='char_wb', as in the TF-IDF cells of this notebook).

# TFIDF + Multinomial Naive Bayes pipeline
tvc_pipe = Pipeline([('tvec', TfidfVectorizer(analyzer='char_wb')), ('mb', MultinomialNB())])

# TFIDF + Random forest pipeline
rf_pipe = Pipeline([('tvec', TfidfVectorizer(analyzer='char_wb')), ('rf', RandomForestClassifier())])

# Params for the TFIDF Vectorizer gridsearch
# (stop_words was dropped: it is not used with a character analyzer)
tf_params = {
    'tvec__max_features': [100, 2000],
    'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
}

# Params for the random forest gridsearch
rf_params = {
    'tvec__max_features': [2000],
    'tvec__ngram_range': [(1, 2)],
    'rf__max_depth': [1000],
    'rf__min_samples_split': [100],
    'rf__max_leaf_nodes': [None],
}


### IMPLEMENTING GRID SEARCH

# GridSearchCV fits the pipelines itself, so no separate fit call is needed beforehand.
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv = 5, verbose = 1, n_jobs = -1)
tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 5, verbose =1, n_jobs = -1)

tvc_gs.fit(words_train, words_y_train)
rf_gs.fit(words_train, words_y_train)


### SCORING THE MODELS

# Train and test accuracy of the best estimator of each grid search, collected
# in one table so that every score is displayed (only the last expression of a
# cell is shown).
pipeline_scores = pd.DataFrame({
    'MODEL': ['TFIDF + MultinomialNB', 'TFIDF + RandomForest'],
    'TRAIN': [tvc_gs.score(words_train, words_y_train), rf_gs.score(words_train, words_y_train)],
    'TEST': [tvc_gs.score(words_test, words_y_test), rf_gs.score(words_test, words_y_test)],
})
pipeline_scores


AttributeError: ignored

## IT - EN - DE

In [None]:
IT_EN_DE_corpus = IT_corpus + EN_corpus + DE_corpus # len 75000

In [None]:
IT_EN_DE_labels = language_labels(IT_EN_DE_corpus, 'it', 'en', 'de')

In [None]:
# with open('IT_EN_DE_corpus', "wb") as fp:   #Pickling
#   pickle.dump(IT_EN_DE_corpus, fp)

In [None]:
# ngrams_features_matrix is not defined in this notebook; the function that
# returns the (bigram, trigram) matrix pair is ngrams_features_count_matrix.
IT_EN_DE_bigrams_CM, IT_EN_DE_trigrams_CM = ngrams_features_count_matrix(IT_EN_DE_corpus)

Too many features, select some or reduce dimensionality


In [None]:
# NOTE(review): this cell sits in the IT - EN - DE section but reloads the
# IT-EN bigram matrix — possibly a copy-paste leftover; confirm which file is intended.
with open("IT_EN_bigrams_CM", "rb") as fp:   # Unpickling
  IT_EN_bigrams_CM = pickle.load(fp)

# with open("IT_EN_trigrams_CM", "rb") as fp:   # Unpickling
#   IT_EN_trigrams_CM = pickle.load(fp)

## IT - EN - DE - FR

In [None]:
IT_EN_DE_FR_corpus = IT_corpus + EN_corpus + DE_corpus + FR_corpus

len(IT_EN_DE_FR_corpus)

100000

In [None]:
IT_EN_DE_FR_labels = language_labels(IT_EN_DE_FR_corpus, 'it', 'en', 'de', 'fr')

In [None]:
# with open('IT_EN_DE_FR_corpus', "wb") as fp:   #Pickling
#   pickle.dump(IT_EN_DE_FR_corpus, fp)

In [None]:
# ngrams_features_matrix is not defined in this notebook; the function that
# returns the (bigram, trigram) matrix pair is ngrams_features_count_matrix.
IT_EN_DE_FR_bigrams_CM, IT_EN_DE_FR_trigrams_CM = ngrams_features_count_matrix(IT_EN_DE_FR_corpus)

Too many features, to build the CM_tri matrix, select some or reduce dimensionality


In [None]:
IT_EN_DE_FR_bigrams_CM.shape

(100000, 1211)

In [None]:
# with open('IT_EN_DE_FR_bigrams_CM', "wb") as fp:   #Pickling
#   pickle.dump(IT_EN_DE_FR_bigrams_CM, fp)

In [None]:
with open("IT_EN_DE_FR_bigrams_CM", "rb") as fp:   # Unpickling
  IT_EN_DE_FR_bigrams_CM = pickle.load(fp)

# Preparation for the training



# BIGRAMS - PCA - IT-EN

In [None]:
# IT_EN_bigrams_matrix is not defined in this notebook; the IT-EN bigram
# matrix loaded above is IT_EN_bigrams_CM.
IT_EN_bigrams_CM.shape

(58588, 818)

In [None]:
# Reduce the IT-EN bigram matrix to 10 components with TruncatedSVD.
# (`provamat` is not defined in this notebook; IT_EN_bigrams_CM is the IT-EN
# bigram matrix this section refers to.)
tr_svd = TruncatedSVD(n_components=10)
matPCA = tr_svd.fit_transform(IT_EN_bigrams_CM)

In [None]:
matPCA.shape

(58588, 10)

In [None]:
# Training preparation with PCA matrix

X_trainPCA, X_testPCA, y_trainPCA, y_testPCA = train_test_split(matPCA, IT_EN_labels, test_size=0.2, random_state=42, shuffle=True)

In [None]:

# model_trainer_tester returns four values (fitted model, mean CV accuracy,
# test accuracy in percent, timing value); unpacking only three raises ValueError.
RF_PCA, RF_PCA_score, PCA_RF_clf_test_ok_guess, RF_PCA_time = model_trainer_tester(RandomForestClassifier(n_estimators=100),X_trainPCA, y_trainPCA, X_testPCA, y_testPCA)
NB_PCA, NB_PCA_score, PCA_NB_clf_test_ok_guess, NB_PCA_time = model_trainer_tester(GaussianNB(),X_trainPCA, y_trainPCA, X_testPCA, y_testPCA)
LR_PCA, LR_PCA_score, PCA_LR_clf_test_ok_guess, LR_PCA_time = model_trainer_tester(LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr'),X_trainPCA, y_trainPCA, X_testPCA, y_testPCA)
SVM_linear_PCA, SVM_linear_PCA_score, PCA_SVM_linear_clf_test_ok_guess, SVM_linear_PCA_time = model_trainer_tester(svm.LinearSVC(),X_trainPCA, y_trainPCA, X_testPCA, y_testPCA)
SVM_sig_PCA, SVM_sig_PCA_score, PCA_SVM_sig_clf_test_ok_guess, SVM_sig_PCA_time = model_trainer_tester(SVC(kernel='sigmoid'),X_trainPCA, y_trainPCA, X_testPCA, y_testPCA)
SVM_RBF_PCA, SVM_RBF_PCA_score, PCA_SVM_RBF_clf_test_ok_guess, SVM_RBF_PCA_time = model_trainer_tester(SVC(kernel='rbf'),X_trainPCA, y_trainPCA, X_testPCA, y_testPCA)

In [None]:
# One row per classifier trained on the reduced (PCA) bigram matrix:
# model name, matrix type, n-gram size, mean CV accuracy, test accuracy (%), languages.
m1 = ['Naive Bayesian', 'PCA', 'bigrams', NB_PCA_score, PCA_NB_clf_test_ok_guess, 'it-en']
m2 = ['Random Forest', 'PCA', 'bigrams', RF_PCA_score,PCA_RF_clf_test_ok_guess, 'it-en']
m3 = ['Logistic Regression', 'PCA', 'bigrams', LR_PCA_score, PCA_LR_clf_test_ok_guess, 'it-en']
m4 = ['SVM_linear', 'PCA', 'bigrams', SVM_linear_PCA_score, PCA_SVM_linear_clf_test_ok_guess, 'it-en']
m5 = ['SVM_sig', 'PCA', 'bigrams', SVM_sig_PCA_score, PCA_SVM_sig_clf_test_ok_guess, 'it-en']
m6 = ['SVM_RBF', 'PCA', 'bigrams', SVM_RBF_PCA_score, PCA_SVM_RBF_clf_test_ok_guess, 'it-en']

models_to_compare_PCA = [m1, m2, m3, m4, m5, m6]

comparison_columns_PCA = ['MODEL', 'MATRIX', 'N-GRAMS', 'ACCURACY', 'TEST', 'LANGUAGES']
clf_comparison_bigrams_PCA = pd.DataFrame(models_to_compare_PCA, columns = comparison_columns_PCA)
clf_comparison_bigrams_PCA

Unnamed: 0,MODEL,MATRIX,N-GRAMS,ACCURACY,TEST,LANGUAGES
0,Naive Bayesian,PCA,bigrams,0.708513,70.77,it-en
1,Random Forest,PCA,bigrams,0.762108,76.33,it-en
2,Logistic Regression,PCA,bigrams,0.722125,72.57,it-en
3,SVM_linear,PCA,bigrams,0.722808,72.6,it-en
4,SVM_sig,PCA,bigrams,0.558033,56.81,it-en
5,SVM_RBF,PCA,bigrams,0.780201,78.41,it-en


# BIGRAMS - COUNT MATRIX - IT-EN

In [None]:

# model_trainer_tester returns four values (fitted model, mean CV accuracy,
# test accuracy in percent, timing value); unpacking only three raises ValueError.
MNB_count_matrix, MNB_count_matrix_score, MNB_clf_test_ok_guess, MNB_count_matrix_time = model_trainer_tester(MultinomialNB(),X_train, y_train, X_test, y_test)

NB_count_matrix, NB_count_matrix_score, NB_clf_test_ok_guess, NB_count_matrix_time = model_trainer_tester(GaussianNB(),X_train, y_train, X_test, y_test)
RF_count_matrix, RF_count_matrix_score, RF_clf_test_ok_guess, RF_count_matrix_time = model_trainer_tester(RandomForestClassifier(n_estimators=100),X_train, y_train, X_test, y_test)
LR_count_matrix, LR_count_matrix_score, LR_clf_test_ok_guess, LR_count_matrix_time = model_trainer_tester(LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr'),X_train, y_train, X_test, y_test)
SVM_linear_count_matrix, SVM_linear_count_matrix_score, SVM_linear_clf_test_ok_guess, SVM_linear_count_matrix_time = model_trainer_tester(svm.LinearSVC(),X_train, y_train, X_test, y_test)
# Re-enabled: the comparison table built from these results reads
# SVM_sig_count_matrix_score and SVM_sig_clf_test_ok_guess.
SVM_sig_count_matrix, SVM_sig_count_matrix_score, SVM_sig_clf_test_ok_guess, SVM_sig_count_matrix_time = model_trainer_tester(SVC(kernel='sigmoid'),X_train, y_train, X_test, y_test)
SVM_RBF_count_matrix, SVM_RBF_count_matrix_score, SVM_RBF_clf_test_ok_guess, SVM_RBF_count_matrix_time = model_trainer_tester(SVC(kernel='rbf'),X_train, y_train, X_test, y_test)

### Comparison between algorithms performances for bigrams in count matrix

In [None]:
# One row per classifier trained on the bigram count matrix:
# model name, matrix type, n-gram size, mean CV accuracy, test accuracy (%), languages.
model0 = ['Multinomial Naive Bayesian', 'Count matrix', 'bigrams', MNB_count_matrix_score, MNB_clf_test_ok_guess, 'it-en']
model1 = ['Naive Bayesian', 'Count matrix', 'bigrams', NB_count_matrix_score, NB_clf_test_ok_guess, 'it-en']
model2 = ['Random Forest', 'Count matrix', 'bigrams', RF_count_matrix_score, RF_clf_test_ok_guess, 'it-en']
model3 = ['Logistic Regression', 'Count matrix', 'bigrams', LR_count_matrix_score, LR_clf_test_ok_guess, 'it-en']
model4 = ['SVM_linear', 'Count matrix', 'bigrams', SVM_linear_count_matrix_score, SVM_linear_clf_test_ok_guess, 'it-en']
model5 = ['SVM_sig', 'Count matrix', 'bigrams', SVM_sig_count_matrix_score, SVM_sig_clf_test_ok_guess, 'it-en']
model6 = ['SVM_RBF', 'Count matrix', 'bigrams', SVM_RBF_count_matrix_score, SVM_RBF_clf_test_ok_guess, 'it-en']

models_to_compare = [model0, model1, model2, model3, model4, model5, model6]

comparison_columns_CM = ['MODEL', 'MATRIX', 'N-GRAMS', 'ACCURACY', 'TEST', 'LANGUAGES']
clf_comparison_bigrams_CM = pd.DataFrame(models_to_compare, columns = comparison_columns_CM)
clf_comparison_bigrams_CM

Unnamed: 0,MODEL,MATRIX,N-GRAMS,ACCURACY,TEST,LANGUAGES
0,Multinomial Naive Bayesian,Count matrix,bigrams,0.84218,84.28,it-en
1,Naive Bayesian,Count matrix,bigrams,0.539812,54.37,it-en
2,Random Forest,Count matrix,bigrams,0.845253,84.25,it-en
3,Logistic Regression,Count matrix,bigrams,0.858993,85.81,it-en
4,SVM_linear,Count matrix,bigrams,0.858822,85.76,it-en
5,SVM_sig,Count matrix,bigrams,0.768573,76.22,it-en
6,SVM_RBF,Count matrix,bigrams,0.866866,86.86,it-en


# BIGRAMS - TF-IDF - IT-EN

In [None]:
# TF-IDF of the character bigrams of every word in the IT-EN corpus.
vectorizer = TfidfVectorizer(analyzer='char_wb',ngram_range= (2,2))  # takes in a list and transforms it into a matrix
# .toarray() gives a plain ndarray; .todense() returns np.matrix, which recent
# scikit-learn estimators reject.
tf_idf_matrix = vectorizer.fit_transform(IT_EN_corpus).toarray()

In [None]:
tf_idf_matrix.shape

(58588, 809)

In [None]:
# Splitting the matrix into train and test

tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test = train_test_split(tf_idf_matrix, IT_EN_labels, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# model_trainer_tester returns four values (fitted model, mean CV accuracy,
# test accuracy in percent, timing value); unpacking only three raises ValueError.
NB_tfidf, NB_tfidf_score, NB_tfidf_ok_guess, NB_tfidf_time = model_trainer_tester(GaussianNB(),tfidf_X_train, tfidf_y_train, tfidf_X_test, tfidf_y_test)
RF_tfidf, RF_tfidf_score, RF_tfidf_ok_guess, RF_tfidf_time = model_trainer_tester(RandomForestClassifier(n_estimators=100),tfidf_X_train, tfidf_y_train, tfidf_X_test, tfidf_y_test)
LR_tfidf, LR_tfidf_score, LR_tfidf_ok_guess, LR_tfidf_time = model_trainer_tester(LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr'),tfidf_X_train, tfidf_y_train, tfidf_X_test, tfidf_y_test)
SVM_linear_tfidf, SVM_linear_tfidf_score, SVM_linear_tfidf_ok_guess, SVM_linear_tfidf_time = model_trainer_tester(svm.LinearSVC(),tfidf_X_train, tfidf_y_train, tfidf_X_test, tfidf_y_test)
SVM_sig_tfidf, SVM_sig_tfidf_score, SVM_sig_tfidf_ok_guess, SVM_sig_tfidf_time = model_trainer_tester(SVC(kernel='sigmoid'),tfidf_X_train, tfidf_y_train, tfidf_X_test, tfidf_y_test)


In [None]:
# model_trainer_tester returns four values, so four names are unpacked.
SVM_RBF_tfidf, SVM_RBF_tfidf_score, SVM_RBF_tfidf_ok_guess, SVM_RBF_tfidf_time = model_trainer_tester(SVC(kernel='rbf'),tfidf_X_train, tfidf_y_train, tfidf_X_test, tfidf_y_test) # to run

In [None]:
# One row per classifier trained on the TF-IDF bigram matrix:
# model name, matrix type, n-gram size, mean CV accuracy, test accuracy (%), languages.
el7 = ['Naive Bayesian', 'tf-idf', 'bigrams', NB_tfidf_score, NB_tfidf_ok_guess, 'it-en']
el8 = ['Random Forest', 'tf-idf', 'bigrams', RF_tfidf_score , RF_tfidf_ok_guess, 'it-en']
el9 = ['Logistic Regression','tf-idf', 'bigrams', LR_tfidf_score , LR_tfidf_ok_guess, 'it-en']
el10 = ['SVM_linear', 'tf-idf', 'bigrams', SVM_linear_tfidf_score, SVM_linear_tfidf_ok_guess, 'it-en']
el11 = ['SVM_sig', 'tf-idf', 'bigrams', SVM_sig_tfidf_score, SVM_sig_tfidf_ok_guess, 'it-en']

models_to_compareTF = [el7, el8, el9, el10, el11]

clf_comparison_bigrams_tfidf = pd.DataFrame(models_to_compareTF, columns = ['MODEL', 'MATRIX', 'N-GRAMS', 'ACCURACY', 'TEST', 'LANGUAGES'])

# to_csv is a DataFrame method; models_to_compareTF is a plain list and has no such method.
clf_comparison_bigrams_tfidf.to_csv('performancetfidf.csv', index=False)

## Results table

In [None]:
# accuracy score in comparison
# Each row: model name, matrix type, n-gram size, mean CV accuracy, test accuracy (%).
# The scores are already means (model_trainer_tester returns cross_score.mean()),
# and the test-accuracy names match the ones assigned in the training cells
# (*_clf_test_ok_guess for the count matrix, *_tfidf_ok_guess for TF-IDF).

model1 = ['Naive Bayesian', 'Count matrix', 'bigrams', NB_count_matrix_score , NB_clf_test_ok_guess]
model2 = ['Random Forest', 'Count matrix', 'bigrams', RF_count_matrix_score , RF_clf_test_ok_guess]
model3 = ['Logistic Regression', 'Count matrix', 'bigrams', LR_count_matrix_score , LR_clf_test_ok_guess]
model4 = ['SVM_linear', 'Count matrix', 'bigrams', SVM_linear_count_matrix_score , SVM_linear_clf_test_ok_guess]
model5 = ['SVM_sig', 'Count matrix', 'bigrams', SVM_sig_count_matrix_score , SVM_sig_clf_test_ok_guess]
model6 = ['SVM_RBF', 'Count matrix', 'bigrams', SVM_RBF_count_matrix_score , SVM_RBF_clf_test_ok_guess]
model7 = ['Naive Bayesian', 'tf-idf matrix', 'bigrams', NB_tfidf_score , NB_tfidf_ok_guess]
model8 = ['Random Forest', 'tf-idf matrix', 'bigrams', RF_tfidf_score , RF_tfidf_ok_guess]
model9 = ['Logistic Regression', 'tf-idf', 'bigrams', LR_tfidf_score , LR_tfidf_ok_guess]
model10 = ['SVM_linear', 'tf-idf', 'bigrams', SVM_linear_tfidf_score , SVM_linear_tfidf_ok_guess]
model11 = ['SVM_sig', 'tf-idf', 'bigrams', SVM_sig_tfidf_score , SVM_sig_tfidf_ok_guess]
model12 = ['SVM_RBF', 'tf-idf', 'bigrams', SVM_RBF_tfidf_score , SVM_RBF_tfidf_ok_guess]

models_to_compare = [model1, model2, model3, model4, model5, model6, model7, model8, model9, model10, model11, model12]
#print(models_to_compare)

clf_comparison = pd.DataFrame(models_to_compare, columns = ['MODEL', 'MATRIX', 'N-GRAMS', 'ACCURACY', 'TEST'])

clf_comparison

Unnamed: 0,MODEL,MATRIX,N-GRAMS,ACCURACY,TEST
0,Naive Bayesian,Count matrix,bigrams,0.543397,55.06
1,Random Forest,Count matrix,bigrams,0.847344,84.28
2,Linear Regression,Count matrix,bigrams,0.857841,85.73
3,SVM,Count matrix,bigrams,0.857798,85.65
4,Naive Bayesian,tf-idf matrix,bigrams,0.540324,54.62
5,Random Forest,tf-idf matrix,bigrams,0.844165,83.85
6,Linear Regression,tf-idf,bigrams,0.85863,85.88


# Neologisms list

In [None]:


# Read the neologisms, one per line. The encoding is explicit so accented
# characters are decoded the same way on every platform.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('/content/drive/MyDrive/MACHINE_LEARNING/italian_neologisms_list.txt', encoding='utf-8') as f:
    italian_neologisms_list = [line.rstrip('\n') for line in f]

# italian_neologisms_list

In [None]:
sample_iten_neologisms = random.sample(italian_neologisms_list, 100) # to test the models on a random sample of 100 neologisms

In [None]:
sample_iten_neologisms

## 5.3. Testing the models on the neologisms

Functions 

In [None]:
# makes the results of the tests readable

def results_labels(test, test_words):
  """Pair every tested word with the label predicted for it.

  Args:
    test: iterable of predicted labels.
    test_words: iterable of the words that were classified, in the same order.

  Returns:
    A list of [word, label] pairs (truncated to the shorter input).
  """
  predicted = list(test)

  return [[word, label] for word, label in zip(test_words, predicted)]

Test with count matrix

In [None]:


# NOTE(review): generate_matrix, feat_bi, NB_clf and RF_clf are not defined
# anywhere in the visible notebook — this cell fails on a fresh kernel.
# Presumably they are older names for count_matrix, the IT-EN bigram feature
# list and the fitted Naive Bayes / Random Forest models; confirm.
# NOTE(review): the neologisms are passed as read from the file; the training
# words were lower-cased and wrapped in '^'/'$' by the corpus functions —
# confirm whether the same preprocessing should be applied here.
iten_neologisms_list = get_bigrams_in_words(sample_iten_neologisms)   # turns each neologism into its list of character bigrams
iten_neol_matrix = generate_matrix(feat_bi, iten_neologisms_list)     # builds the matrix: rows = neologisms, columns = bigram features

# Test on Bayesian model - count_matrix

NB_iten_neol =  NB_clf.predict(iten_neol_matrix)

# Test on Random Forest - count_matrix

RF_iten_neol = RF_clf.predict(iten_neol_matrix)   




In [None]:
iten_neol_matrix.shape

(100, 807)

In [None]:
NB_iten_neol

In [None]:
# checking if number of features is the same in training set and new test set

IT_EN_bigrams_matrix.shape[1] == iten_neol_matrix.shape[1]

True

In [None]:
# GENERATING A TF-IDF MATRIX

# TfidfTransformer is used instead of TfidfVectorizer because it re-weights an
# existing count matrix, so the number of feature columns stays the same as in
# that matrix (the vectorizer would derive a different feature set from the list).
# NOTE(review): fit_transform learns the IDF weights from the neologism matrix
# itself, not from the training corpus — confirm this is intended, since the
# classifiers were fitted on differently weighted data.

tfidf_transformer = TfidfTransformer(smooth_idf=False) # the transformer (unlike the vectorizer) takes a count-based matrix as input
iten_neol_tfidf_matrix = tfidf_transformer.fit_transform(iten_neol_matrix)



  idf = np.log(n_samples / df) + 1


In [None]:
# Predict the language of the TF-IDF weighted neologism matrix.
# NOTE(review): the same NB_clf / RF_clf objects as in the count-matrix test
# are used here; whether they were fitted on TF-IDF data is not visible in
# this notebook — confirm.

# Test on Bayes_2_tfidf - version 2

NB_iten_neol_TFIDF = NB_clf.predict(iten_neol_tfidf_matrix.toarray())

# Test on RF_2_tfidf_test - version 2

RF_iten_neol_TFIDF = RF_clf.predict(iten_neol_tfidf_matrix.toarray())

In [None]:
NB_iten_neol_TFIDF

array(['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'it', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'it', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'it', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en'], dtype='<U2')

In [None]:
RF_iten_neol_TFIDF

array(['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en',
       'en'], dtype='<U2')

Results

In [None]:
# COMPARISON BETWEEN COUNT-MATRIX CLASSIFICATION

NEO_model1 = NB_iten_neol # 'Naive Bayesian', 'Count matrix'
NEO_model2 = RF_iten_neol # 'Random Forest', 'Count matrix'
NEO_model3 = NB_iten_neol_TFIDF # 'Naive Bayesian', 'TFIDF'
NEO_model4 = RF_iten_neol_TFIDF # 'Random Forest', 'TFIDF'

# One row per sampled neologism, one column per model's predicted language.
neol_comparison = pd.DataFrame({
    'WORDS': sample_iten_neologisms,
    'NB_bi_CM': NEO_model1,
    'RF_bi_CM': NEO_model2,
    'NB_bi_TFIDF': NEO_model3,
    'RF_bi_TFIDF': NEO_model4,
})

neol_comparison



Unnamed: 0,WORDS,NB_bi_CM,RF_bi_CM,NB_bi_TFIDF,RF_bi_TFIDF
0,filostatunitense,en,it,en,en
1,salamellato,en,it,en,en
2,redeployment,en,en,en,en
3,pezzaculista,it,it,en,en
4,carrirer,en,it,en,en
...,...,...,...,...,...
95,double-dipper,en,en,en,en
96,anti-trivelle,en,it,en,en
97,Webcam-mania,en,en,en,en
98,eurodracma,en,en,en,en
