# **Use custom pretrained embeddings with sklearn**





# Import libraries

In [None]:
!pip install -U spacy
!pip install -U gensim
!pip install scikit-multilearn
#pip install neattext



In [None]:
# Importing the necessary libraries

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
pio.renderers.default = 'colab'
import joblib
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import random
from datetime import datetime
from pathlib import Path
from scipy.sparse import hstack
from collections import Counter, OrderedDict

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,classification_report
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
import skmultilearn
# learning Curves
from sklearn.model_selection import learning_curve

# draws a confusion matrix
from sklearn.metrics import plot_confusion_matrix 

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token

In [None]:
# Mount Google Drive into the Colab filesystem; every path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder for this homework's artifacts (the cleaned-text pickles are written to and read from here).
data_folder = Path('/content/drive/MyDrive/NLP/Homework21')

In [None]:
# Folder in Google Drive that holds the dataset (the CSV is read from here further down).
basepath = '/content/drive/MyDrive/Data'

In [None]:
# Parent folder of the saved-model directory.
# NOTE(review): unlike every other Drive path in this notebook, this one has no
# 'MyDrive' segment — confirm that '/content/drive/NLP' is really intended.
lecture_folder = Path('/content/drive/NLP')

In [None]:
# Sub-folder of lecture_folder reserved for saved models.
save_model_folder = lecture_folder /'saved_model'

In [None]:
# Drive folder containing the downloaded and extracted spaCy model archive.
spacy_folder = Path('/content/drive/MyDrive/Data/spacy/')

In [None]:
#url = 'https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz'
#!wget {url} -P {spacy_folder} 

In [None]:
#import tarfile
#file = spacy_folder / 'en_core_web_lg-3.2.0.tar.gz'
#with  tarfile.open(file, 'r') as tar:
#  tar.extractall(path = spacy_folder)

In [None]:
# Path of the extracted en_core_web_lg 3.2.0 package inside the Drive spaCy folder.
# SpacyPreprocessor (defined below) also reads this global when it loads its own pipeline.
model = spacy_folder /'en_core_web_lg-3.2.0'/'en_core_web_lg'/'en_core_web_lg-3.2.0'
# Each component to disable must be its own list entry. The previous single string
# 'ner, parser' matched no component name, so neither pipe was actually disabled.
nlp = spacy.load(model, disable=['ner', 'parser'])

In [None]:
# Print the installed spaCy and gensim versions so the run can be reproduced
# (the recorded output below shows the versions this notebook was executed with).
print(f'spacy: {spacy.__version__}, gensim {gensim.__version__}')

spacy: 3.2.0, gensim 4.1.2


# Functions/Classes

## Learning Curves

Function for learning curves: The function below has been taken from sklearn official documentation: https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw two panels for an estimator: its learning curve (training and
    cross-validation score against training-set size) and its fit time
    against training-set size.

    All of the work is delegated to ``sklearn.model_selection.learning_curve``
    (called with ``return_times=True`` and ``random_state=123``); this
    function only aggregates the per-fold results and plots mean +/- one
    standard deviation.

    Parameters
    ----------
    estimator : estimator instance
        Forwarded unchanged to ``learning_curve``.

    title : str
        Title of the learning-curve panel.

    X : array-like
        Training inputs, forwarded unchanged to ``learning_curve``.

    y : array-like
        Targets, forwarded unchanged to ``learning_curve``.

    axes : array-like of shape (2,), default=None
        Axes to draw on: ``axes[0]`` receives the learning curve and
        ``axes[1]`` the fit-time curve. When None, a new 1x2 figure of size
        (10, 5) is created.

    ylim : tuple of shape (2,), default=None
        ``(ymin, ymax)`` applied to the learning-curve panel when given.

    cv : int, cross-validation generator or an iterable, default=None
        Cross-validation strategy, forwarded unchanged to ``learning_curve``.

    n_jobs : int or None, default=None
        Parallelism setting, forwarded unchanged to ``learning_curve``.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute training-set sizes, forwarded unchanged to
        ``learning_curve`` (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can chain
        ``.show()`` / ``.savefig()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 2, figsize=(10, 5))

    # Static decoration of the score panel.
    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    curve = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                           train_sizes=train_sizes,
                           return_times=True,
                           random_state=123)
    # The fifth element (score times) is not plotted.
    train_sizes, train_scores, test_scores, fit_times, _ = curve

    def _mean_and_std(per_fold):
        # Aggregate across folds: one value per training-set size.
        return np.mean(per_fold, axis=1), np.std(per_fold, axis=1)

    train_mean, train_std = _mean_and_std(train_scores)
    test_mean, test_std = _mean_and_std(test_scores)
    time_mean, time_std = _mean_and_std(fit_times)

    # (mean, std, colour, legend label) for each score series.
    score_series = (
        (train_mean, train_std, "r", "Training score"),
        (test_mean, test_std, "g", "Cross-validation score"),
    )

    # Learning-curve panel: shaded +/- 1 std bands first, then the mean lines.
    score_ax.grid()
    for mean, std, colour, _ in score_series:
        score_ax.fill_between(train_sizes, mean - std, mean + std,
                              alpha=0.1, color=colour)
    for mean, _, colour, label in score_series:
        score_ax.plot(train_sizes, mean, 'o-', color=colour, label=label)
    score_ax.legend(loc="best")

    # Scalability panel: fit time against number of training samples.
    time_ax = axes[1]
    time_ax.grid()
    time_ax.plot(train_sizes, time_mean, 'o-')
    time_ax.fill_between(train_sizes, time_mean - time_std,
                         time_mean + time_std, alpha=0.1)
    time_ax.set_xlabel("Training examples")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    return plt

## PreProcessor

In [None]:
class SpacyPreprocessor(BaseEstimator, TransformerMixin):
    """
    scikit-learn transformer that cleans raw text with spaCy.

    For every input string it replaces newlines / carriage returns with
    spaces, tokenizes with spaCy (treating a leading '@' as its own prefix
    token), optionally drops stop words, punctuation and number-like tokens,
    optionally lemmatizes and lower-cases, and returns the surviving tokens
    joined by single spaces.

    Parameters
    ----------
    lammetize : bool, default=True
        Join token lemmas when True, otherwise the original token text.
        (The spelling is kept as-is because it is part of the public interface.)
    lower : bool, default=True
        Lower-case the joined text.
    remove_stop : bool, default=True
        Drop tokens spaCy flags as stop words.
    remove_punct : bool, default=True
        Drop tokens spaCy flags as punctuation.
    remove_num : bool, default=False
        Drop tokens spaCy flags as number-like.
    spacy_model : str, path-like or None, default=None
        Model name or path handed to ``spacy.load``. When None, the
        notebook-level global ``model`` is used, as before.
    """

    # Runs once, when the class body is evaluated: seeds NumPy's global RNG.
    np.random.seed(0)

    def __init__(self, lammetize=True, lower=True, remove_stop=True, remove_punct=True,
                 remove_num=False, spacy_model=None):
        self.remove_stop = remove_stop
        self.remove_punct = remove_punct
        self.remove_num = remove_num
        self.lammetize = lammetize
        self.lower = lower
        self.spacy_model = spacy_model

    # helper function for basic cleaning
    def basic_clean(self, text):
        """Return a list with every newline / carriage return in each string of ``text`` replaced by a space."""
        return [re.sub(r'[\n\r]', ' ', sentence) for sentence in text]

    def _build_nlp(self):
        """Load the spaCy pipeline (parser and NER disabled) and make '@' a tokenizer prefix."""
        # Fall back to the notebook-level `model` global when no model was passed in.
        source = model if self.spacy_model is None else self.spacy_model
        nlp = spacy.load(source, disable=['parser', 'ner'])

        ## Add @ as a prefix so that we can separate the word from its token
        ## Since we are using pretrained vectors - @ mentions will be different in the pre-trained vocab
        prefixes = list(nlp.Defaults.prefixes) + ['@']
        nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(prefixes).search
        return nlp

    def _build_matcher(self, vocab):
        """Build a Matcher whose single-token patterns flag the token kinds selected for removal."""
        matcher = Matcher(vocab)
        if self.remove_stop:
            matcher.add("stop_words", [[{"is_stop": True}]])
        if self.remove_punct:
            matcher.add("punctuation", [[{"is_punct": True}]])
        if self.remove_num:
            matcher.add("numbers", [[{"like_num": True}]])
        return matcher

    # helper function for pre-processing with spacy
    def spacy_preprocessor(self, texts):
        """
        Run spaCy over ``texts`` and return one cleaned string per input.

        The pipeline is loaded afresh on every call.
        """
        nlp = self._build_nlp()
        matcher = self._build_matcher(nlp.vocab)
        # Per-token flag marking tokens to drop; force=True lets the extension be re-registered on repeated calls.
        Token.set_extension('is_remove', default=False, force=True)

        cleaned_text = []
        # parser/ner are already disabled at load time, so they need not be disabled again here.
        for doc in nlp.pipe(texts, batch_size=64):
            for _, start, end in matcher(doc):
                for token in doc[start:end]:
                    token._.is_remove = True

            kept = [token for token in doc if not token._.is_remove]
            if self.lammetize:
                text = ' '.join(token.lemma_ for token in kept)
            else:
                text = ' '.join(token.text for token in kept)
            if self.lower:
                text = text.lower()
            cleaned_text.append(text)
        return cleaned_text

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Clean every string in ``X`` and return the cleaned strings as a list."""
        x_clean = self.basic_clean(X)
        x_clean_final = self.spacy_preprocessor(x_clean)
        return x_clean_final

## Gensim vectorizer

In [None]:
class GensimVectorizer(BaseEstimator, TransformerMixin):
    """
    scikit-learn transformer that turns each whitespace-tokenized sentence
    into the mean of its tokens' pretrained word vectors.

    Parameters
    ----------
    pretrained_vectors : object
        Must expose ``vector_size`` and ``get_vector(token, norm=True)``
        (the notebook passes gensim keyed vectors).
    unk_norm_init : bool, default=False
        When True, every token without a stored vector contributes a freshly
        drawn ``np.random.normal`` vector at transform time; when False such
        tokens are ignored.

    Attributes set by ``fit``
    -------------------------
    pretrained_vectors_subset : dict
        token -> vector, for training tokens that have a pretrained vector.
    words_not_in_pretrained : list
        Training tokens for which the lookup raised ``KeyError``.
    count_missing : int
        ``len(words_not_in_pretrained)``.
    percent_missing : float
        ``count_missing`` divided by the number of distinct training tokens.
        Despite the name this is a fraction in [0, 1]; it is 0 when there
        are no tokens.
    """

    # Runs once, when the class body is evaluated: seeds NumPy's global RNG.
    np.random.seed(0)

    def __init__(self, pretrained_vectors, unk_norm_init=False):
        # load in pre-trained word vectors
        self.pretrained_vectors = pretrained_vectors
        self.vec_size = self.pretrained_vectors.vector_size
        self.unk_norm_init = unk_norm_init
        self.pretrained_vectors_subset = {}
        self.words_not_in_pretrained = []
        self.count_missing = 0
        self.percent_missing = 0

    def fit(self, X, y=None):
        '''
        Gets the subset of pretrained vectors which are present in vocab
        X :  training sentences

        All fitted state is rebuilt from scratch, so fitting the same
        instance again does not accumulate results from earlier fits.
        '''
        counter = Counter()
        for sent in X:
            counter.update(sent.split())

        subset = {}
        missing = []
        for token in counter:
            try:
                subset[token] = self.pretrained_vectors.get_vector(token, norm=True)
            except KeyError:
                # Only "token has no vector" counts as missing; any other
                # error (e.g. an incompatible vectors object) should surface.
                missing.append(token)

        ### save so that you can access this after you fit the vectorizer
        self.pretrained_vectors_subset = subset
        self.words_not_in_pretrained = missing
        self.count_missing = len(missing)
        # Guard against an empty vocabulary (no sentences, or only empty ones).
        self.percent_missing = self.count_missing / len(counter) if counter else 0
        return self

    def transform(self, X, y=None):
        """
        Return an array of shape (len(X), vec_size) with one averaged vector per sentence.

        Rows for sentences with no contributing token stay all-zero.
        """
        X_vector = np.zeros((len(X), self.vec_size))

        for i, sent in enumerate(X):
            sent_vector = np.zeros(self.vec_size)
            n = 0
            for word in sent.split():
                word_vector = self.pretrained_vectors_subset.get(word)
                if word_vector is None:
                    if not self.unk_norm_init:
                        continue
                    # Unknown token: a new random vector for every occurrence.
                    word_vector = np.random.normal(size=self.vec_size)
                sent_vector += word_vector
                n += 1
            if n > 0:
                X_vector[i] = sent_vector / n
        return X_vector

# **Multilabel classification: Stack Exchange dataset**

## Train/Test/Valid Dataset Importing cleaned data

In [None]:
# Wrap the base data path in a pathlib.Path so file names can be appended with `/`.
# (On Colab this is a PosixPath, as the recorded output shows.)
folder = Path(basepath)
# Bare expression: shows the resulting path as the cell output.
folder

PosixPath('/content/drive/MyDrive/Data')

In [None]:
#file_csv = folder / 'multilabel_hw.csv'

In [None]:
# Read the dataset and discard the two leftover index columns and the Id column
# in a single chained expression.
df = (
    pd.read_csv(folder / 'multilabel_hw.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Id'])
)


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Number of leading rows kept for the experiments below.
N_ROWS = 1000

# Tag_Number is read from the CSV as text — presumably a stringified list such as
# "[1, 3]" (TODO confirm against the file). Feeding those strings straight to
# MultiLabelBinarizer binarizes individual characters, so brackets, commas and
# spaces became "labels" that then had to be dropped by column position.
# Extracting the integer tokens first yields exactly the label columns and does
# not depend on which punctuation characters happen to occur.
tag_lists = df['Tag_Number'].astype(str).str.findall(r'\d+')

mlb = MultiLabelBinarizer()
# One 0/1 indicator column per label, named by the label's string form ('0', '1', ...).
df1 = pd.DataFrame(mlb.fit_transform(tag_lists), columns=mlb.classes_, index=df.index)
df_final = pd.concat([df, df1], axis=1)
df_final = df_final[:N_ROWS]

In [None]:
# Features: the 'Body' column of each row, as a NumPy array.
X = df_final['Body'].values

# Targets: the ten indicator columns '0'..'9', as a 2-D array with one column per label.
y = df_final[['0','1','2','3','4','5','6','7','8','9']].values
#y = y.reshape(1,-1)   #'0','1','2','3','4','5','6','7','8','9'

# Create train/test split


In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=42)

In [None]:
# Text cleaner with its default options (see SpacyPreprocessor.__init__ for the defaults).
preprocessor = SpacyPreprocessor()

In [None]:
# Clean the training text. SpacyPreprocessor.fit learns nothing, so only the transform step does work here.
X_train_cleaned= preprocessor.fit_transform(X_train)

In [None]:
# Clean the test text with the same preprocessor.
X_test_cleaned = preprocessor.transform(X_test)

In [None]:
# The preprocessor returns Python lists of strings; convert them to NumPy arrays.
X_train_cleaned = np.array(X_train_cleaned)
X_test_cleaned = np.array(X_test_cleaned)

In [None]:
# Pickle file for the cleaned training text.
file_X_train_cleaned_data = data_folder/ 'X_train_multiclass_clean_task4_1.pkl'

In [None]:
# Pickle file for the cleaned test text.
file_X_test_cleaned_data = data_folder/'X_test_multiclass_clean_task4_1.pkl'

In [None]:
# Persist the cleaned text so the spaCy cleaning step does not have to be repeated.
joblib.dump(X_train_cleaned, file_X_train_cleaned_data) 
joblib.dump(X_test_cleaned, file_X_test_cleaned_data) 

['/content/drive/MyDrive/NLP/Homework21/X_test_multiclass_clean_task4_1.pkl']

In [None]:
# Location of the cleaned train and test pickles.
# These are the same two paths as the ones used for joblib.dump above.
file_X_train_cleaned_data = data_folder /'X_train_multiclass_clean_task4_1.pkl'
file_X_test_cleaned_data = data_folder /'X_test_multiclass_clean_task4_1.pkl'

In [None]:
# Reload the cleaned text from disk.
# joblib.load unpickles the file, so only load files you produced yourself.
X_train_cleaned = joblib.load(file_X_train_cleaned_data)
X_test_cleaned = joblib.load(file_X_test_cleaned_data)

# Classification Pipeline

In [None]:
# Load the custom pretrained embeddings saved earlier with gensim
# (the recorded GridSearchCV repr below shows a FastTextKeyedVectors object).
pretrained_vectors = KeyedVectors.load('/content/drive/MyDrive/NLP/Homework21/model_df.bin')

In [None]:
# Two-step sklearn pipeline: averaged pretrained word vectors feed a
# one-vs-rest style (BinaryRelevance) logistic regression, one model per label.
# The pipeline is only constructed here; fitting happens inside the grid search.
pipeline = Pipeline(steps=[
    ('vectorizer', GensimVectorizer(pretrained_vectors)),
    ('classifier', BinaryRelevance(LogisticRegression(max_iter=1000))),
])

In [None]:
#vectorizer = GensimVectorizer(pretrained_vectors)

In [None]:
#Xfeature = vectorizer.fit_transform(X_train_cleaned)

In [None]:
#type(Xfeature[0])

In [None]:
#model = ClassifierChain(MultinomialNB())

In [None]:
#from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
#Xfeature = scaler.fit_transform(Xfeature)
#X_test = scaler.fit_transform(X_test)

In [None]:
#model.fit(Xfeature,y_train)

## Hyperparameter Tuning Round 1

In [None]:
# Parameter grid for round 1. The nested prefix reaches the LogisticRegression
# wrapped by BinaryRelevance inside the pipeline's 'classifier' step.
param_grid_1 = {
    'classifier__classifier__C': [100000],
    # 'classifier__classifier__max_iter': [100],
}

# 5-fold cross-validated grid search over the pipeline, using all cores and
# keeping the training scores alongside the validation scores.
grid_logreg_1 = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_1,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
#X_train_cleaned

In [None]:
#pipeline.fit(X_train_cleaned,y_train)

In [None]:
# Run the grid search on the cleaned training text.
# The recorded output below shows lbfgs convergence warnings for this configuration.
grid_logreg_1.fit(X_train_cleaned,y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer',
                                        GensimVectorizer(pretrained_vectors=<gensim.models.fasttext.FastTextKeyedVectors object at 0x7fdf26d4d090>)),
                                       ('classifier',
                                        BinaryRelevance(classifier=LogisticRegression(max_iter=1000),
                                                        require_dense=[True,
                                                                       True]))]),
             n_jobs=-1, param_grid={'classifier__classifier__C': [100000]},
             return_train_score=True)

In [None]:
# Show the best parameter combination found by GridSearchCV.
print(grid_logreg_1.best_params_)

{'classifier__classifier__C': 100000}


In [None]:
# The pipeline has no cleaning step and was fitted on X_train_cleaned, so the
# learning curve must be computed on the cleaned text as well, not on raw X_train.
# n_jobs=1: presumably the RAM exhaustion seen here came from every parallel
# worker receiving its own copy of the estimator, which embeds the pretrained
# vectors — TODO confirm; raise n_jobs again if memory allows.
plot_learning_curve(grid_logreg_1.best_estimator_, 'Learning Curves logreg',
                    X_train_cleaned, y_train, n_jobs=1);

Note: the runtime runs out of RAM and crashes when the learning-curve cell above is executed.

In [None]:
# Score of the grid search's refit best estimator on the training data.
print(grid_logreg_1.score(X_train_cleaned,y_train))

# Mean cross-validated score of the best parameter setting.
print(grid_logreg_1.best_score_)

In [None]:
# Score of the refit best estimator on the held-out test data.
print(grid_logreg_1.score(X_test_cleaned,y_test))

In [None]:
# plot_confusion_matrix builds on sklearn's confusion_matrix, which only accepts
# binary/multiclass targets and raises for the multilabel indicator matrix used
# here. Use multilabel_confusion_matrix (one 2x2 matrix per label) and draw one
# panel per label instead.
from sklearn.metrics import ConfusionMatrixDisplay, multilabel_confusion_matrix

y_test_pred = grid_logreg_1.best_estimator_.predict(X_test_cleaned)
# The multilabel classifier may return a scipy sparse matrix; densify it for the metric.
if hasattr(y_test_pred, 'toarray'):
    y_test_pred = y_test_pred.toarray()

per_label_cm = multilabel_confusion_matrix(y_test, y_test_pred)

n_labels = len(per_label_cm)
n_cols = max(1, min(5, n_labels))
n_rows = max(1, -(-n_labels // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(3 * n_cols, 3 * n_rows), squeeze=False)

for label_idx, ax in enumerate(axes.ravel()):
    if label_idx >= n_labels:
        ax.axis('off')  # unused panel in the grid
        continue
    cm = per_label_cm[label_idx].astype(float)
    # Row-normalize (same meaning as normalize='true'); rows with no samples stay at zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals > 0)
    ConfusionMatrixDisplay(cm_normalized, display_labels=[0, 1]).plot(
        ax=ax, cmap=plt.cm.Blues, colorbar=False)
    ax.set_title(f'label {label_idx}')
    ax.grid(False)

fig.tight_layout()
plt.show()