# Session 06 

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn import metrics
# import tensorflow as tf
# import matplotlib.pyplot as plt
# from sklearn.metrics import r2_score
# from tensorflow.python.keras.layers.core import Activation

# skip_plot =5  ### Plot strides
# import tensorflow as tf
# import matplotlib.pyplot as plt


## Transformer /huggingface *library*

In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
!pip install transformers >/dev/null

### transformers Pipeline 
In the Transformers package, the pipeline is a wrapper class which preprocess input, predicts and post process output for other pipelines like Named Entity Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction, Question Answering, etc.

<code>  pipeline(
                    'task_name',
                    model ='model_name',
                    tokenizer 
                )
</code>
<br>Some of aviailable models

    feature-extraction (get the vector representation of a text)
    fill-mask
    ner (named entity recognition)
    question-answering
    sentiment-analysis
    summarization
    text-generation
    translation
    zero-shot-classification

In [2]:
from transformers import pipeline

### Sentiment Analysis

In [3]:
classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [4]:
results = classifier("We are were happy wiht course content")
print(results)
#######################################
# exercise 1 Change above text to get  #
# a normal response                   #
#######################################


[{'label': 'POSITIVE', 'score': 0.9996949434280396}]


In [5]:
sentiment_output = results[0]['label']
sentiment_score = results[0]['score']
print(f'Sentiment is: {sentiment_output} and its score: {sentiment_score}')

Sentiment is: POSITIVE and its score: 0.9996949434280396


### Question-answer NLP example

In [6]:
question_answer = pipeline('question-answering')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [13]:
q_a = question_answer({
    'question': 'Where is pindi ?',
    'context':'''Rawalpindi (pronounced [raːwəlˈpɪndi] (About this soundlisten) pɪndi/;[3] Punjabi / Urdu: راولپنڈی, romanized: 
    Rāwalpinḍī ), colloquially known as Pindi (Punjabi: پݨڈی, romanized: Piṇḍī ), is the capital city of Rawalpindi Division located in the Punjab province of Pakistan. 
    Rawalpindi is the fourth-largest city proper in Pakistan after Karachi, 
    Lahore and Faisalabad respectively while the larger Islamabad-Rawalpindi 
    metropolitan area is the country's third largest metropolitan area. 
    Rawalpindi is adjacent to Pakistan's capital of Islamabad, and the two are jointly 
    known as the "twin cities" on account of strong social and economic links between the cities.'''
})
print(q_a)

#######################################
# exercise 1 Change above text to get  #
# answer for your question            #
#######################################

{'score': 0.7103636860847473, 'start': 252, 'end': 279, 'answer': 'Punjab province of Pakistan'}


In [14]:
print('The answer is', q_a['answer'])

The answer is Punjab province of Pakistan


### Text Summarization

In [None]:
summary_ext = pipeline("summarization", model="facebook/bart-large-cnn")

In [None]:
text = """This is a text summary test. We are going to see in this course if the text 
can be summarize efficiently. This section of IST course is about the NLP (natural language processing). In this course of AI which brings together 
computer science and statistics to harness that predictive power. It’s a must-have skill for all aspiring data analysts and data scientists, or anyone else who wants to wrestle all that raw data into refined trends and predictions."""

result = summary_ext(text)
print(result)

### Fill in the blank document processing

In [None]:
mask_complete = pipeline('fill-mask',model='bert-base-uncased')

In [None]:
mask_complete("Aoa, i like to develop [MASK] model.")
#######################################
# exercise 1 try multiple Mask        #
#######################################

# Tips and Advance concepts 👇



## Custom Auto text completion.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np 

%load_ext tensorboard
import tensorflow as tf
import datetime, os
!mkdir logs

In [None]:
tokenizer = Tokenizer()
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \
    -O /tmp/sonnets.txt
data = open('/tmp/sonnets.txt').read()

corpus = data.lower().split("\n")


tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# create input sequences using list of tokens
input_sequences = []
for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		input_sequences.append(n_gram_sequence)


# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

label = ku.to_categorical(label, num_classes=total_words)

In [None]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words/2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


In [None]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
callbacks = [
            tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
]
model.fit(predictors, label, epochs=25, verbose=2, callbacks=callbacks)

In [None]:
%tensorboard --logdir logs

In [None]:
def auto_complete(seed_text, next_words):
	for _ in range(next_words):
		token_list = tokenizer.texts_to_sequences([seed_text])[0]
		token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
		predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
		output_word = ""
		for word, index in tokenizer.word_index.items():
			if index == predicted:
				output_word = word
				break
		seed_text += " " + output_word
	return seed_text

print(auto_complete("towrad the end of era", 5))

In [None]:
print(auto_complete("i will be back", 10))