# Examen Final de Procesamiento Inteligente de Texto

In [None]:
# !pip install tensorflow-text
# !pip install PyMuPDF
# Reference: https://www.tensorflow.org/tutorials/load_data/text

# Predict the programming language for a Stack Overflow question

In [None]:
import collections
import pathlib
import re
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow_datasets as tfds
import tensorflow_text as tf_text

### Load dataset

In [None]:
# Fetch and unpack the Stack Overflow archive, using 'stack_overflow' as cache_dir.
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
dataset = utils.get_file(
    fname='stack_overflow_16k.tar.gz',
    origin=data_url,
    untar=True,
    cache_dir='stack_overflow',
    cache_subdir='')

# The training data sits in a 'train' folder next to the path returned by get_file.
dataset_dir = pathlib.Path(dataset).parent
train_dir = dataset_dir.joinpath('train')
list(train_dir.iterdir())

[PosixPath('/tmp/.keras/train/csharp'),
 PosixPath('/tmp/.keras/train/javascript'),
 PosixPath('/tmp/.keras/train/python'),
 PosixPath('/tmp/.keras/train/java')]

In [None]:
batch_size = 32
seed = 42

# Training and validation are both read from `train_dir`; the two calls share
# validation_split and seed and differ only in `subset`.
raw_train_ds = preprocessing.text_dataset_from_directory(
    train_dir,
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size)

raw_val_ds = preprocessing.text_dataset_from_directory(
    train_dir,
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size)

# The test data is read from its own 'test' folder, with no split applied.
test_dir = dataset_dir.joinpath('test')
raw_test_ds = preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


### Explore the dataset

In [None]:
# Show which integer label stands for which class name.
for class_index, class_name in enumerate(raw_train_ds.class_names):
  print("Label", class_index, "corresponds to", class_name)

# Peek at the first five examples of a single training batch.
for text_batch, label_batch in raw_train_ds.take(1):
  # Convert each tensor once instead of on every access.
  texts = text_batch.numpy()
  labels = label_batch.numpy()
  for row in range(5):
    # Only the first 100 bytes of each question are shown.
    print("Question: ", texts[row][:100], '...')
    print("Label:", labels[row])

Label 0 corresponds to csharp
Label 1 corresponds to java
Label 2 corresponds to javascript
Label 3 corresponds to python
Question:  b'"my tester is going to the wrong constructor i am new to programming so if i ask a question that can' ...
Label: 1
Question:  b'"blank code slow skin detection this code changes the color space to lab and using a threshold finds' ...
Label: 3
Question:  b'"option and validation in blank i want to add a new option on my system where i want to add two text' ...
Label: 1
Question:  b'"exception: dynamic sql generation for the updatecommand is not supported against a selectcommand th' ...
Label: 0
Question:  b'"parameter with question mark and super in blank, i\'ve come across a method that is formatted like t' ...
Label: 1


### Prepare the dataset for training


In [None]:
# Drop the labels: the vectorization layers are adapted on the text alone.
train_text = raw_train_ds.map(lambda question, _label: question)

# Take one batch of questions and labels and keep its first example for demos.
text_batch, label_batch = next(iter(raw_train_ds))
first_question = text_batch[0]
first_label = label_batch[0]
print("Question", first_question)
print("Label", first_label)

Question tf.Tensor(b'"blank8 why is my solution faster than the neat solution? (hackerrank chocolate feast) edit: simplified my solution..edit: removed opinion based secondary question...background: atarted learning blank a week or two ago using hackerranks problems as exercises and stackoverflow search + google as my teacher, i\'ve had some limited experience learning other languages...i did the exercise my own ""noobish learner way"" which i can\'t help but feel is a ""botched job"" when i see ""neat &amp; short"" solutions...however, when submitting both solutions one after another a couple of times i found the ""neat"" solution was quite a bit slower. ..i vaguely remember something about % operations being costly, is mine faster because of no % operations or is there more to it than just that?..exercise: https://www.hackerrank.com/challenges/chocolate-feast..neat solution from discussion:..import blank.io.*;.import blank.util.*;..public class solution {.    static int cc; .    publ

Binary model

In [None]:
VOCAB_SIZE = 10000

# Vectorizer in 'binary' output mode, limited to VOCAB_SIZE tokens.
binary_vectorize_layer = TextVectorization(
    output_mode='binary',
    max_tokens=VOCAB_SIZE)

def binary_vectorize_text(text, label):
  """Run `text` through `binary_vectorize_layer`; `label` is passed through unchanged.

  A trailing axis is added to `text` before the layer is called.
  Returns a `(vectorized_text, label)` tuple.
  """
  expanded = tf.expand_dims(text, -1)
  vectorized = binary_vectorize_layer(expanded)
  return vectorized, label

# Build the vocabulary from the label-free training text.
binary_vectorize_layer.adapt(train_text)

print("'binary' vectorized question:",  binary_vectorize_text(first_question, first_label)[0])

# Vectorize the three raw splits, in train / validation / test order.
binary_train_ds, binary_val_ds, binary_test_ds = (
    split.map(binary_vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds))

'binary' vectorized question: tf.Tensor([[1. 1. 1. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)


Integer model

In [None]:
MAX_SEQUENCE_LENGTH = 250

# Vectorizer in 'int' output mode: limited to VOCAB_SIZE tokens, with
# output_sequence_length set to MAX_SEQUENCE_LENGTH.
int_vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    max_tokens=VOCAB_SIZE)

def int_vectorize_text(text, label):
  """Run `text` through `int_vectorize_layer`; `label` is passed through unchanged.

  A trailing axis is added to `text` before the layer is called.
  Returns a `(token_ids, label)` tuple.
  """
  expanded = tf.expand_dims(text, -1)
  token_ids = int_vectorize_layer(expanded)
  return token_ids, label

# Build the vocabulary from the label-free training text.
int_vectorize_layer.adapt(train_text)

print("'int' vectorized question:", int_vectorize_text(first_question, first_label)[0])
# Look up two token ids in the learned vocabulary.
print("1289 ---> ", int_vectorize_layer.get_vocabulary()[1289])
print("313 ---> ", int_vectorize_layer.get_vocabulary()[313])
print("Vocabulary size: {}".format(len(int_vectorize_layer.get_vocabulary())))

# Vectorize the three raw splits, in train / validation / test order.
int_train_ds, int_val_ds, int_test_ds = (
    split.map(int_vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds))

'int' vectorized question: tf.Tensor(
[[   1  111    6   23  299 1787  198    2 3623  299 7826    1    1  805
  2568   23    1 1218 3892  364 4145    1    1  661   16    5  981   45
   121 1881   47    1  742   36 9987    8 1982  322  662   36   23 2362
   195  543   83 2693 2609  661  144    1  411    2 1371   23  657    1
  6520   84   66    3  166  104   26 1182    6    5    1 1639   44    3
   189 3623  519 1135    1   44 5468  280 1272   71  156  157    5 1759
     9  331    3  227    2 3623  299  115  810    5  547 3833    3    1
  2371  146  202 1195  289    1    6 2179 1787  193    9  136 1195   45
     6   67  181    4   11  198  106    1    1  299   31    1 2216 1924
    29  299   53   28 2081   22   53   42  170  154  256    7   15  453
    28    1  237 2698    1  134 2698  131 2698  323 2698    1    1    1
     1    1    1   23    1 2216 1924   29  299   22   53   42  170  154
   256 1360   15  453   28  237    1  237   68    9  196  807   28 7167
     1  237    1 1364    1

#### Configure the dataset for performance

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_dataset(dataset):
  """Return `dataset` with `cache()` applied first, then `prefetch()` with an AUTOTUNE buffer."""
  cached = dataset.cache()
  prefetched = cached.prefetch(buffer_size=AUTOTUNE)
  return prefetched

Binary model

In [None]:
# Apply cache + prefetch to the binary-vectorized splits (train, validation, test).
binary_train_ds, binary_val_ds, binary_test_ds = map(
    configure_dataset,
    (binary_train_ds, binary_val_ds, binary_test_ds))

Integer model

In [None]:
# Apply cache + prefetch to the int-vectorized splits (train, validation, test).
int_train_ds, int_val_ds, int_test_ds = map(
    configure_dataset,
    (int_train_ds, int_val_ds, int_test_ds))

### Create and compile the model

Binary model

In [None]:
# Linear classifier: a single Dense layer with 4 output units and no activation.
binary_model = tf.keras.Sequential()
binary_model.add(layers.Dense(4))

# The Dense layer has no activation, so the loss is told to expect logits.
binary_model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss=losses.SparseCategoricalCrossentropy(from_logits=True))

Integer model

In [None]:
def create_model(vocab_size, num_labels):
  """Build an uncompiled Sequential model: Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense.

  Args:
    vocab_size: Input dimension of the Embedding layer.
    num_labels: Number of units in the final Dense layer (no activation).

  Returns:
    The assembled `tf.keras.Sequential` model.
  """
  model = tf.keras.Sequential()
  model.add(layers.Embedding(vocab_size, 64, mask_zero=True))
  model.add(layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(num_labels))
  return model

# vocab_size is VOCAB_SIZE + 1 since 0 is used additionally for padding.
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=4)

# The final Dense layer has no activation, so the loss is told to expect logits.
int_model.compile(
    optimizer='adam',
    metrics=['accuracy'],
    loss=losses.SparseCategoricalCrossentropy(from_logits=True))

### Train the model

Binary model

In [None]:
# Keep the linear model's training history under its own name: the generic
# name `history` is reassigned when the integer model is fitted, which would
# otherwise discard these per-epoch metrics.
binary_history = binary_model.fit(
    binary_train_ds, validation_data=binary_val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Integer model

In [None]:
# Train the ConvNet on the int-vectorized data for 5 epochs, validating after each one.
history = int_model.fit(
    x=int_train_ds,
    epochs=5,
    validation_data=int_val_ds)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Export the model

##### Compare the two models

In [None]:
# `Model.summary()` prints the table itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
print("Linear model on binary vectorized data:")
binary_model.summary()

print("ConvNet model on int vectorized data:")
int_model.summary()

Linear model on binary vectorized data:
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 4)                 40004     
Total params: 40,004
Trainable params: 40,004
Non-trainable params: 0
_________________________________________________________________
None
ConvNet model on int vectorized data:
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          640064    
_________________________________________________________________
conv1d (Conv1D)              (None, None, 64)          20544     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense_1 (Dens

In [None]:
# Score the linear model on its vectorized test split.
binary_loss, binary_accuracy = binary_model.evaluate(x=binary_test_ds)
print("Binary model accuracy: {:2.2%}".format(binary_accuracy))

# Score the ConvNet on its vectorized test split.
int_loss, int_accuracy = int_model.evaluate(x=int_test_ds)
print("Int model accuracy: {:2.2%}".format(int_accuracy))

Binary model accuracy: 81.41%
Int model accuracy: 80.48%


##### Export the best model

In [None]:
# Bundle the vectorization layer with the trained linear model so the exported
# model accepts raw strings; the final activation turns the logits into scores.
export_model = tf.keras.Sequential(
    [binary_vectorize_layer, binary_model,
     layers.Activation('sigmoid')])

# The model now ends in an activation, so the loss must not expect logits.
export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'])

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
# Report the accuracy measured on the exported model itself, not the value
# computed earlier for `binary_model`.
print("Accuracy: {:2.2%}".format(accuracy))

Accuracy: 81.41%


### Test model with new data

In [None]:
def get_string_labels(predicted_scores_batch):
  """Translate a batch of per-class scores into class-name strings.

  The index of the highest score along axis 1 is used to look up the matching
  entry of `raw_train_ds.class_names`.

  Returns:
    A tensor holding one class name per row of `predicted_scores_batch`.
  """
  class_names = raw_train_ds.class_names
  best_class_ids = tf.argmax(predicted_scores_batch, axis=1)
  return tf.gather(class_names, best_class_ids)

# Hand-written sample questions; the comment above each entry names the
# language the question is about.
# Note: inside these double-quoted literals a doubled quote ("") closes one
# literal and opens the next, so the adjacent pieces are concatenated and no
# quote character ends up in the resulting text.
inputs = [
  # Expected: python
  "how do I extract keys from a dict into a list?",
  # Expected: java
  "debug public static void main(string[] args) {...}",
  # Expected: python
  "how to call various strings without using %s? text1=""blank"".text2=""with me"".print(""study %(language)s"" %{'language':text1})....this works. but i am wondering whether it is using dictionary to call string? ...print(""study %(language)s %(with whom)"" %({'language':text1},{'with whom':text2}))...but it doesn't work. how can i fix it?...the error says 'format requires a mapping'",
  # Expected: javascript
  "what's the point in blank window.onload? i never quite understood this what's the point in using window.onload like this if the blank load's anything not in a function as soon as the webpage loads?....window.onload = function() {.  alert(""hello"");.};",
  # Expected: java
  "3 decimal precision in blank     float x = 4;.    float answer = 4/16;...the answer for this is 0.25, but i want to display the answer upto 3 decimal places, like 0.250...how to achieve that? please help?",
  # Expected: csharp
  "the type or namespace name 'name' could not be found the type or namespace name 'question'could not be found (are you missing a using directive or an assembly reference?)..i've spent the morning reading threads with the same error, but haven't found anything that seems to apply. the references seem to be set up correctly. i tried adding ""using testproject.trivia.web"", but then i just get ""the type or namespace name 'web' does not exist in the namespace 'testproject.trivia' (are youmissing an assembly reference?)...in one project, i have this cs file:..using system.runtime.serialization;.using system.xml.serialization;..namespace testproject.trivia.{.[datacontract].public class course.{.    [xmlattribute, datamember].    public string name { get; set; }..    [xmlattribute, datamember].    public int id { get; set; }..    [xmlelement, datamember].    public question [] questions { get; set; }.}...}..it has a reference to class testproject.trivia.web, which has a javascript file with the definition of a question (which i'm referencing in the rest of the solution with no trouble)...any ideas will be appreciated..."
]

# Score the sample questions with the exported model and map scores to class names.
predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)

# The loop variable is named `question` rather than `input` so the builtin
# `input()` is not shadowed for the rest of the kernel session.
for question, label in zip(inputs, predicted_labels):
  print("Question: ", question)
  print("Predicted label: ", label.numpy(), '\n')

Question:  how do I extract keys from a dict into a list?
Predicted label:  b'python' 

Question:  debug public static void main(string[] args) {...}
Predicted label:  b'java' 

Question:  how to call various strings without using %s? text1=blank.text2=with me.print(study %(language)s %{'language':text1})....this works. but i am wondering whether it is using dictionary to call string? ...print(study %(language)s %(with whom) %({'language':text1},{'with whom':text2}))...but it doesn't work. how can i fix it?...the error says 'format requires a mapping'
Predicted label:  b'python' 

Question:  what's the point in blank window.onload? i never quite understood this what's the point in using window.onload like this if the blank load's anything not in a function as soon as the webpage loads?....window.onload = function() {.  alert(hello);.};
Predicted label:  b'javascript' 

Question:  3 decimal precision in blank     float x = 4;.    float answer = 4/16;...the answer for this is 0.25, but i

# Evaluate PDFs with exported model

In [None]:
import fitz  # this is pymupdf

def get_paragraphs(text, separator=' ', keep_trailing=False):
  """Split `text` into paragraphs delimited by separator lines.

  `text` is split on newlines. Consecutive lines are concatenated (nothing is
  inserted between them) until a line exactly equal to `separator` is found,
  which closes the current paragraph.

  Args:
    text: The full text to split.
    separator: Exact content of a line that ends a paragraph. Defaults to a
      single space, the value this function has always used.
    keep_trailing: When True, non-empty text found after the last separator
      line is returned as a final paragraph. Defaults to False, which keeps
      the previous behaviour of discarding that text.

  Returns:
    A list of paragraph strings in document order. Two consecutive separator
    lines yield an empty-string paragraph.
  """
  paragraphs = []
  pieces = []  # lines of the paragraph currently being built
  for line in text.split('\n'):
    if line == separator:
      paragraphs.append(''.join(pieces))
      pieces = []
    else:
      pieces.append(line)
  if keep_trailing:
    trailing = ''.join(pieces)
    if trailing:
      paragraphs.append(trailing)
  return paragraphs

In [None]:
pdf = "/content/drive/MyDrive/Sem VII/PIT Procesamiento inteligente de texto/Examen final/Programming languages.pdf"

# Concatenate the extracted text of every page, in page order.
# `Page.get_text()` is the current PyMuPDF method name; the camelCase
# `getText()` alias is deprecated and missing from recent releases.
with fitz.open(pdf) as doc:
  text = "".join(page.get_text() for page in doc)

In [None]:
# Split the text extracted from the PDF into paragraphs; each paragraph is
# passed to the model as one question in the next cell.
questions = get_paragraphs(text)

In [None]:
# Score every paragraph taken from the PDF and map scores to class names.
predicted_scores = export_model.predict(questions)
predicted_labels = get_string_labels(predicted_scores)

# The last paragraph and its prediction are left out of the printout.
# NOTE(review): the reason for skipping it is not visible here — confirm.
# The loop variable is named `question` rather than `input` so the builtin
# `input()` is not shadowed for the rest of the kernel session.
for question, label in zip(questions[:-1], predicted_labels[:-1]):
  print("Question: ", question)
  print("Predicted label: ", label.numpy(), '\n')

Question:  "null value appear in datagridview in after new column added my datagridview have one name column. i am trying to add one image column with that datagridview. once i add the image column it made that name column as null. please refer the below code:..public void call(string usr).{.    user = usr;.    querystring = ""select name 'name' from adk_my_prodsyssecgroup"";. try.    {.        specdataadapter = new sqldataadapter(querystring, con);.        specds = new dataset();.        con.open();.        specdataadapter.fill(specds, ""report_table"");. dgvdisp.datasource = specds;.        dgvdisp.datamember = ""report_table"";. dgvdisp.columns[dgvdisp.columncount - 1].autosizemode = datagridviewautosizecolumnmode.fill;.        con.close();..        if (user == ""secgrp"").        {. datagridviewimagecolumn set = new datagridviewimagecolumn();.            set.name = ""set"";. set.headertext = ""settings"";.            dgvdisp.columns.insert(1, set);.            for (int rows = 0; ro