# Dependency Parsing

## Load the spaCy Model

In [None]:
import spacy

# spaCy's small English pipeline; the parsing cells below run their text through it
nlp = spacy.load(name="en_core_web_sm")

## Parse a Sentence

In [None]:
# Example sentence used throughout the dependency-parsing section
sentence = "The quick brown fox jumps over the lazy dog."

# Run the loaded spaCy pipeline over the sentence to obtain a parsed Doc
doc = nlp(sentence)

# One line per token: the token text, its dependency label, and the text of its head
for tok in doc:
    print(" -> ".join((tok.text, tok.dep_, tok.head.text)))

## Visualize the Dependency Tree

In [None]:
from spacy import displacy

# Draw the parsed sentence as a dependency diagram directly in the notebook output
displacy.render(
    doc,
    style="dep",
    jupyter=True,
)

## Dependency Labels Explained

To understand what the dependency labels represent, you can refer to spaCy's official documentation. Common labels include:

- `nsubj` (nominal subject)
- `dobj` (direct object)
- `prep` (prepositional modifier)
- `pobj` (object of preposition)

In [None]:
# Detailed per-token report: dependency label, head word, part of speech and direct children
for tok in doc:
    child_texts = [kid.text for kid in tok.children]
    report_lines = (
        f"Word: {tok.text}",
        f"  Dependency Label: {tok.dep_}",
        f"  Head Word: {tok.head.text}",
        f"  POS: {tok.pos_}",
        f"  Children: {child_texts}",
    )
    print("\n".join(report_lines))

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Build an undirected graph of the parse with one node per token.
# Nodes are keyed by token index rather than by text, so a word that occurs
# more than once in the sentence is not merged into a single node; the text
# is kept separately and used only as the drawn label.
graph = nx.Graph()
node_labels = {}

for token in doc:
    graph.add_node(token.i)
    node_labels[token.i] = token.text
    # spaCy makes the ROOT token its own head. Skip that case so the root does
    # not receive a self-loop (which would also inflate its degree and size).
    if token.head.i != token.i:
        graph.add_edge(token.i, token.head.i)

# Scale each node by its number of connections; the floor of 1 keeps an
# isolated node (e.g. a one-token sentence) visible instead of size 0.
node_sizes = [max(graph.degree(node), 1) * 500 for node in graph.nodes()]

# Draw the graph
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
# A fixed seed makes the layout reproducible across runs.
# Other layouts such as nx.circular_layout can be tried here as well.
pos = nx.spring_layout(graph, seed=42)
nx.draw(
    graph,
    pos,
    labels=node_labels,
    with_labels=True,
    node_size=node_sizes,
    font_size=10,
    node_color='skyblue',
    edge_color='gray',
)

plt.title("Dependency Graph")
plt.show()

## Creating a Keyword Finder Application

In [None]:
# Prompt the user for a sentence; the bare expression below echoes it as the cell output
text_input = input("Enter your sentence: ")
text_input

# Token Classification

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the WNUT-17 dataset; its train/validation/test splits are used in the cells below
wnut = load_dataset(path="wnut_17")

In [None]:
# Bare expression: the notebook displays the repr of the loaded dataset object
wnut

In [None]:
import pandas as pd

# `wnut` is already loaded by the earlier load_dataset cell, so it is reused
# here instead of importing and loading the dataset a second time.

# Convert train dataset to pandas DataFrame
train_df = pd.DataFrame(wnut['train'])

# Convert validation dataset to pandas DataFrame
validation_df = pd.DataFrame(wnut['validation'])

# Convert test dataset to pandas DataFrame
test_df = pd.DataFrame(wnut['test'])

In [None]:
# Peek at the first training example: its tokens first, then its tag ids
for column in ("tokens", "ner_tags"):
    print(train_df[column].values[0])

In [None]:
# ToDo: How many null values do we have?

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the tokenizer; the same value sizes the embedding layer later on
vocab_size = 5000
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
)

# Build the word index from the training tokens only
tokenizer.fit_on_texts(train_df["tokens"].values)

In [None]:
# pad_sequences is not imported in any of the visible cells above; without this
# import the cell raises NameError on a fresh kernel.
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_sequence_length = 64

# Training split: map tokens to integer ids, then pad/truncate at the end so
# every row has exactly max_sequence_length entries. The tag sequences get the
# same padding so inputs and labels stay aligned position by position.
# NOTE(review): padded label positions are filled with 0, which presumably is
# the "O" (outside) tag id in wnut_17 -- confirm against the dataset features.
X_train = tokenizer.texts_to_sequences(train_df["tokens"].values)
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding="post", truncating="post")
y_train = pad_sequences(train_df["ner_tags"].values, maxlen=max_sequence_length, padding="post", truncating="post")

# Validation split: same processing, reusing the tokenizer fitted on the training data
X_val = tokenizer.texts_to_sequences(validation_df["tokens"].values)
X_val = pad_sequences(X_val, maxlen=max_sequence_length, padding="post", truncating="post")
y_val = pad_sequences(validation_df["ner_tags"].values, maxlen=max_sequence_length, padding="post", truncating="post")

# Show the resulting array shapes as the cell output
X_train.shape, y_train.shape, X_val.shape, y_val.shape

## Convolution function

<img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/Convolution_of_box_signal_with_itself2.gif">


<img src="https://upload.wikimedia.org/wikipedia/commons/b/b9/Convolution_of_spiky_function_with_box2.gif">

## 2D Conv. Function

<img src="https://www.researchgate.net/profile/Ihab-S-Mohamed/publication/324165524/figure/fig3/AS:611103423860736@1522709818959/An-example-of-convolution-operation-in-2D-2.png">

## 1D Conv. Function

<img src="https://i.sstatic.net/WNIXd.png">

In [None]:
import tensorflow as tf

embedding_dim = 64
# The sparse categorical loss uses tag ids directly as class indices, so the
# output layer needs (largest id + 1) units. Counting distinct ids would give
# too few units if some id never occurs in the training split.
num_classes = int(max(tag for row in train_df['ner_tags'] for tag in row)) + 1

# Build the model
model = tf.keras.Sequential([
    # An explicit Input fixes the sequence length so model.summary() can report
    # shapes; it replaces Embedding's `input_length` argument, which is
    # deprecated in Keras 3.
    tf.keras.Input(shape=(max_sequence_length,)),
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding"),
    # padding="same" keeps one output step per input token
    tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation="relu", padding="same", name="conv1d"),
    tf.keras.layers.Dropout(0.2, name="dropout"),
    # Dense is applied to the last axis, i.e. one softmax over the tags per token position
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

In [None]:
# Train for five epochs in batches of 32, passing the validation arrays for per-epoch evaluation
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
)

In [None]:
# ToDo: Create a simple prediction on the test set

In [None]:
# ToDo: Measure the accuracy on the test set