In [2]:
!pip install spacy
!python -m spacy download en_core_web_md
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from transformers import pipeline
import numpy as np

# Load pre-trained Spacy model for NLP processing
nlp = spacy.load('en_core_web_md')

# Preprocessing function (tokenization, lemmatization, removing stopwords)
def preprocess(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Sample job description and resumes
job_description = preprocess("Looking for a software engineer with experience in Python, Django, and SQL.")
resumes = [
    preprocess("Software engineer skilled in Python, Django, SQL, and web development."),
    preprocess("Python developer with experience in Flask, machine learning, and data analysis."),
    preprocess("Experienced backend developer with knowledge of Ruby on Rails, JavaScript, and MySQL.")
]

# Convert text to numerical vectors using TF-IDF
vectorizer = TfidfVectorizer()
all_texts = [job_description] + resumes  # Combine JD and resumes
tfidf_matrix = vectorizer.fit_transform(all_texts)  # Vectorization

# Calculate similarity between JD and each resume
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

# Print similarity scores (for initial ranking based on similarity)
print("Similarity Scores:", similarity_scores)

# Creating labels (assumed relevance scores for training purposes)
# In a real-world scenario, these labels would come from domain experts or historical data
y = [0.9, 0.7, 0.4]  # Example scores for resumes

# Train a RandomForest model to predict relevance scores based on vectorized input
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix[1:].toarray(), y, test_size=0.2)

# Train RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Predict relevance scores for test data (for unseen resumes)
predicted_scores = model.predict(X_test)
print("Predicted Relevance Scores:", predicted_scores)

# AIOps: System Performance Monitoring with Anomaly Detection
# Simulate system response times (e.g., time taken to process resumes)
response_times = [0.12, 0.15, 0.2, 2.0, 0.14, 0.18]  # Simulated response times in seconds

# Detect anomalies using Isolation Forest
anomaly_detector = IsolationForest(contamination=0.1)
anomalies = anomaly_detector.fit_predict(np.array(response_times).reshape(-1, 1))

# Check for anomalies and print alert if detected
if -1 in anomalies:
    print("Anomaly detected in system performance!")

# (Optional) LLMOps: Use transformer models to provide feedback on resumes
qa_pipeline = pipeline("question-answering")

# Example feedback generation using a transformer model (based on similarity with JD)
context = "The resume contains experience in Python, Django, and SQL."
question = "What is missing from the resume based on the job description?"
result = qa_pipeline(question=question, context=context)

# Output feedback from the LLM
print

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Similarity Scores: [0.57474379 0.1548623  0.07409533]
Predicted Relevance Scores: [0.778]


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Anomaly detected in system performance!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


<function print>

In [3]:
!pip install nltk tensorflow numpy




In [4]:
# @title Building a Text Generation Model with Python


import nltk
import numpy as np
import re
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Download necessary NLTK resources
nltk.download('gutenberg')
nltk.download('punkt')


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load data from Project Gutenberg (e.g., Shakespeare's Hamlet)
raw_text = gutenberg.raw('shakespeare-hamlet.txt')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    return tokens

tokens = preprocess_text(raw_text)


In [6]:
# Prepare sequences
sequence_length = 50
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
sequences = []

for i in range(sequence_length, len(tokens)):
    seq = tokens[i-sequence_length:i]
    sequences.append(seq)

# Convert sequences to integer values
sequences = tokenizer.texts_to_sequences(sequences)
sequences = np.array(sequences)

# Separate features and labels
X, y = sequences[:, :-1], sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)


In [7]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=sequence_length-1))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary of the model
model.summary()




In [9]:
# Train the model
history = model.fit(X, y, epochs=50, batch_size=256, verbose=1)


Epoch 1/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 46ms/step - accuracy: 0.1735 - loss: 4.2473
Epoch 2/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 50ms/step - accuracy: 0.1784 - loss: 4.1776
Epoch 3/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 44ms/step - accuracy: 0.1901 - loss: 4.1156
Epoch 4/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.1989 - loss: 4.0569
Epoch 5/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.2094 - loss: 3.9856
Epoch 6/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.2138 - loss: 3.9290
Epoch 7/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 35ms/step - accuracy: 0.2284 - loss: 3.8664
Epoch 8/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.2340 - loss: 3.8106
Epoch 9/50
[1m116/116[0m [32m

In [10]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

def generate_text(model, seed_text, max_length):
    for _ in range(max_length):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_length-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        next_index = np.argmax(predicted)
        next_word = reverse_word_map[next_index]
        seed_text += " " + next_word
        if next_word == '.':
            break
    return seed_text

# Generate text based on a seed prompt
print(generate_text(model, "To be or not to be", 50))


To be or not to be too heauy in my memory commendable to this the plays the queene embracing him start comedie historie pastorall pastoricallcomicallhistoricallpastorall tragicallhistoricall tragicallcomicallhistoricallpastorall scene indiuidible and norman haue not borne her power breake seeme to dye to sleepe to sleepe to sleepe to sleepe to sleepe to sleepe to sleepe to sleepe


In [11]:
model.save('text_generation_model.h5')




In [12]:
import math

def calculate_perplexity(model, X, y):
    predictions = model.predict(X)
    # Take the actual values (the target words) from y
    perplexity = 0
    total_samples = len(y)
    for i, target in enumerate(y):
        predicted_prob = predictions[i][np.argmax(target)]
        perplexity += -math.log(predicted_prob)
    perplexity = math.exp(perplexity / total_samples)
    return perplexity

# Calculate perplexity on the training data
perplexity = calculate_perplexity(model, X, y)
print(f"Model Perplexity: {perplexity:.2f}")


[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step
Model Perplexity: 6.80


In [13]:
# Example prompts to test the model
test_prompts = [
    "Once upon a time",
    "In a galaxy far far away",
    "The quick brown fox",
    "Artificial Intelligence is",
    "To be or not to be"
]

# Generate text for each test prompt
for prompt in test_prompts:
    generated_text = generate_text(model, prompt, 50)
    print(f"\nPrompt: '{prompt}'")
    print(f"Generated Text: '{generated_text}'")



Prompt: 'Once upon a time'
Generated Text: 'Once upon a time of my father vowes ham is my causes right to the vnsatisfied hor twere the cup an imperiall ioyntresse of his horson life and sweete religion makes a rapsidie of life that we haue seene that see thou wilt i haue the very coynage of the king king i haue'

Prompt: 'In a galaxy far far away'
Generated Text: 'In a galaxy far far away of my seruants for to speake for my father brands the harlot sits in the shoulder of your saile exeunt to vs ophe i am not spleenatiue and rash i saw thee quietly enurnd hath opd his ponderous and marble iawes to cast thee into a puft and recklesse libertine'

Prompt: 'The quick brown fox'
Generated Text: 'The quick brown fox of all makes vs at their assignes and most obseruant ayre got cleare of mine owne death oh gertrude be seene for the top of my compasse and in the cup an vnion shal vse his foyle and target the louer shall not sigh gratis the humorous ayre shall tell'

Prompt: 'Artificial Intel