In [34]:
from transformers import pipeline 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import math
import pandas as pd
from collections import Counter


In [35]:
model = pipeline("text-generation", model= "gpt2")

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [36]:
sentence = "i want to get the full marks"
num_documents = 3
generated_documents = []

for _ in range(num_documents):
    result = model(sentence, do_sample=True, top_k=50, temperature=0.9, max_length=50)
    generated_text = result[0]['generated_text']
    generated_documents.append(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [37]:
generated_documents

["i want to get the full marks and to be able to run and go when the ball goes with the ball in the air but this year we didn't have the balls at the right time and we had to find something we could throw the ball against",
 'i want to get the full marks."\n\nThe first time a man in his late 20s and early 30s was arrested following an altercation in L\'Enfant Plaza at a protest outside the Paris office of Vogue, a global brand',
 'i want to get the full marks of his team for the tournament, the winner will be announced at the top of the final page. I like to give a full list of all the players that are available in the game (I have a nice gallery']

In [38]:
# Print the generated documents
for i, document in enumerate(generated_documents):
    print(f"Generated Document {i+1}:")
    print(document)

Generated Document 1:
i want to get the full marks and to be able to run and go when the ball goes with the ball in the air but this year we didn't have the balls at the right time and we had to find something we could throw the ball against
Generated Document 2:
i want to get the full marks."

The first time a man in his late 20s and early 30s was arrested following an altercation in L'Enfant Plaza at a protest outside the Paris office of Vogue, a global brand
Generated Document 3:
i want to get the full marks of his team for the tournament, the winner will be announced at the top of the final page. I like to give a full list of all the players that are available in the game (I have a nice gallery


In [39]:
# Function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters and extra whitespaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word,pos ='v') for word in tokens]
    return tokens


In [40]:
preprocessed_text = []
for doc in generated_documents:
    document = preprocess_text(doc)
    preprocessed_text.append(document)

In [41]:
preprocessed_text

[['want',
  'get',
  'full',
  'mark',
  'able',
  'run',
  'go',
  'ball',
  'go',
  'ball',
  'air',
  'year',
  'ball',
  'right',
  'time',
  'find',
  'something',
  'could',
  'throw',
  'ball'],
 ['want',
  'get',
  'full',
  'mark',
  'first',
  'time',
  'man',
  'late',
  'early',
  'arrest',
  'follow',
  'altercation',
  'l',
  'enfant',
  'plaza',
  'protest',
  'outside',
  'paris',
  'office',
  'vogue',
  'global',
  'brand'],
 ['want',
  'get',
  'full',
  'mark',
  'team',
  'tournament',
  'winner',
  'announce',
  'top',
  'final',
  'page',
  'like',
  'give',
  'full',
  'list',
  'players',
  'available',
  'game',
  'nice',
  'gallery']]

In [42]:
# delete tokens with length less than 3
last_list= []

for list_ in preprocessed_text:
    l = []
    for token in list_: 
        if len(token)>=3:
            l.append (token)
    last_list.append(l)

print(last_list)

[['want', 'get', 'full', 'mark', 'able', 'run', 'ball', 'ball', 'air', 'year', 'ball', 'right', 'time', 'find', 'something', 'could', 'throw', 'ball'], ['want', 'get', 'full', 'mark', 'first', 'time', 'man', 'late', 'early', 'arrest', 'follow', 'altercation', 'enfant', 'plaza', 'protest', 'outside', 'paris', 'office', 'vogue', 'global', 'brand'], ['want', 'get', 'full', 'mark', 'team', 'tournament', 'winner', 'announce', 'top', 'final', 'page', 'like', 'give', 'full', 'list', 'players', 'available', 'game', 'nice', 'gallery']]


In [43]:
# Function to get unique words
def get_unique_words(tokens):
    return set(tokens)


In [44]:
unique_words = []
for doc in last_list:
    document = get_unique_words(doc)
    unique_words.append(list(document))

In [45]:
unique_words

[['could',
  'able',
  'ball',
  'year',
  'mark',
  'air',
  'run',
  'time',
  'something',
  'throw',
  'right',
  'full',
  'find',
  'want',
  'get'],
 ['man',
  'late',
  'brand',
  'mark',
  'protest',
  'first',
  'altercation',
  'get',
  'global',
  'paris',
  'arrest',
  'enfant',
  'outside',
  'follow',
  'full',
  'plaza',
  'office',
  'early',
  'time',
  'vogue',
  'want'],
 ['like',
  'page',
  'announce',
  'give',
  'players',
  'tournament',
  'game',
  'final',
  'mark',
  'top',
  'available',
  'get',
  'team',
  'full',
  'want',
  'list',
  'nice',
  'winner',
  'gallery']]

In [46]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
tfidf_matrix = vectorizer.fit_transform(last_list)
builtin_results_dataframe = pd.DataFrame(
    tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()
)


In [47]:
print(builtin_results_dataframe)

       able       air  altercation  announce    arrest  available      ball  \
0  0.192544  0.192544     0.000000  0.000000  0.000000   0.000000  0.770175   
1  0.000000  0.000000     0.235875  0.000000  0.235875   0.000000  0.000000   
2  0.000000  0.000000     0.000000  0.239444  0.000000   0.239444  0.000000   

      brand     could     early  ...  something      team     throw      time  \
0  0.000000  0.192544  0.000000  ...   0.192544  0.000000  0.192544  0.146435   
1  0.235875  0.000000  0.235875  ...   0.000000  0.000000  0.000000  0.179389   
2  0.000000  0.000000  0.000000  ...   0.000000  0.239444  0.000000  0.000000   

        top  tournament     vogue      want    winner      year  
0  0.000000    0.000000  0.000000  0.113720  0.000000  0.192544  
1  0.000000    0.000000  0.235875  0.139311  0.000000  0.000000  
2  0.239444    0.239444  0.000000  0.141420  0.239444  0.000000  

[3 rows x 46 columns]


In [48]:
# num of times term t in document / total num of terms in document
def calculate_tf(document_words):
    tf_document = {}
    total_terms = len(document_words)
    for term in set(document_words):
        tf_document[term] = document_words.count(term) / total_terms
    return tf_document


In [49]:
print(calculate_tf(last_list[0]))

{'could': 0.05555555555555555, 'able': 0.05555555555555555, 'ball': 0.2222222222222222, 'year': 0.05555555555555555, 'mark': 0.05555555555555555, 'air': 0.05555555555555555, 'run': 0.05555555555555555, 'time': 0.05555555555555555, 'something': 0.05555555555555555, 'throw': 0.05555555555555555, 'right': 0.05555555555555555, 'full': 0.05555555555555555, 'find': 0.05555555555555555, 'want': 0.05555555555555555, 'get': 0.05555555555555555}


In [50]:
def calculate_idf(corpus, term):
    total_documents = len(corpus)
    document_containing_term = sum([1 for document in corpus if term in document])
    if document_containing_term > 0:
        return math.log10(total_documents / (document_containing_term+1)) + 1
    else:
        return 0  

In [51]:
calculate_idf(last_list,"player")

0