In [1]:
# Install the necessary libraries
!pip install pandas
!pip install openpyxl



In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the xlsx files: each workbook is loaded into its own DataFrame,
# in the same order as the names on the left-hand side.
text_summary_data, training_data = (
    pd.read_excel(workbook_name)
    for workbook_name in ('text_summary_datasets_v2.xlsx', 'training_data_v2.xlsx')
)

In [3]:
# Displaying the first few rows of the training data
training_data.head()

Unnamed: 0,Index,Category,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767,dim_768
0,0,0,-0.882278,-0.647234,0.050173,-0.448188,-0.175582,0.125284,-0.335781,-0.396106,...,0.56847,-0.326577,0.026089,-0.407658,-0.162295,-0.121949,-0.386429,0.135763,0.516049,0.731324
1,1,0,-0.702665,-0.462591,0.162085,-0.029182,-0.280842,0.047459,0.109864,-0.54021,...,0.24944,-0.422917,0.03382,-0.226271,-0.324386,-0.036914,-0.588373,-0.344278,0.329853,0.195897
2,2,0,-0.327257,-0.397209,0.035037,-0.064671,-0.435734,0.535525,0.134867,-0.213102,...,0.710835,-0.19151,-0.068998,-0.262279,-0.214397,0.095195,-0.503536,0.142249,0.206015,0.182094
3,3,0,-0.793734,-0.470964,-0.278644,-0.292047,-0.565868,0.546791,0.604674,-0.06919,...,0.387789,-0.596448,-0.291108,-0.320205,-0.362207,0.179917,-0.600026,-0.200465,0.776508,-0.155819
4,4,0,-0.27284,-0.556684,0.001737,-0.229906,-0.495732,0.176596,-0.141926,-0.352247,...,0.660122,-0.534026,0.347033,-0.279629,-0.397189,0.226515,-0.547098,0.431136,0.102714,0.422797


In [4]:
# Splitting the dataset into features (x) and target (y):
# the target is the "Category" column; the features are every column
# other than "Index" and "Category".
label_column = "Category"
y = training_data[label_column]
x = training_data.drop(columns=["Index", label_column])

In [21]:
# Splitting the dataset into training and testing sets:
# 30% of the rows are held out for testing (test_size=0.3) and random_state=42
# fixes the shuffle so the same split is produced on every run.
# NOTE(review): the split is not stratified on y — consider whether the class
# proportions should be preserved between the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [6]:
## Random Forest Classifier
# Build a 100-tree forest with a fixed seed, then fit it on the training split.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Predict labels for the held-out features.
rf_y_pred = rf_clf.predict(X_test)

# Score the predictions against the true held-out labels.
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred)
rf_class_report = classification_report(y_test, rf_y_pred)

# Emit the whole report in one call; sep="\n" puts each item on its own line.
print(
    "Random Forest Classifier",
    f"Accuracy: {rf_accuracy}",
    "Confusion Matrix:",
    rf_conf_matrix,
    "Classification Report:",
    rf_class_report,
    sep="\n",
)

Random Forest Classifier
Accuracy: 0.9
Confusion Matrix:
[[11  0  0  0]
 [ 1 17  2  0]
 [ 0  1 14  0]
 [ 0  2  0 12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       0.85      0.85      0.85        20
           2       0.88      0.93      0.90        15
           3       1.00      0.86      0.92        14

    accuracy                           0.90        60
   macro avg       0.91      0.91      0.91        60
weighted avg       0.90      0.90      0.90        60



In [7]:
## Support Vector Machine Classifier
# Linear-kernel SVM, constructed and fitted in a single expression
# (fit returns the estimator itself).
svm_clf = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict labels for the held-out features.
svm_y_pred = svm_clf.predict(X_test)

# Score the predictions against the true held-out labels.
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_conf_matrix = confusion_matrix(y_test, svm_y_pred)
svm_class_report = classification_report(y_test, svm_y_pred)

# Emit the whole report in one call; the leading "\n" in the title leaves a
# blank line before this classifier's section.
print(
    "\nSupport Vector Machine Classifier",
    f"Accuracy: {svm_accuracy}",
    "Confusion Matrix:",
    svm_conf_matrix,
    "Classification Report:",
    svm_class_report,
    sep="\n",
)


Support Vector Machine Classifier
Accuracy: 0.9833333333333333
Confusion Matrix:
[[11  0  0  0]
 [ 1 19  0  0]
 [ 0  0 15  0]
 [ 0  0  0 14]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.95      0.97        20
           2       1.00      1.00      1.00        15
           3       1.00      1.00      1.00        14

    accuracy                           0.98        60
   macro avg       0.98      0.99      0.98        60
weighted avg       0.98      0.98      0.98        60



In [10]:
# import required libs
import pandas as pd
from transformers import BertTokenizer, BertModel
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import torch

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [11]:
# tokenizer and bert
# Both objects are loaded from the same pretrained checkpoint name.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)



In [13]:
# nltk libs
# Download the NLTK resources one after another, in the original order.
for nltk_package in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_package)

# Shared text-normalisation helpers.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# text pre-processing function
def _detokenize(tokens):
    """Join WordPiece tokens back into a plain-text string.

    Tokens prefixed with "##" are continuation pieces and are glued onto the
    preceding word; every other token starts a new word. A "##" piece that has
    no preceding word (e.g. the very first token) is kept as its own word,
    minus the prefix, instead of raising IndexError.
    """
    words = []
    for token in tokens:
        if token.startswith("##"):
            if words:
                words[-1] += token[2:]
            else:
                # No previous word to extend: start one from the bare piece.
                words.append(token[2:])
        else:
            words.append(token)
    joined = " ".join(words)
    # Punctuation comes back as separate tokens; drop the space in front of it
    # when it is followed by whitespace or the end of the text.
    return re.sub(r'\s([?.!,\'-](?:\s|$))', r'\1', joined)


def preprocess_text(text):
    """Normalise raw text before it is embedded.

    Steps: lower-case the text, strip double quotes, tokenize it with the
    module-level ``tokenizer``, lemmatize every token with the module-level
    ``lemmatizer``, rebuild a plain string from the tokens, and finally
    capitalise the first word of each sentence.

    Args:
        text: The input string.

    Returns:
        The cleaned text as a single string.
    """
    # Case standardization; sentence-initial capitals are restored at the end.
    text = text.lower()

    # Punctuation removal: only double quotes are dropped, because the text
    # consists of multiple sentences and the other punctuation is needed.
    text = text.replace('"', '')

    tokens = tokenizer.tokenize(text)

    # Stop words are deliberately kept. Example:
    #   original:           ['the', 'diagnosis', 'of', 'v', '##kh', 'followed', ...]
    #   stop words removed: ['diagnosis', 'v', '##kh', 'followed', ...]
    # Removing them might lead to poorer BERT semantic understanding.

    # Lemmatization is chosen over stemming. Example:
    #   lemmatized: ['the', 'diagnosis', ..., 'criterion', ..., 'international', ...]
    #   stemmed:    ['the', 'diagnosi', ..., 'criteria', ..., 'intern', ...]
    # The stemmed output mangles the words, so only the lemmatizer is applied.
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Change the tokens back into sentences.
    text = _detokenize(lemmatized_tokens)

    # Capitalize the first word of each sentence.
    text = re.sub(r"(^|[.!?]\s+)(\w+)", lambda match: match.group(1) + match.group(2).capitalize(), text)

    return text

In [15]:
# BERT
'''
BERT is chosen to capture the semantic meaning of the text for use in classification and clustering. This is more advanced than keyword counting.
'''
def toBert(text):
    """Embed ``text`` with BERT and return the vector at the first token position.

    The text is encoded with the module-level ``tokenizer`` (truncated and
    padded to 512 tokens) and passed through the module-level ``bert_model``.

    Args:
        text: The string to embed.

    Returns:
        The 1-D tensor ``last_hidden_state[0, 0, :]`` — presumably the [CLS]
        embedding; confirm against the tokenizer. It is computed under
        ``torch.no_grad()``, so it carries no autograd history; calling
        ``.detach()`` on it afterwards is still valid.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    # Inference only: do not record the forward pass for backpropagation.
    with torch.no_grad():
        outputs = bert_model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    return outputs.last_hidden_state[0, 0, :]

In [12]:
# Upload Real Case Dataset


In [87]:
new_paragraph = "Seven out of 12 patients no longer needed daily insulin shots after receiving a full dose of the gene therapy, dubbed VX-800, researchers reported Friday at the American Diabetes Association annual meeting in Orlando, FL. Another two needed about 70% less insulin daily to keep their blood sugar stable, results show. This positive data adds to the growing body of evidence for VX-880 potential to revolutionize the treatment of type 1 diabetes, said researcher Dr. Piotr Witkowski, director of the pancreatic and islet transplant program at the University of Chicago. People with type 1 diabetes arent able to produce enough insulin to keep blood sugar levels stable. Type 1 diabetes occurs when the immune system mistakenly targets and attacks the islet cells in the pancreas that generate insulin."

In [48]:
new_paragraph = "The film is brief and its AI origins are only really obvious when it is paused. Otherwise, you might think it was simply the victim of an overly enthusiastic editor with access to some powerful visual effects software and actors who don't know how to perform in front of a green screen. Overall, it manages to mostly avoid the uncanny valley except for when the young founder smiles, then it a little too much like watching The Polar Express. Still, when considering it was produced with the alpha version of Sora and with relatively limited time and resources, you can see why some are very excited about Sora. Through Sora, we were able to tell this incredible story with remarkable speed and efficiency, Native Foreign Chief Creative Officer and the film's director Nik Kleverov said in a statement.  Toys R Us is the perfect brand to embrace this AI-forward strategy, and we are thrilled to collaborate with their creative team to help lead the next wave of innovative storytelling."

In [100]:
# Sample paragraph about blockchains (leading "Y" restored: the text began with "ou might").
new_paragraph = "You might be familiar with spreadsheets or databases. A blockchain is somewhat similar because it is a database where information is entered and stored. But the key difference between a traditional database or spreadsheet and a blockchain is how the data is structured and accessed. A blockchain consists of programs called scripts that conduct the tasks you usually would in a database: Entering and accessing information and saving and storing it somewhere. A blockchain is distributed, which means multiple copies are saved on many machines, and they must all match for it to be valid."

In [113]:
new_paragraph = "Programmable networking protocols, such as OpenFlow, direct traffic among network devices in an SDN network. The Open Networking Foundation (ONF) helped to standardize the OpenFlow protocol and other open source SDN technologies. By combining these components, organizations get a simpler, centralized way to manage networks. SDN strips away the routing and packet forwarding functions, known as the control plane, from the data plane or underlying infrastructure. SDN then implements controllers, considered the brain of the SDN network, and layers them above the network hardware in the cloud or on-premises. This lets teams use policy-based management—a kind of automation—to manage network control directly. SDN controllers tell switches where to send packets. In some cases, virtual switches embedded in software or hardware replace the physical switches. This consolidates their functions into a single, intelligent switch that can check data packets and their virtual machine destinations to ensure there are no issues before moving packets along."

In [114]:
text = preprocess_text(new_paragraph)

In [115]:
outputs = toBert(text)

In [116]:
outputs.shape

torch.Size([768])

In [117]:
# Detach the embedding tensor and convert it to a NumPy array.
# NOTE(review): this rebinds `outputs` from a tensor to an ndarray, so re-running
# this cell on its own result will fail — confirm cells are always run in order.
outputs = outputs.detach().numpy()

In [118]:
# Turn the single embedding into one row: (768,) -> (1, 768).
# reshape(1, -1) gives the same result as expand_dims(axis=0) for a 1-D vector,
# but re-running this cell keeps the shape at (1, 768) instead of adding a new
# leading axis each time.
outputs = np.reshape(outputs, (1, -1))

In [119]:
outputs.shape

(1, 768)

In [120]:
# Column labels dim_1 ... dim_768 (the upper bound of range is exclusive),
# matching the feature column names displayed for the training data.
columns = [f'dim_{i}' for i in range(1, 769)]

In [121]:
df = pd.DataFrame(outputs, columns=columns)

In [122]:
df.shape

(1, 768)

In [123]:
type(df)

pandas.core.frame.DataFrame

In [124]:
# Making predictions: classify the new paragraph's embedding with the trained SVM.
# NOTE(review): this rebinds svm_y_pred, the name the SVM evaluation cell used
# for the hold-out predictions.
svm_y_pred = svm_clf.predict(df)

In [125]:
svm_y_pred

array([2], dtype=int64)

In [38]:
# Making predictions on the hold-out features (same call as in the SVM evaluation cell).
# NOTE(review): when the cells run top to bottom this rebinds svm_y_pred and
# replaces the single-paragraph prediction — confirm this cell is still needed.
svm_y_pred = svm_clf.predict(X_test)



In [41]:
X_test

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767,dim_768
95,-0.371783,-0.08501,-0.433551,-0.211489,-0.408652,-0.202125,-0.008467,-0.245964,0.55935,-0.530172,...,-0.107318,-0.165534,-0.584371,-0.265153,-0.173002,0.218897,-0.270811,-0.35751,0.390698,0.213535
15,-0.885291,-0.909994,0.433861,-0.002347,-0.395468,0.216916,-0.446348,-0.790324,-0.121649,-0.620416,...,0.638731,-0.707157,0.157536,0.198276,-0.420429,0.145729,-0.321997,0.001677,0.006103,0.649307
30,-0.530586,-0.119905,-0.451796,-0.104974,-0.374829,-0.344402,0.482314,-0.382636,0.021224,-0.300759,...,0.538399,0.167141,-0.266599,0.080349,0.046358,0.08023,-0.494154,-0.125537,-0.043936,0.432533
158,-0.329005,-0.14742,0.434372,0.019946,-0.105901,-0.29489,-0.16002,-0.167452,-0.32815,-1.117589,...,0.041788,-0.142579,0.396165,-0.450022,-0.325618,0.589626,0.021691,-0.774367,0.119074,0.461055
128,-0.239914,-0.09828,0.084705,0.337961,-0.399834,-0.465882,0.228993,0.127405,-0.79295,-0.948451,...,0.354497,-0.480279,0.154974,-0.21174,-0.377912,0.381833,0.074501,-0.509713,0.030265,0.639959
115,-0.576521,-0.189101,0.202255,-0.000682,-0.078973,-0.169888,0.302044,0.129218,-0.528236,-0.870564,...,0.37391,-0.143583,0.396282,-0.110395,0.179224,-0.176811,-0.392302,-0.510358,0.204336,-0.141987
69,-0.321925,-0.36346,-0.358947,0.098452,-0.53383,0.247866,0.045391,0.03754,-0.16307,-0.411524,...,0.209306,-0.704053,0.220041,-0.211642,0.259537,0.229737,-0.399729,0.249843,-0.011724,0.631177
170,-0.336527,0.025666,0.104494,-0.04485,-0.48551,-0.216635,0.424055,0.138582,-0.270887,-0.844192,...,0.233932,-0.089502,0.204238,-0.287667,0.082469,0.133605,-0.204386,-0.191398,0.020545,-0.00923
174,-0.183418,-0.278766,-0.088772,-0.057157,-0.578566,0.220462,0.247467,0.096719,0.064276,-0.434197,...,0.368385,0.264564,-0.088604,-0.116821,-0.367711,0.261399,-0.282846,0.018534,0.330923,0.097792
45,0.076175,-0.680333,0.340795,-0.245307,0.007886,0.420016,0.127318,0.208885,0.240229,-0.44663,...,0.199572,-0.825432,0.030043,-0.281108,-0.182499,0.06628,-0.322691,0.026601,0.62938,0.248792
