In [1]:
import pandas as pd
import json

def load_data(path):
    with open(path, 'r') as file:
        data = file.read()

    data = data.strip().split('\n')

    dataset = []
    for i in data:
        try:
            dataset.append(json.loads(i))
        except:
            print(i)

    df = pd.DataFrame(dataset)

    return df

TRAIN = load_data('data/train/train.json')
TRAIN


Unnamed: 0,topic,question,excerpt
0,electronics,What is the effective differencial effective o...,"I'm trying to work out, in general terms, the ..."
1,electronics,Heat sensor with fan cooling,Can I know which component senses heat or acts...
2,electronics,Outlet Installation--more wires than my new ou...,I am replacing a wall outlet with a Cooper Wir...
3,electronics,Buck Converter Operation Question,"i have been reading about the buck converter, ..."
4,electronics,"Urgent help in area of ASIC design, verificati...",I need help with deciding on a Master's Projec...
...,...,...,...
20214,wordpress,How to set a Custom Post Type as the parent of...,I have a Custom Post Type called Recipe with p...
20215,wordpress,Tracking last login and last visit,I'm using the code below to track when a user ...
20216,wordpress,How to exclude the particular category from th...,"add_action( 'pre_get_posts', 'custom_pre_get_p..."
20217,wordpress,display sub categories assoccited with each po...,i have wordpress blog with many posts. each po...


In [2]:
TRAIN.columns

Index(['topic', 'question', 'excerpt'], dtype='object')

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Function to clean text and perform lemitisation"""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

In [5]:
df = TRAIN.copy()
df['text'] = df['question'] + " " + df['excerpt']
df['text'] = df['text'].apply(preprocess_text)
df = df[['text', 'topic']]

In [6]:
df['text'][0]

'effective differencial effective circuit im trying work general term effective capacitance circuit see diagram httpistackimgurcombs85bpng effective capacitance circuit'

## TF-IDF Vectorizer Methord

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def vectorirse_text(text):
    """ Recieves text as input and returns TF-IDF vectors"""
    tfidf = TfidfVectorizer(max_features=500000)
    X = tfidf.fit_transform(text)
    return X

def label_encoding(input):
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(input)


X = vectorirse_text(df['text'])
y = label_encoding(df['topic'])



In [None]:
from models import *
print("Logistic Regression:")
report, acc = logistic_regression(X, y)
print("Accuracy:", acc)
print(report)

print("\nDecision Tree Classifier:")
report, acc = decision_tree(X, y)
print("Accuracy:", acc)
print(report)

print("\nRandom Forest Classifier:")
report, acc = random_forest(X, y)
print("Accuracy:", acc)
print(report)

print("\nSupport Vector Machine (SVM):")
report, acc = support_vector_machine(X, y)
print("Accuracy:", acc)
print(report)

print("\nk-Nearest Neighbors (k-NN):")
report, acc = knn(X, y, k=5)
print("Accuracy:", acc)
print(report)

print("\nNaïve Bayes Classifier:")
report, acc = naive_bayes(X, y)
print("Accuracy:", acc)
print(report)

print("\nGradient Boosting (XGBoost):")
report, acc = xgboost_classifier(X, y)
print("Accuracy:", acc)
print(report)

print("\nMulti-Layer Perceptron (MLP - Neural Network):")
report, acc = mlp_classifier(X, y)

print("Accuracy:", acc)
print(report)



Multi-Layer Perceptron (MLP - Neural Network):
Accuracy: 0.7925321463897131
              precision    recall  f1-score   support

           0       0.80      0.77      0.79       458
           1       0.61      0.72      0.66       396
           2       0.91      0.80      0.85       363
           3       0.75      0.92      0.83       497
           4       0.73      0.74      0.73       294
           5       0.83      0.84      0.84       368
           6       0.99      0.82      0.89       498
           7       0.74      0.78      0.76       388
           8       0.73      0.66      0.69       396
           9       0.90      0.83      0.86       386

    accuracy                           0.79      4044
   macro avg       0.80      0.79      0.79      4044
weighted avg       0.80      0.79      0.79      4044



## Word2Vector Methord

In [None]:
import gensim
import numpy as np
from sklearn.ensemble import RandomForestClassifier

df['tokens'] = df['text'].apply(lambda x: x.split())

w2v_model = gensim.models.Word2Vec(sentences=df['tokens'], vector_size=10000, window=5, min_count=2, workers=10)

def get_sentence_embedding(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_w2v = np.array(df['tokens'].apply(get_sentence_embedding).tolist())
y_encoded = label_encoding(df['topic'])


[ 0.0416455   0.02837399  0.00414959 ... -0.04002301 -0.01680034
 -0.0022278 ]
Word2Vec + Random Forest Accuracy: 0.6392


In [11]:
X = X_w2v
y = y_encoded

In [12]:
print("Logistic Regression:")
report, acc = logistic_regression(X, y)
print("Accuracy:", acc)
print(report)

print("\nDecision Tree Classifier:")
report, acc = decision_tree(X, y)
print("Accuracy:", acc)
print(report)

print("\nRandom Forest Classifier:")
report, acc = random_forest(X, y)
print("Accuracy:", acc)
print(report)

print("\nSupport Vector Machine (SVM):")
report, acc = support_vector_machine(X, y)
print("Accuracy:", acc)
print(report)

print("\nk-Nearest Neighbors (k-NN):")
report, acc = knn(X, y, k=5)
print("Accuracy:", acc)
print(report)

print("\nNaïve Bayes Classifier:")
report, acc = naive_bayes(X, y)
print("Accuracy:", acc)
print(report)

print("\nGradient Boosting (XGBoost):")
report, acc = xgboost_classifier(X, y)
print("Accuracy:", acc)
print(report)

print("\nMulti-Layer Perceptron (MLP - Neural Network):")
report, acc = mlp_classifier(X, y)

print("Accuracy:", acc)
print(report)

Logistic Regression:




Accuracy: 0.6414441147378833
              precision    recall  f1-score   support

           0       0.78      0.75      0.76       458
           1       0.52      0.46      0.49       396
           2       0.57      0.67      0.62       363
           3       0.66      0.68      0.67       497
           4       0.53      0.50      0.51       294
           5       0.74      0.77      0.75       368
           6       0.68      0.81      0.74       498
           7       0.58      0.50      0.54       388
           8       0.50      0.47      0.49       396
           9       0.76      0.70      0.73       386

    accuracy                           0.64      4044
   macro avg       0.63      0.63      0.63      4044
weighted avg       0.64      0.64      0.64      4044


Decision Tree Classifier:
Accuracy: 0.5044510385756676
              precision    recall  f1-score   support

           0       0.66      0.65      0.65       458
           1       0.38      0.40      0.39    

ValueError: Negative values in data passed to MultinomialNB (input X).

In [11]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained BERT tokenizer & model on GPU
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)  # Move model to GPU

# Function to get BERT embeddings using GPU
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # Move result back to CPU for NumPy

# Convert text to BERT embeddings with progress bar
X_bert = np.array([get_bert_embedding(text) for text in tqdm(df['text'], desc="Generating BERT Embeddings on GPU")])


# Train-test split
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(X_bert, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_bert, y_train_bert)

# Predict & Evaluate
y_pred_bert = lr_model.predict(X_test_bert)
bert_acc = accuracy_score(y_test_bert, y_pred_bert)
print(f"BERT + Logistic Regression Accuracy: {bert_acc:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Generating BERT Embeddings on GPU: 100%|██████████| 20219/20219 [02:22<00:00, 141.59it/s]


BERT + Logistic Regression Accuracy: 0.8029


In [12]:
TRAIN.columns

Index(['topic', 'question', 'excerpt'], dtype='object')

In [None]:
df['x_bert']=X_bert.tolist()
df['x_w2v']=X_w2v.tolist()
df['x_tfidf']=X.toarray().tolist()
df['question']=TRAIN['question']
df['excerpt']=TRAIN['excerpt']
df.to_csv('df.csv', index=False)

In [None]:
# import torch
# from transformers import BertTokenizer, BertModel
# import numpy as np
# from tqdm import tqdm

# # Check if GPU is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Load pre-trained BERT tokenizer & model on GPU
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)  # Move model to GPU

# # Function to get BERT embeddings using GPU
# def get_bert_embedding(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(device)
#     with torch.no_grad():
#         outputs = bert_model(**inputs)
#     return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # Move result back to CPU for NumPy

# # Convert text to BERT embeddings with progress bar
# X_bert = np.array([get_bert_embedding(text) for text in tqdm(df['text'], desc="Generating BERT Embeddings on GPU")])


Using device: cuda





[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[

KeyboardInterrupt: 




[A[A[A