In [1]:
!pip install transformers umap-learn hdbscan scikit-learn pandas

Collecting transformers
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m231.2 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.23.5-py3-none-any.whl.metadata (12 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting numba>=0.51.2 (from umap-learn)
  Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import umap
import hdbscan
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Inputs a reader is likely to change, named once instead of inlined below.
DATA_PATH = 'sentiAnalysis.csv'  # Adjust the path to your data
BERT_CHECKPOINT = 'bert-base-uncased'

# Load the dataframe
df = pd.read_csv(DATA_PATH)

# Missing descriptions are replaced with '' before the cast: a bare
# astype(str) would turn NaN into the literal text 'nan', which would then be
# tokenized and embedded as if it were a real word.
descriptions = df['Description'].fillna('').astype(str)
time_to_fix = df['TimeLabel']

# Load pre-trained BERT model and tokenizer (same checkpoint for both)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to compute BERT embeddings
def get_bert_embeddings(texts, batch_size=32):
    """Embed each text as the mean of its BERT last-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Texts are encoded in
    batches instead of one forward pass per text; padded positions are
    excluded from the mean via the attention mask, so a text's vector does
    not depend on which other texts share its batch (up to floating-point
    rounding).

    Args:
        texts: Iterable of strings (e.g. a list or a pandas Series).
        batch_size: Number of texts per forward pass. Must be >= 1.

    Returns:
        np.ndarray of shape ``(len(texts), model.config.hidden_size)``.
        Empty input yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    texts = list(texts)
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model already lives on (CPU or GPU).
    device = next(model.parameters()).device

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        # Zero out padded positions so they do not contribute to the mean.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        # .cpu() is required before .numpy() when the model runs on GPU.
        pooled_batches.append((token_sum / token_count).cpu().numpy())

    return np.concatenate(pooled_batches, axis=0)

# Get BERT embeddings for the descriptions
embeddings = get_bert_embeddings(descriptions)

# Reduce dimensionality with UMAP.
# random_state pins the stochastic layout so the clusters derived from it are
# reproducible across kernel restarts.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine', random_state=42)

# Fit the reducer and keep the projected vectors: the clustering cell reads
# `umap_embeddings`, which was previously never defined on a fresh kernel.
umap_embeddings = umap_model.fit_transform(embeddings)


In [None]:

# Group the UMAP-reduced vectors with HDBSCAN.
cluster_model = hdbscan.HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    cluster_selection_method='eom',
)
# Fit, then read the per-row cluster assignment from the fitted model.
clusters = cluster_model.fit(umap_embeddings).labels_

# Store each row's cluster label on the dataframe as the 'topic' column.
df['topic'] = clusters

# Target: the TimeLabel column. Features: one indicator column per topic id.
y = df['TimeLabel']
X = pd.get_dummies(df['topic'], prefix='topic')

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit logistic regression on the training portion.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = classifier.predict(X_test)

# Score the predictions: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(report)