In [1]:
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os
import json
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from textblob import TextBlob

In [2]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

In [3]:
# Text preprocessing function using spaCy
def preprocess_text(text):
    doc = nlp(text.lower())  # Process the text with spaCy
    tokens = [token.lemma_ for token in doc if token.text not in STOP_WORDS and token.text not in string.punctuation]
    processed_text = ' '.join(tokens)
    return processed_text

In [4]:
# Sentiment analysis function using TextBlob
def analyze_sentiment(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    if sentiment >= 0.05:
        return 'positive'
    elif sentiment <= -0.05:
        return 'negative'
    else:
        return 'neutral'

In [5]:
# data = pd.read_csv('RateMyProfessor_data.csv')

# # Check for duplicate rows
# duplicate_rows = data[data.duplicated()]

# # Print the number of duplicate rows
# print(f"Number of duplicate rows: {len(duplicate_rows)}")

# print(data.shape)
# data.columns

In [6]:
# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create a Pinecone index
pc.create_index(
    name="rmp-ai",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [7]:
# Load the review data
with open("reviews.json") as file:
    data = json.load(file)

In [8]:
# Initialize OpenAI client
client = OpenAI()

# Create embeddings for each review
processed_data = []

for review in data:  # `data` is a list, so iterate directly over it
    processed_review = preprocess_text(review['review'])
    sentiment = analyze_sentiment(review['review'])
    
    response = client.embeddings.create(
        input=review['review'], model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
                "processed_review": processed_review,
                "sentiment": sentiment
            }
        }
    )

# Print or use `processed_data` as needed
print(processed_data[0])

{'values': [0.014487707056105137, 0.0009171545971184969, 0.016261983662843704, 0.04354316368699074, 0.01111924834549427, 0.021157922223210335, 0.030923116952180862, 0.03124328702688217, -0.013227036222815514, 0.01206641923636198, 0.0030566260684281588, 0.025253433734178543, 0.012406599707901478, -0.04485052451491356, 0.005019336938858032, 0.042929504066705704, 0.01747596263885498, -0.02621394582092762, 0.03409814089536667, 0.05469576269388199, 0.03241724520921707, -0.005616321228444576, 0.026894306764006615, -0.0205442626029253, -0.04642469808459282, -0.05397538095712662, 0.010318823158740997, 0.023719284683465958, -0.019383644685149193, -0.013333760201931, 0.07097108662128448, -0.026293987408280373, 0.005036012269556522, -0.036392692476511, -0.03428490459918976, 0.04895937815308571, 0.0009221572545357049, 0.024306263774633408, 0.01844981499016285, 0.004689161200076342, 0.03399141505360603, 0.03135000914335251, -0.027401244267821312, 0.013320419937372208, 0.008204364217817783, -0.01340

In [9]:
# Insert the embeddings into the Pinecone index
index = pc.Index("rmp-ai")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)

print(f"Upserted count: {upsert_response['upserted_count']}")

Upserted count: 49


In [10]:
# Print index statistics
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
