In [None]:
# --- Imports: standard library first, then third-party ---
import re
import string

import nltk
import numpy as np
import pandas as pd
import spacy  # was misspelled `stacy`, which fails on import and leaves `spacy.load(...)` below undefined
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# --- NLTK data needed by the cells below ---
nltk.download('punkt')          # tokenizer models for word_tokenize
nltk.download('punkt_tab')      # tokenizer tables for word_tokenize
nltk.download('stopwords')      # stopwords.words('english')
nltk.download('vader_lexicon')  # SentimentIntensityAnalyzer


In [None]:
# Load the raw reviews table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so only
# load 'reviews_raw.pkl' if it comes from a trusted source.
reviews_df = pd.read_pickle('reviews_raw.pkl')
# Show the loaded frame as the cell output.
reviews_df

In [None]:
# pick a random asin from the DataFrame
# NOTE(review): no random_state is passed, so every run shows a different asin;
# the following cell hard-codes one specific value instead of using this result.
reviews_df['asin'].sample(1)

In [None]:
# make a df based on the random asin
# Keep only the rows whose 'asin' equals the hard-coded product id; .copy() makes
# test_df an independent frame rather than a slice of reviews_df.
test_df = reviews_df.loc[reviews_df['asin'] == 'B00ICDB1QO'].copy()
# Show the single-product subset as the cell output.
test_df

The review text needs to be preprocessed before it is vectorized:
1. Lowercase the text
2. Remove punctuation, digits, and special characters
3. Tokenize
4. Remove stopwords
5. Stemming/lemmatization

In [None]:
# i think it might be best to functionize this

# Compiled once: matches every character that is neither a lowercase ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
# Filled lazily on first use so the stop-word list and stemmer are built once,
# not once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on the first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_reviews_text(text):
    """Normalize one review into a space-joined string of stemmed tokens.

    Steps: lowercase, drop every character that is not ``a-z`` or whitespace
    (this removes digits and punctuation, including apostrophes), tokenize with
    ``word_tokenize``, drop English stop words, and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (``None``, ``NaN`` from a missing
        review, numbers) is treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing reviews show up as None/NaN in a DataFrame column; calling .lower()
    # on them would raise AttributeError.
    if not isinstance(text, str):
        return ''

    text = _NON_LETTER_RE.sub('', text.lower())
    # Nothing but whitespace survived the filter, so there is nothing to tokenize.
    if not text.strip():
        return ''

    stop_words, stemmer = _get_preprocess_resources()
    # NOTE(review): apostrophes are stripped before this filter, so "don't" arrives
    # here as "dont" and may no longer match the stop-word list — confirm this is intended.
    tokens = [stemmer.stem(word) for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Build the TF-IDF working frame: a new frame derived from test_df with a
# 'cleaned_text' column holding the preprocessed version of each review.
tfid_df = test_df.assign(
    cleaned_text=lambda frame: frame['reviewText'].apply(preprocess_reviews_text)
)
# Show the result as the cell output.
tfid_df

In [None]:
# Character count and whitespace-delimited word count of the raw review text.
# The vectorized .str accessors give the same numbers as apply(len) /
# apply(lambda x: len(x.split())) for string reviews, but yield NaN for a missing
# review instead of raising TypeError/AttributeError.
tfid_df['review_length'] = tfid_df['reviewText'].str.len()
tfid_df['word_count'] = tfid_df['reviewText'].str.split().str.len()


In [None]:
# VADER analyzer, applied to the raw (uncleaned) review text.
sid = SentimentIntensityAnalyzer()
# polarity_scores() returns a dict of scores; only its 'compound' entry is stored.
tfid_df['sentiment'] = tfid_df['reviewText'].map(
    lambda review: sid.polarity_scores(review)['compound']
)


In [None]:
# Inspect the frame now that review_length, word_count and sentiment have been added.
tfid_df

In [None]:
# TF-IDF over the cleaned reviews; max_features=5 caps the vocabulary at 5 terms.
vectorizer = TfidfVectorizer(max_features=5)
# Learn the vocabulary and transform the reviews in one step
# (one row per review, one column per retained term).
X_tfidf = vectorizer.fit_transform(tfid_df['cleaned_text'])

In [None]:
# X_tfidf is a sparse matrix; .toarray() densifies it so each retained term becomes
# a named column. temp_df gets a fresh default index, not tfid_df's index.
temp_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())


In [None]:
# Attach the TF-IDF term columns to the review frame. Both indexes are reset so the
# rows line up by position rather than by the original review index.
# NOTE(review): this reassigns tfid_df, so re-running the cell appends the term
# columns a second time (duplicate column names).
tfid_df = pd.concat([tfid_df.reset_index(drop=True), temp_df.reset_index(drop=True)], axis=1)

In [None]:
# List the column names to confirm the TF-IDF term columns were appended.
tfid_df.columns

In [None]:
# Summary statistics for the product: mean and std of review length, and the mean
# of word count, sentiment and every TF-IDF term column.
_agg_spec = {
    'review_length': ['mean', 'std'],
    'word_count': ['mean'],
    'sentiment': ['mean'],
}
_agg_spec.update({term: ['mean'] for term in temp_df.columns})

# Transposed so each source column becomes a row of the summary.
agg_tfid_df = tfid_df.agg(_agg_spec).transpose()


spaCy time

In [None]:
# Load spaCy's medium English pipeline. The model package has to be installed first
# (e.g. `python -m spacy download en_core_web_md`).
nlp = spacy.load("en_core_web_md")

In [None]:
def get_spacy_vector(text):
    """Run ``text`` through the loaded spaCy pipeline and return the document's ``vector``."""
    parsed = nlp(text)
    return parsed.vector


In [None]:
# spaCy working frame: a new frame derived from test_df with one document vector
# per review stored in 'spacy_vector'.
spacy_df = test_df.assign(
    spacy_vector=lambda frame: frame['reviewText'].apply(get_spacy_vector)
)

In [None]:
# Stack the per-review vectors row-wise into a single array (one row per review).
X = np.vstack(spacy_df['spacy_vector'].values)

In [None]:
def extract_text_features(text):
    """Compute simple token-level features for one review using the spaCy pipeline.

    Parameters
    ----------
    text : str
        Raw review text, passed to ``nlp``.

    Returns
    -------
    dict
        ``word_count``   – number of tokens that are not punctuation
        ``noun_count``   – tokens tagged ``NOUN``
        ``verb_count``   – tokens tagged ``VERB``
        ``adj_count``    – tokens tagged ``ADJ``
        ``avg_word_len`` – mean character length of the non-punctuation tokens,
                           ``0.0`` when there are none
        ``sentiment``    – the value of ``doc.sentiment``
    """
    doc = nlp(text)

    # One pass over the tokens instead of one pass per feature.
    word_lengths = []
    pos_counts = {'NOUN': 0, 'VERB': 0, 'ADJ': 0}
    for token in doc:
        if not token.is_punct:
            word_lengths.append(len(token.text))
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1

    return {
        'word_count': len(word_lengths),
        'noun_count': pos_counts['NOUN'],
        'verb_count': pos_counts['VERB'],
        'adj_count': pos_counts['ADJ'],
        # Guard on the filtered list, not on len(doc): a review made only of
        # punctuation has tokens but no words, and np.mean([]) would return NaN
        # with a RuntimeWarning.
        'avg_word_len': np.mean(word_lengths) if word_lengths else 0.0,
        # NOTE(review): stock spaCy pipelines appear to leave Doc.sentiment at 0.0
        # unless a custom component sets it — confirm. A later cell overwrites the
        # resulting 'sentiment' column with VADER scores.
        'sentiment': doc.sentiment,
    }

# One feature dict per review, expanded into columns. Building the frame from the
# list of dicts avoids `.apply(pd.Series)`, which constructs a Series per row (slow)
# and upcasts the integer counts to float. The original index is kept so the
# concat below aligns row by row.
_feature_records = spacy_df['reviewText'].apply(extract_text_features)
text_feat_df = pd.DataFrame(_feature_records.tolist(), index=_feature_records.index)

# Append the feature columns to the review frame.
# NOTE(review): this reassigns spacy_df, so re-running the cell appends the feature
# columns again (duplicate column names).
spacy_df = pd.concat([spacy_df, text_feat_df], axis=1)


In [None]:
# SentimentIntensityAnalyzer is imported and 'vader_lexicon' is downloaded in the
# notebook's first cell, so neither is repeated here.
sia = SentimentIntensityAnalyzer()


def vader_sentiment(text):
    """Return the VADER ``compound`` polarity score for ``text``.

    ``sia.polarity_scores`` returns a dict of scores; only its ``'compound'``
    entry, the single summary value, is returned.
    """
    return sia.polarity_scores(text)['compound']


# Overwrite the 'sentiment' column (previously filled from doc.sentiment) with VADER scores.
spacy_df['sentiment'] = spacy_df['reviewText'].apply(vader_sentiment)


In [None]:
# VADER compound scores stored on the spaCy working frame.
spacy_df['sentiment']

In [None]:
# VADER compound scores stored on the TF-IDF working frame, for comparison.
tfid_df['sentiment']