In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Brandon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Brandon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Brandon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
data = pd.read_csv('kindle_reviews.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [5]:
# Preprocess text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
def preprocess_text(text):
    # Handle non-string inputs
    if not isinstance(text, str):
        return ""
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize
    words = word_tokenize(text)
    # Remove stop words and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [8]:
# Apply preprocessing
data['cleaned_review'] = data['reviewText'].apply(preprocess_text)

In [9]:
# Create sentiment labels: 1-2 stars = negative (0), 4-5 stars = positive (1)
data['sentiment'] = data['overall'].apply(lambda x: 0 if x <= 2 else 1 if x >= 4 else -1)
data = data[data['sentiment'] != -1] # Remove neutral reviews

In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
 data['cleaned_review'], data['sentiment'], test_size=0.2, random_state=42
)

In [11]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [13]:
# Make predictions and evaluate
predictions = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Detailed Report:\n", classification_report(y_test, predictions))

Accuracy: 0.9669740812815523
Detailed Report:
               precision    recall  f1-score   support

           0       0.85      0.61      0.71     11588
           1       0.97      0.99      0.98    165697

    accuracy                           0.97    177285
   macro avg       0.91      0.80      0.84    177285
weighted avg       0.96      0.97      0.96    177285



In [14]:
# Optional: Display important words
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]
word_importance = pd.DataFrame({'word': feature_names, 'coefficient': coefficients})
print("\nTop 5 Positive Words:")
print(word_importance.sort_values(by='coefficient', ascending=False).head())
print("\nTop 5 Negative Words:")
print(word_importance.sort_values(by='coefficient').head())


Top 5 Positive Words:
           word  coefficient
2730      loved    12.574663
1500    enjoyed    12.098120
4794       wait    11.110763
1581  excellent     9.010500
1977      great     8.780345

Top 5 Negative Words:
               word  coefficient
4823          waste    -9.718770
3326         poorly    -8.110274
1175        deleted    -7.563847
4687  unfortunately    -7.518289
1287  disappointing    -7.173624


In [18]:
# Function to analyze a new sentence
def analyze_sentence(sentence, vectorizer, model):
    # Preprocess the sentence
    cleaned_sentence = preprocess_text(sentence)
    
    # Transform the sentence using the trained TF-IDF vectorizer
    sentence_tfidf = vectorizer.transform([cleaned_sentence])
    
    # Predict sentiment
    prediction = model.predict(sentence_tfidf)[0]
    sentiment = "Positive" if prediction == 1 else "Negative"
    
    # Get prediction probability (confidence)
    proba = model.predict_proba(sentence_tfidf)[0]
    confidence = proba[prediction] * 100
    
    # Extract influential words (optional)
    feature_names = vectorizer.get_feature_names_out()
    coefficients = model.coef_[0]
    word_contributions = {}
    
    # Tokenize the cleaned sentence to check for influential words
    tokens = cleaned_sentence.split()
    for word in tokens:
        if word in feature_names:
            idx = vectorizer.vocabulary_[word]
            coef = coefficients[idx]
            word_contributions[word] = coef
    
    return {
        "sentence": sentence,
        "sentiment": sentiment,
        "confidence": f"{confidence:.2f}%",
        "influential_words": word_contributions
    }

# Example usage
sentence = "The poorly developed main character is unfortunately great; totally deleted my boredom."
result = analyze_sentence(sentence, vectorizer, model)
print("\nSentence Analysis:")
print(f"Sentence: {result['sentence']}")
print(f"Sentiment: {result['sentiment']}")
print(f"Confidence: {result['confidence']}")
print("Influential Words and Coefficients:")
for word, coef in result['influential_words'].items():
    print(f"  {word}: {coef:.3f}")


Sentence Analysis:
Sentence: The poorly developed main character is unfortunately great; totally deleted my boredom.
Sentiment: Negative
Confidence: 99.97%
Influential Words and Coefficients:
  poorly: -8.110
  developed: -1.243
  main: -1.480
  character: -0.893
  unfortunately: -7.518
  great: 8.780
  totally: 0.144
  deleted: -7.564
