In [None]:
import re
import nltk
import spacy
import string
import contractions
import numpy as np
import pandas as pd
import seaborn as sns
import scattertext as st
import plotly.express as px
import matplotlib.pyplot as plt
import dataset_utils
import sentiment_utils

from tqdm import tqdm
from afinn import Afinn
from wordcloud import WordCloud
from collections import Counter
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.sparse import vstack
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfTransformer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Fetch the three NLTK resources named below before any later cell runs.
# NOTE(review): presumably these back word_tokenize, stopwords and
# WordNetLemmatizer imported above (and/or sentiment_utils) — confirm.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Constants

In [None]:
# Directory that holds the CSV files used by this notebook. Both paths below
# are derived from it, so the data location can be changed in one place.
DATASET_DIR = "./dataset"

# CSV that is read into `dataset` when the notebook loads its input.
PREPROCESSED_DATASET_PATH = DATASET_DIR + "/dataset.csv"

# CSV path handed to save_dataset_with_sentiment and read back for the plots.
DATASET_WITH_SENTIMENT_PATH = DATASET_DIR + "/dataset_with_sentiment.csv"

# Import the dataset

In [None]:
# Read the CSV at PREPROCESSED_DATASET_PATH; the bare name on the last line
# makes the notebook render the resulting frame.
dataset = pd.read_csv(filepath_or_buffer=PREPROCESSED_DATASET_PATH)
dataset

# Compute sentiment scores

In [None]:
# Review preprocessing with every optional flag switched off.
new_dataset = sentiment_utils.preprocess_review_dataset(
    dataset,
    negation_handling=False,
    lemmatization=False,
    vader=False,
)

In [None]:
# Pass the frame from the preceding preprocessing cell to
# compute_review_afinn_scores and keep whatever it returns as new_dataset.
# NOTE(review): presumably adds/reports AFINN-based scores — confirm in sentiment_utils.
new_dataset = sentiment_utils.compute_review_afinn_scores(new_dataset)

In [None]:
# Start again from `dataset` (the previous new_dataset is replaced), this time
# with vader=True and the other two flags off.
new_dataset = sentiment_utils.preprocess_review_dataset(
    dataset,
    negation_handling=False,
    lemmatization=False,
    vader=True,
)

In [None]:
# VADER review scores with preprocessed=True.
new_dataset = sentiment_utils.compute_review_vader_scores(
    new_dataset,
    preprocessed=True,
)

In [None]:
# VADER review scores with preprocessed=False; the input is the frame returned
# by the preprocessed=True call in the preceding cell.
new_dataset = sentiment_utils.compute_review_vader_scores(
    new_dataset,
    preprocessed=False,
)

In [None]:
# Start again from `dataset`, now with lemmatization=True as the only
# enabled flag.
new_dataset = sentiment_utils.preprocess_review_dataset(
    dataset,
    negation_handling=False,
    lemmatization=True,
    vader=False,
)

In [None]:
# Same compute_review_afinn_scores call as earlier in this section, but the
# input now comes from the lemmatization=True preprocessing cell just above.
new_dataset = sentiment_utils.compute_review_afinn_scores(new_dataset)

In [None]:
# Summary preprocessing with every optional flag switched off.
new_dataset = sentiment_utils.preprocess_summary_dataset(
    dataset,
    negation_handling=False,
    lemmatization=False,
    vader=False,
)

In [None]:
# Pass the frame from the preceding summary-preprocessing cell to
# compute_summary_afinn_scores and keep whatever it returns as new_dataset.
# NOTE(review): presumably adds/reports AFINN-based scores — confirm in sentiment_utils.
new_dataset = sentiment_utils.compute_summary_afinn_scores(new_dataset)

In [None]:
# Start again from `dataset` with vader=True and the other two flags off.
new_dataset = sentiment_utils.preprocess_summary_dataset(
    dataset,
    negation_handling=False,
    lemmatization=False,
    vader=True,
)

In [None]:
# VADER summary scores with preprocessed=True.
new_dataset = sentiment_utils.compute_summary_vader_scores(
    new_dataset,
    preprocessed=True,
)

In [None]:
# VADER summary scores with preprocessed=False; the input is the frame
# returned by the preprocessed=True call in the preceding cell.
new_dataset = sentiment_utils.compute_summary_vader_scores(
    new_dataset,
    preprocessed=False,
)

In [None]:
# Start again from `dataset`, now with lemmatization=True as the only
# enabled flag.
new_dataset = sentiment_utils.preprocess_summary_dataset(
    dataset,
    negation_handling=False,
    lemmatization=True,
    vader=False,
)

In [None]:
# Same compute_summary_afinn_scores call as earlier in this section, but the
# input now comes from the lemmatization=True preprocessing cell just above.
new_dataset = sentiment_utils.compute_summary_afinn_scores(new_dataset)

# Supervised learning predictions

In [None]:
# Review preprocessing for the supervised runs: negation_handling=True,
# lemmatization and vader off.
new_dataset = sentiment_utils.preprocess_review_dataset(
    dataset,
    negation_handling=True,
    lemmatization=False,
    vader=False,
)

In [None]:
# cross_validation_dataset with reviews=True; undersampling and tfidf both off.
y_true, y_pred, test_indices = sentiment_utils.cross_validation_dataset(
    new_dataset,
    reviews=True,
    undersampling=False,
    tfidf=False,
    iterations=1000,
)

In [None]:
# Same call with undersampling=True (tfidf still off); the three result names
# from the previous run are overwritten.
y_true, y_pred, test_indices = sentiment_utils.cross_validation_dataset(
    new_dataset,
    reviews=True,
    undersampling=True,
    tfidf=False,
    iterations=1000,
)

In [None]:
# Same call with tfidf=True and undersampling off; the three result names are
# overwritten again.
y_true, y_pred, test_indices = sentiment_utils.cross_validation_dataset(
    new_dataset,
    reviews=True,
    undersampling=False,
    tfidf=True,
    iterations=1000,
)

In [None]:
# Summary preprocessing for the supervised runs: negation_handling=True,
# lemmatization and vader off. Replaces the review-based new_dataset.
new_dataset = sentiment_utils.preprocess_summary_dataset(
    dataset,
    negation_handling=True,
    lemmatization=False,
    vader=False,
)

In [None]:
# cross_validation_dataset with reviews=False; undersampling and tfidf both off.
y_true, y_pred, test_indices = sentiment_utils.cross_validation_dataset(
    new_dataset,
    reviews=False,
    undersampling=False,
    tfidf=False,
    iterations=1000,
)

In [None]:
# Same call with undersampling=True (tfidf still off); the three result names
# from the previous run are overwritten.
y_true, y_pred, test_indices = sentiment_utils.cross_validation_dataset(
    new_dataset,
    reviews=False,
    undersampling=True,
    tfidf=False,
    iterations=1000,
)

In [None]:
# Same call with tfidf=True and undersampling off; the three result names are
# overwritten again.
y_true, y_pred, test_indices = sentiment_utils.cross_validation_dataset(
    new_dataset,
    reviews=False,
    undersampling=False,
    tfidf=True,
    iterations=1000,
)

# Find mismatches between rating and sentiment

In [None]:
# Rebuild the review-based frame (negation_handling=True only); new_dataset
# was last assigned from preprocess_summary_dataset in the previous section.
new_dataset = sentiment_utils.preprocess_review_dataset(
    dataset,
    negation_handling=True,
    lemmatization=False,
    vader=False,
)

In [None]:
# Re-run the reviews=True configuration without undersampling or tfidf so that
# y_true, y_pred and test_indices belong to the frame built in the cell above.
y_true, y_pred, test_indices = sentiment_utils.cross_validation_dataset(
    new_dataset,
    reviews=True,
    undersampling=False,
    tfidf=False,
    iterations=1000,
)

In [None]:
# Hand the frame, the true/predicted labels and their test indices to
# print_mismatch_examples, asking for 30 examples.
sentiment_utils.print_mismatch_examples(
    new_dataset,
    y_true,
    y_pred,
    test_indices,
    num_examples=30,
)

# Save the dataset with the predicted sentiments

In [None]:
# Pass the frame, the predictions and their test indices to
# save_dataset_with_sentiment together with the output path. The returned
# value replaces new_dataset and is displayed by the bare name below.
new_dataset = sentiment_utils.save_dataset_with_sentiment(
    new_dataset,
    y_pred,
    test_indices,
    DATASET_WITH_SENTIMENT_PATH,
)
new_dataset

# Plots with sentiment

In [None]:
# Read back the CSV at DATASET_WITH_SENTIMENT_PATH.
# NOTE(review): this rebinds `dataset`; from here on the name no longer refers
# to the frame loaded from PREPROCESSED_DATASET_PATH, so re-running earlier
# cells afterwards (without a kernel restart) would operate on this frame.
dataset = pd.read_csv(filepath_or_buffer=DATASET_WITH_SENTIMENT_PATH)

In [None]:
# Sentiment distribution plot for the reloaded frame.
sentiment_utils.plot_sentiments_distribution(
    dataset,
)

In [None]:
# Top-products-by-sentiment plot, called with k=20.
sentiment_utils.plot_top_products_by_sentiment(
    dataset,
    k=20,
)

In [None]:
# Sentiment/price relation plot for the reloaded frame.
sentiment_utils.plot_sentiment_price_relation(
    dataset,
)

In [None]:
# Most common words for each predicted sentiment value: preprocess the
# reloaded frame, split it on the 'predictedSentiment' column, then plot.
new_dataset = sentiment_utils.preprocess_review_dataset(
    dataset,
    negation_handling=True,
    lemmatization=False,
    vader=False,
)
# One filtered frame per label, unpacked in the same order as the labels.
negative_dataset, neutral_dataset, positive_dataset = (
    new_dataset[new_dataset['predictedSentiment'] == label]
    for label in ('negative', 'neutral', 'positive')
)
sentiment_utils.plot_intersection_most_common_words(
    negative_dataset,
    neutral_dataset,
    positive_dataset,
    k=20,
)

# Create scattertexts

In [None]:
# NOTE(review): every line in this cell is commented out, so this section
# currently produces no output. If re-enabled it would take the first 100000
# rows of `dataset`, pass them through preprocess_review_dataset and call
# create_reviews_scattertext with year=2007. Presumably disabled because it is
# slow — confirm, then either restore it or remove the cell.
#sample_dataset = dataset[:100000]
#sample_dataset = sentiment_utils.preprocess_review_dataset(sample_dataset, lemmatization=False)
#sentiment_utils.create_reviews_scattertext(sample_dataset, year=2007)