Scraping data for sentiment analysis

In [39]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
import time
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import pandas as pd


# Scrape the critic reviews for one film into `reviewText`, then build `df`.
#
# Replaced chromedriver with edgedriver, so the Edge-specific Service class is
# used here; the Selenium exception types are imported so that pagination stops
# only on the expected "no further page" conditions instead of on any error.
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.edge.service import Service as EdgeService

REVIEWS_URL = "https://www.rottentomatoes.com/m/black_panther_2018/reviews"
# XPath of the pagination button that is clicked to reach more reviews.
NEXT_BUTTON_XPATH = "//*[@id='reviews']/div[3]/rt-button[2]"

driver = webdriver.Edge(
    service=EdgeService(executable_path=EdgeChromiumDriverManager().install())
)
reviewText = []

try:
    driver.get(REVIEWS_URL)
    wait = WebDriverWait(driver, 10)

    # Wait for the pagination button so the page has rendered before the first
    # parse. A timeout here is tolerated: whatever is on the page is still read.
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, NEXT_BUTTON_XPATH)))
    except TimeoutException:
        pass

    while True:
        soup = bs(driver.page_source, 'html.parser')

        reviewTable = soup.find("div", {"class": "review_table"})
        if reviewTable is None:
            # Nothing to read on this page; stop rather than fail on None.
            break
        reviewText += [t.get_text() for t in reviewTable.select(".review-row .review-text")]

        try:
            load_more_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, NEXT_BUTTON_XPATH))
            )
            load_more_button.click()
        except (
            TimeoutException,
            ElementClickInterceptedException,
            StaleElementReferenceException,
        ):
            # The button is gone or can no longer be clicked: last page reached.
            break

        # Give the next batch of reviews time to render before re-parsing.
        time.sleep(5)
finally:
    # Close the browser even when scraping fails part-way through.
    driver.quit()

# Collect the scraped texts into a single-column pandas DataFrame.
df = pd.DataFrame(reviewText, columns=['Review'])
print(df)

                                                Review
0    Marvel movies proved they could keep on evolvi...
1    A heady mix of William Shakespeare and Walt Di...
2    [The film's] central struggle is one that you ...
3    Wakanda Forever is the big screen outing that ...
4    But “Black Panther” isn’t just a cultural stat...
..                                                 ...
526  It's gripping, funny, and full of spectacle, b...
527  Not everything about Black Panther works acros...
528  Ryan Coogler's superhero adventure is a crowd-...
529  While many Marvel films feel like small pieces...
530  Innovative, intelligent and empowering. When I...

[531 rows x 1 columns]


Data Cleaning

In [67]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# The stopword corpus ships separately from the nltk package. Download it only
# when it is missing, so this cell runs on a fresh environment without
# re-fetching the data on every execution.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stemmer used by the cleaning loop to reduce each word to its stem.
ps = PorterStemmer()

# English stopword list with 'not' taken out, so 'not' survives the stopword
# filter (presumably because negation matters for sentiment -- confirm).
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
# Clean every scraped review and collect the results in `corpus`, one cleaned
# string per row of `df`, in row order.
corpus = []

# Build the lookup set once instead of once per word.
stopword_set = set(all_stopwords)

# Iterate over the review texts themselves: the previous version indexed the
# column with a name that the loop never bound, so it either failed or kept
# re-reading whichever row a leftover variable happened to point at.
for review_text in df['Review']:
    # Replace everything that is not an ASCII letter with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    tokens = letters_only.lower().split()
    # Drop stopwords, then stem what remains.
    stems = [ps.stem(word) for word in tokens if word not in stopword_set]
    corpus.append(' '.join(stems))

Data Transformation

In [69]:
# Loading BoW dictionary
from sklearn.feature_extraction.text import CountVectorizer
import pickle
cvFile='./c1_BoW_Sentiment_Model.pkl'
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only load a file that this project produced itself.
# The context manager closes the file handle as soon as the object is loaded.
with open(cvFile, "rb") as bow_file:
    # Presumably the fitted CountVectorizer saved during training -- confirm.
    cv = pickle.load(bow_file)

In [70]:
# Transform the cleaned reviews with the loaded vectorizer and convert the
# result to a dense array for the classifier.
X_fresh = cv.transform(corpus).toarray()
# Bare last expression: the notebook displays the shape of the feature matrix.
X_fresh.shape

(531, 1420)

Predictions

In [71]:
import joblib
# Load the previously saved sentiment classifier from disk.
# NOTE(review): joblib files are pickle-based -- only load trusted files.
classifier = joblib.load('./c2_Classifier_Sentiment_Model')

In [72]:
# Predict one label per row of the feature matrix and print the raw label array.
y_pred = classifier.predict(X_fresh)
print(y_pred)

['positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'positive'
 'positive' 'positive' 'positive' 'positive' 'positive' 'posit

In [73]:
# Attach the predictions to the review table as a new column, one label per row.
df.loc[:, 'Predicted Label'] = y_pred.tolist()

Saving Output

In [74]:
# Write the labelled reviews to disk without the index column.
# Fields are tab-separated even though the file name ends in .csv.
df.to_csv(
    path_or_buf="./Sentiment_Analysis_Output.csv",
    index=False,
    encoding='UTF-8',
    sep='\t',
)

In [75]:
# Print the final table: each review text next to its predicted label.
print(df)

                                                Review Predicted Label
0    Marvel movies proved they could keep on evolvi...        positive
1    A heady mix of William Shakespeare and Walt Di...        positive
2    [The film's] central struggle is one that you ...        positive
3    Wakanda Forever is the big screen outing that ...        positive
4    But “Black Panther” isn’t just a cultural stat...        positive
..                                                 ...             ...
526  It's gripping, funny, and full of spectacle, b...        positive
527  Not everything about Black Panther works acros...        positive
528  Ryan Coogler's superhero adventure is a crowd-...        positive
529  While many Marvel films feel like small pieces...        positive
530  Innovative, intelligent and empowering. When I...        positive

[531 rows x 2 columns]
