<a href="https://colab.research.google.com/github/sbogde/pandamonium/blob/main/WS03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wordcloud

In [None]:
import csv
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

## Define URLs for Scraping IMDB Reviews

# User-review page URLs for the five movies, built from their IMDB title IDs.
urls = [
    f'https://www.imdb.com/title/{title_id}/reviews/?ref_=tt_ql_urv'
    for title_id in (
        'tt0111161',
        'tt1213644',
        'tt6856242',
        'tt8356942',
        'tt0060666',
    )
]

# content = []
# for url in urls:
#     page = requests.get(url, timeout=2.50)
#     soup = BeautifulSoup(page.content, 'html.parser')
#     content.append(soup.find_all('div', class_='review-container'))

# Scraped review texts and ratings. The two lists are appended to together,
# so entry i of one always belongs to entry i of the other.
reviews_list = []
ratings_list = []

# Sent with every request to prevent getting blocked; built once because it is
# identical for every URL.
headers = {'User-Agent': 'Mozilla/5.0'}

# Iterate over each movie URL
for url in urls:
    print(f"Scraping: {url}")

    # The timeout keeps an unresponsive server from hanging the run, and
    # raise_for_status() turns HTTP error responses into exceptions so an
    # error page is never parsed as if it held reviews. A failing URL is
    # reported and skipped; the remaining URLs are still scraped.
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
    else:
        # Parse the page content
        soup = BeautifulSoup(page.content, 'html.parser')

        # Match on the single 'user-review-item' class. A multi-class string
        # only matches the exact, full class attribute, so including the
        # generated-looking 'sc-7d2e5b85-1 cvfQlw' names made the lookup
        # brittle.
        review_containers = soup.find_all('article', class_='user-review-item')
        if not review_containers:
            print(f"No reviews found at {url}")

        # Extract review text and rating
        for container in review_containers:
            # Review body; "No Review" marks a container without one.
            review_element = container.find('div', class_='ipc-html-content-inner-div')
            review_text = review_element.get_text(strip=True) if review_element else "No Review"

            # Star rating as text; "NA" marks a review without a rating.
            rating_element = container.find('span', class_='ipc-rating-star--rating')
            rating = rating_element.get_text(strip=True) if rating_element else "NA"

            reviews_list.append(review_text)
            ratings_list.append(rating)

    # Sleep to prevent being blocked
    time.sleep(2)

# Combine the scraped lists into a two-column DataFrame (one row per review).
movie = pd.DataFrame({'Review': reviews_list, 'Rating': ratings_list})

# Save to CSV (without the index column)
movie.to_csv('Workshop3_IMDB_Dataset.csv', index=False)

# Display first few rows
print(movie.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
movie.info()

# Parse the rating column as numbers. Values that are not numeric become NaN
# (errors='coerce'), NaN is replaced by 0, and the column is stored as int.
movie['Rating'] = (
    pd.to_numeric(movie['Rating'], errors='coerce')
    .fillna(0)
    .astype(int)
)

### Extract Reviews and Ratings
## Text Processing and Analysis
### Importing Required Libraries
import string
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

### WordCloud Visualization

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Join every review into one comma-separated text for the word cloud.
text = ', '.join(movie['Review'])

# Named wc_stopwords so it does not rebind `stopwords`, which this file
# imports from nltk.corpus.
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=wc_stopwords, background_color='white').generate(text)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## Sentiment Identification using VADER

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()


def _sentiment_label(compound):
    """Map a compound polarity score to 'positive', 'negative' or 'neutral'."""
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


# Score each review and keep only the label derived from its compound score.
sentiments = [
    _sentiment_label(sid.polarity_scores(review_text)['compound'])
    for review_text in movie['Review']
]

movie['Sentiment'] = sentiments

## Sentiment Classification using Machine Learning

# Derive the training label from the rating: above 5 -> '1', below 5 -> '-1',
# exactly 5 -> '0'. A rating of 0 is the placeholder written for reviews
# without a numeric rating, so it is labelled '0' as well instead of being
# counted as a negative review.
movie['class-label'] = movie['Rating'].astype(int).apply(
    lambda x: '1' if x > 5 else ('-1' if 0 < x < 5 else '0')
)
# Keep only the rows that carry a '1' or '-1' label.
movie = movie[movie['class-label'] != '0']

# Target labels, taken from the class-label column.
y = movie['class-label']

# Word-level TF-IDF features with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
X = tfidf_vectorizer.fit_transform(movie['Review'])

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=8,
)

## Train and Evaluate SVM Classifier

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a sigmoid-kernel SVM on the training split; fit() returns the estimator.
svc = SVC(gamma=1.0, kernel='sigmoid').fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics on the test split.
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Train and Evaluate Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Seed the tree with the same value used for the train/test split so repeated
# runs give the same model and the same reported metrics.
dtree = DecisionTreeClassifier(random_state=8)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics on the test split.
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))