# WEB SCRAPING and SENTIMENT ANALYSIS on Severeus Snape Frome Harry Potter Movies

In [53]:
!pip install requests
!pip install beautifulsoup4
!pip install lxml




In [54]:
import requests
from bs4 import BeautifulSoup

# Specify the URL of the website
url = 'https://harrypotter.fandom.com/wiki/Severus_Snape'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'lxml')  # Use 'lxml' or 'html5lib' as per your preference

    #Extracting paragraph
    paragraphs = soup.find_all('p')
    for paragraph in paragraphs:
        print(paragraph.text)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


At least some content in this article is derived from information featured in: Harry Potter: Hogwarts Mystery & Harry Potter: Puzzles & Spells & Harry Potter: Magic Awakened.As such, spoilers will be present within the article.









Severus Snape


Biographical information

Born
9 January 1960[1]Spinner's End, Cokeworth, Midlands, England, Great Britain


Died
2 May 1998 (aged 38)Shrieking Shack, Hogsmeade, Highlands, Scotland, Great Britain[2]


Blood status
Half-blood[3]


Marital status
Single


Nationality
English


Also known as
Half-Blood Prince[4]Snivellus/Snivelly[5][6][7] (by Marauders)Sev[7] (by Lily Potter)Slytherus[8] (Potterwatch)The Potions Master[9]


Title(s)
Professor[10]Head of Slytherin House (formerly)Headmaster (formerly)



Physical information

Species
Human[10]


Gender
Male[10]


Hair colour
Black[11][12]


Eye colour
Black[13][14]


Skin colour
Sallow[11][12]



Relationship information

Family members
Tobias Snape (father)Eileen Snape (née Prince) (mother

In [55]:
import nltk

# Download the nltk punkt data for sentence tokenization
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [56]:
sentences = []
for paragraph in paragraphs:
 sentences.extend(nltk.sent_tokenize(paragraph.text))

   

In [57]:
import re

In [58]:
cleaned_sentences = [re.sub(r'\[.*?\]', '', sentence) for sentence in sentences]

In [59]:
import pandas as pd
import numpy as np

In [60]:
#Create a DataFrame
data = {'Sentences': cleaned_sentences}
df = pd.DataFrame(data)

# Save DataFrame to a CSV file
df.to_csv('sentences.csv', index=False)

In [61]:
df.head(10)

Unnamed: 0,Sentences
0,At least some content in this article is deriv...
1,\n\n\n\n\n\n\n\nSeverus Snape\n\n\nBiographica...
2,Professor Severus Snape (9 January 1960 – 2 Ma...
3,His double life played an extremely important ...
4,The only child of Muggle Tobias Snape and witc...
5,He met Lily and Petunia Evans when he was nine.
6,He fell deeply in love with Lily upon their me...
7,
8,"Severus started at Hogwarts with Lily in 1971,..."
9,This put him in the same year as Lily but unfo...


## Data Cleaning

In [62]:
df.isnull().sum()

Sentences    0
dtype: int64

In [63]:
#Remove null values
cleaned_df = df.dropna(subset=df.columns, how='all')

In [64]:
cleaned_df.isnull().sum()

Sentences    0
dtype: int64

In [65]:
cleaned_df.head(10)

Unnamed: 0,Sentences
0,At least some content in this article is deriv...
1,\n\n\n\n\n\n\n\nSeverus Snape\n\n\nBiographica...
2,Professor Severus Snape (9 January 1960 – 2 Ma...
3,His double life played an extremely important ...
4,The only child of Muggle Tobias Snape and witc...
5,He met Lily and Petunia Evans when he was nine.
6,He fell deeply in love with Lily upon their me...
7,
8,"Severus started at Hogwarts with Lily in 1971,..."
9,This put him in the same year as Lily but unfo...


Remove unwanted rows in Excel

### Sentiment Analysis

In [66]:
snape_life = pd.read_excel('snape_life_updated.xlsx')

In [67]:
snape_life.head(10)

Unnamed: 0,Sentences
0,Professor Severus Snape (9 January 1960 - 2 Ma...
1,His double life played an extremely important ...
2,The only child of Muggle Tobias Snape and witc...
3,He met Lily and Petunia Evans when he was nine.
4,He fell deeply in love with Lily upon their me...
5,"Severus started at Hogwarts with Lily in 1971,..."
6,This put him in the same year as Lily but unfo...
7,Severus became the immediate enemy of James Po...
8,This led him to be irritable towards James's s...
9,"Snape, when young, developed a passion for the..."


In [73]:
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

# Assuming 'snape_life' is your DataFrame
snape_life['Sentences'] = snape_life['Sentences'].fillna('')  # Replace NaN with an empty string

# Initialize the Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Create a new column 'Sentiment' with sentiment scores
snape_life['Sentiment'] = snape_life['Sentences'].apply(lambda x: sia.polarity_scores(x)['compound'])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [74]:
threshold_angry = -0.7
threshold_anguish = -0.5
threshold_contempt = -0.05
threshold_worry = 0
threshold_ambiguity = 0.1
threshold_affection = 0.2  
threshold_sacrifice = 0.3
threshold_protectiveness = 0.5
threshold_sarcasm = 0.6

# Create conditions and corresponding labels
conditions = [
    snape_life['Sentiment'] <= threshold_angry,
    (snape_life['Sentiment'] > threshold_angry) & (snape_life['Sentiment'] <= threshold_anguish),
    (snape_life['Sentiment'] > threshold_anguish) & (snape_life['Sentiment'] <= threshold_contempt),
    (snape_life['Sentiment'] > threshold_contempt) & (snape_life['Sentiment'] <= threshold_worry),
    (snape_life['Sentiment'] > threshold_worry) & (snape_life['Sentiment'] <= threshold_ambiguity),
    (snape_life['Sentiment'] > threshold_ambiguity) & (snape_life['Sentiment'] <= threshold_affection),
    (snape_life['Sentiment'] > threshold_affection) & (snape_life['Sentiment'] <= threshold_sacrifice),
    (snape_life['Sentiment'] > threshold_sacrifice) & (snape_life['Sentiment'] <= threshold_protectiveness),
    (snape_life['Sentiment'] > threshold_protectiveness) & (snape_life['Sentiment'] <= threshold_sarcasm),
    snape_life['Sentiment'] > threshold_sarcasm
]

labels = ['angry', 'anguish', 'contempt', 'worry','ambiguity', 'affection', 'sacrifice', 'protectiveness', 'sarcasm', 'unknown']

# Create a new column with the sentiment categories
snape_life['Emotion_Category'] = np.select(conditions, labels, default='unknown')


In [75]:
emotion_percentages = snape_life['Emotion_Category'].value_counts(normalize=True) * 100
emotion_percentages

worry             22.509960
contempt          19.322709
angry             14.940239
anguish           12.549801
protectiveness    10.059761
unknown            9.063745
sarcasm            3.784861
sacrifice          3.486056
ambiguity          2.191235
affection          2.091633
Name: Emotion_Category, dtype: float64

### TOP 5 FREQUENT WORDS

In [76]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [77]:
all_text = ' '.join(snape_life['Sentences'].astype(str))

# Tokenize the text into words
words = word_tokenize(all_text)

# Remove stop words and punctuation
stop_words = set(stopwords.words('english'))
filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

# Lemmatize the words
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

# Calculate the frequency distribution of words
freq_dist = FreqDist(lemmatized_words)

# Find the top 5 most common words
top_words = freq_dist.most_common(5)

# Print the top 5 most common words and their frequencies
for word, frequency in top_words:
    print(f"The word '{word}' appears {frequency} times.")

The word 'snape' appears 703 times.
The word 'harry' appears 245 times.
The word 'dumbledore' appears 173 times.
The word 'voldemort' appears 170 times.
The word 'lily' appears 125 times.


In [37]:
snape_life.to_csv('snape_life_dataset.csv')