# Data Scraping

In [47]:
# Import Requests
import requests

# Import Beautiful Soup
from bs4 import BeautifulSoup

In [83]:
# Execute request
# If you’re using a different site just replace the url e.g. r=requests.get(‘put your url in here’)
response = requests.get('https://www.imdb.com/title/tt4154796/reviews/?ref_=ttexr_ql_2')

In [84]:
# Check request status
print(response.status_code)

200


In [86]:
# Check result
response.content



In [87]:
soup = BeautifulSoup(response.content, 'html.parser')

In [88]:
review_data = soup.findAll(class_="review-container")

In [89]:
review_data

[<div class="review-container">
 <div class="lister-item-content">
 <div class="ipl-ratings-bar">
 <span class="rating-other-user-rating">
 <svg class="ipl-icon ipl-star-icon" fill="#000000" height="24" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg">
 <path d="M0 0h24v24H0z" fill="none"></path>
 <path d="M12 17.27L18.18 21l-1.64-7.03L22 9.24l-7.19-.61L12 2 9.19 8.63 2 9.24l5.46 4.73L5.82 21z"></path>
 <path d="M0 0h24v24H0z" fill="none"></path>
 </svg>
 <span>7</span><span class="point-scale">/10</span>
 </span>
 </div>
 <a class="title" href="/review/rw4814867/"> Not as good as infinity war..
 </a> <div class="display-name-date">
 <span class="display-name-link"><a href="/user/ur19484559/">MoistMovies</a></span><span class="review-date">29 April 2019</span>
 </div>
 <div class="content">
 <div class="text show-more__control">But its a pretty good film. A bit of a mess in some parts, lacking the cohesive and effortless feel infinity war somehow managed to accomplish.

In [95]:
#Scraping and converting to list
reviews = []

for review in review_data:
    reviews1 = review.find('div',class_='lister-item-content')
    reviews2 = reviews1.find('div',class_="content")
    reviews.append(reviews2.find('div',class_="text show-more__control").text)

In [96]:
reviews

["But its a pretty good film. A bit of a mess in some parts, lacking the cohesive and effortless feel infinity war somehow managed to accomplish. Some silly plot holes and characters that could've been cut (Ahem, captain marvel and thanos). The use of Captain marvel in this film was just ridiculous. Shes there at the start, bails for some reason? And then pops up at the end to serve no purpose but deux ex machina a space ship...The past thanos part wasn't necessary when you think about it, they could've done this film without a real 'villain' And it would've worked and been a original take on a super hero film without using the same formula we have seen in 10+ super hero films already. The entire final battle felt like a rehashed messy battle, with a bunch more characters against thanos - basically mixing the space battle from infinity war; iron man, spider man etc and the final battle of infinity war with captain america, thor etc.. mashing those two into one within this new film. The

In [97]:
len(reviews)

25

# Data Analysis

In [98]:
#Import Pandas and Numpy
import numpy as np
import pandas as pd

In [99]:
df = pd.DataFrame(np.array(reviews),columns=['reviews'])

In [100]:
df.head()

Unnamed: 0,reviews
0,But its a pretty good film. A bit of a mess in...
1,Rating: 8.6Not as good as Infinity war pacing-...
2,The soon-to-be most successful movie of all ti...
3,This film is an emotional rollercoaster with s...
4,"After watching Infinity war, I was looking for..."


In [101]:
#Total 10 reviews
len(df['reviews'])

25

### Word Count for each review

In [102]:
df['Word Count'] = df['reviews'].apply(lambda x:len(x.split()))

In [103]:
df.head()

Unnamed: 0,reviews,Word Count
0,But its a pretty good film. A bit of a mess in...,462
1,Rating: 8.6Not as good as Infinity war pacing-...,60
2,The soon-to-be most successful movie of all ti...,212
3,This film is an emotional rollercoaster with s...,60
4,"After watching Infinity war, I was looking for...",17


### Character count for each review

In [104]:
df['Char count'] = df['reviews'].apply(lambda x:len(x))

In [105]:
df.head()

Unnamed: 0,reviews,Word Count,Char count
0,But its a pretty good film. A bit of a mess in...,462,2496
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355
2,The soon-to-be most successful movie of all ti...,212,1151
3,This film is an emotional rollercoaster with s...,60,324
4,"After watching Infinity war, I was looking for...",17,98


In [106]:
def AverageLength(x):
    words = x.split()
    return sum(len(word) for word in words) / len(words)  

In [107]:
df['Average Word Length'] = df['reviews'].apply(lambda x: AverageLength(x))

In [108]:
df.head()

Unnamed: 0,reviews,Word Count,Char count,Average Word Length
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962
3,This film is an emotional rollercoaster with s...,60,324,4.416667
4,"After watching Infinity war, I was looking for...",17,98,4.823529


### Stopwords count and rate

In [109]:
import nltk

In [110]:
from nltk.corpus import stopwords

In [111]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arany\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [112]:
stop_words = stopwords.words('english')

In [113]:
df['stopwords_count'] = df['reviews'].apply(lambda x: len([word for word in x.split() if word in stop_words]))
df['stopwords_rate'] = df['stopwords_count'] / df['Word Count']

In [114]:
df.head()

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762,195,0.422078
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333,22,0.366667
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962,80,0.377358
3,This film is an emotional rollercoaster with s...,60,324,4.416667,24,0.4
4,"After watching Infinity war, I was looking for...",17,98,4.823529,5,0.294118


In [115]:
df.sort_values(by='stopwords_rate')

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate
8,Rating 10/10\nAbsolute perfection end game !! ...,40,228,4.725,9,0.225
4,"After watching Infinity war, I was looking for...",17,98,4.823529,5,0.294118
16,3 hours out of my life I will never get back.\...,40,194,3.875,14,0.35
18,They've really scrapped the bottom with this o...,11,60,4.545455,4,0.363636
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333,22,0.366667
19,great performancesincredible visualsstory ends...,30,196,5.566667,11,0.366667
7,"If you're going to watch this movie, avoid any...",98,536,4.479592,36,0.367347
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962,80,0.377358
17,Avengers: Endgame is above all the best Marvel...,42,227,4.428571,16,0.380952
22,"So here we have it, AVENGERS: ENDGAME, the exp...",175,942,4.388571,68,0.388571


# Data Cleaning

In [121]:
#Converting all words to lower case
df['reviews_lowercased'] = df['reviews'].apply(lambda x: " ".join(word.lower() for word in x.split()))

In [122]:
df.head()

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate,reviews_lowercased
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762,195,0.422078,but its a pretty good film. a bit of a mess in...
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333,22,0.366667,rating: 8.6not as good as infinity war pacing-...
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962,80,0.377358,the soon-to-be most successful movie of all ti...
3,This film is an emotional rollercoaster with s...,60,324,4.416667,24,0.4,this film is an emotional rollercoaster with s...
4,"After watching Infinity war, I was looking for...",17,98,4.823529,5,0.294118,"after watching infinity war, i was looking for..."


In [129]:
#Removing punctuations
df['non_punc_reviews'] = df['reviews_lowercased'].str.replace('[^\w\s]', '')

  df['non_punc_reviews'] = df['reviews_lowercased'].str.replace('[^\w\s]', '')


In [132]:
#Removing stopwords
df['non_stopwords_reviews'] = df['non_punc_reviews'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words)) 

In [133]:
df.head()

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate,reviews_lowercased,non_punc_reviews,non_stopwords_reviews
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762,195,0.422078,but its a pretty good film. a bit of a mess in...,but its a pretty good film a bit of a mess in ...,pretty good film bit mess parts lacking cohesi...
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333,22,0.366667,rating: 8.6not as good as infinity war pacing-...,rating 86not as good as infinity war pacingwis...,rating 86not good infinity war pacingwise sati...
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962,80,0.377358,the soon-to-be most successful movie of all ti...,the soontobe most successful movie of all time...,soontobe successful movie times actually ranks...
3,This film is an emotional rollercoaster with s...,60,324,4.416667,24,0.4,this film is an emotional rollercoaster with s...,this film is an emotional rollercoaster with s...,film emotional rollercoaster coolest superhero...
4,"After watching Infinity war, I was looking for...",17,98,4.823529,5,0.294118,"after watching infinity war, i was looking for...",after watching infinity war i was looking forw...,watching infinity war looking forward much tim...


In [134]:
#Now finding words that are appearing many number of times , but they are not providing any value(Called recurrent words)

In [140]:
pd.Series(" ".join(df['non_stopwords_reviews']).split()).value_counts()[:40]

movie         35
thanos        32
war           31
endgame       30
infinity      30
time          27
film          27
marvel        26
like          25
avengers      24
characters    20
one           20
good          20
films         18
even          17
many          16
would         16
great         15
see           15
thor          14
say           14
iw            13
story         13
much          12
two           12
really        12
cant          11
seen          11
end           11
back          11
plot          11
well          11
scenes        10
last          10
cap           10
know          10
think         10
felt          10
get           10
dont          10
dtype: int64

In [145]:
# Now manually removing the words that dont add meaning 
recurring_words = ['like','one','even','would','say','iw','much','really','know','felt','two','one','see']

In [146]:
df['cleaned_reviews'] = df['non_stopwords_reviews'].apply(lambda x: " ".join(word for word in x.split() if word not in recurring_words))

In [147]:
df.head()

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate,reviews_lowercased,non_punc_reviews,non_stopwords_reviews,cleaned_reviews
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762,195,0.422078,but its a pretty good film. a bit of a mess in...,but its a pretty good film a bit of a mess in ...,pretty good film bit mess parts lacking cohesi...,pretty good film bit mess parts lacking cohesi...
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333,22,0.366667,rating: 8.6not as good as infinity war pacing-...,rating 86not as good as infinity war pacingwis...,rating 86not good infinity war pacingwise sati...,rating 86not good infinity war pacingwise sati...
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962,80,0.377358,the soon-to-be most successful movie of all ti...,the soontobe most successful movie of all time...,soontobe successful movie times actually ranks...,soontobe successful movie times actually ranks...
3,This film is an emotional rollercoaster with s...,60,324,4.416667,24,0.4,this film is an emotional rollercoaster with s...,this film is an emotional rollercoaster with s...,film emotional rollercoaster coolest superhero...,film emotional rollercoaster coolest superhero...
4,"After watching Infinity war, I was looking for...",17,98,4.823529,5,0.294118,"after watching infinity war, i was looking for...",after watching infinity war i was looking forw...,watching infinity war looking forward much tim...,watching infinity war looking forward time sti...


# Lemmatization

In [155]:
#pip install textblob
#nltk.download('wordnet')

In [156]:
#Importing textblob
from textblob import Word

In [158]:
df['lemmatized_review'] = df['cleaned_reviews'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

In [159]:
df.head()

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate,reviews_lowercased,non_punc_reviews,non_stopwords_reviews,cleaned_reviews,lemmatized_review
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762,195,0.422078,but its a pretty good film. a bit of a mess in...,but its a pretty good film a bit of a mess in ...,pretty good film bit mess parts lacking cohesi...,pretty good film bit mess parts lacking cohesi...,pretty good film bit mess part lacking cohesiv...
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333,22,0.366667,rating: 8.6not as good as infinity war pacing-...,rating 86not as good as infinity war pacingwis...,rating 86not good infinity war pacingwise sati...,rating 86not good infinity war pacingwise sati...,rating 86not good infinity war pacingwise sati...
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962,80,0.377358,the soon-to-be most successful movie of all ti...,the soontobe most successful movie of all time...,soontobe successful movie times actually ranks...,soontobe successful movie times actually ranks...,soontobe successful movie time actually rank f...
3,This film is an emotional rollercoaster with s...,60,324,4.416667,24,0.4,this film is an emotional rollercoaster with s...,this film is an emotional rollercoaster with s...,film emotional rollercoaster coolest superhero...,film emotional rollercoaster coolest superhero...,film emotional rollercoaster coolest superhero...
4,"After watching Infinity war, I was looking for...",17,98,4.823529,5,0.294118,"after watching infinity war, i was looking for...",after watching infinity war i was looking forw...,watching infinity war looking forward much tim...,watching infinity war looking forward time sti...,watching infinity war looking forward time sti...


# Sentiment Analysis

In [160]:
from textblob import TextBlob

In [162]:
#Two outputs : Polarity and Subjectivity
#Polarity : -1 to +1 - How positive or negetive a review is.
#Subjectivity : 0 to +1 - How much the text based on factual information or it is just a generic opinion

In [163]:
# If we are analyzing normal reviews , the review should have less subjectivity (more user or personal opinions) 
# If we are analyzing famous critic reviews, subjectivity should be high and negetive polarity should be taken, so that
# the negetive points of the business can be focused on and worked on

In [167]:
#First value is polarity , second is subjectivity
df['lemmatized_review'].apply(lambda x : TextBlob(x).sentiment)

0      (0.11251942501942502, 0.5445804195804197)
1                  (0.2632291666666667, 0.62375)
2      (0.31555555555555553, 0.6599999999999999)
3     (0.08333333333333333, 0.40277777777777785)
4                                     (1.0, 1.0)
5      (0.14794703595724004, 0.5015549076773568)
6     (-0.08303571428571428, 0.6157738095238096)
7      (0.17500000000000002, 0.8250000000000001)
8      (0.29375000000000007, 0.5562500000000001)
9       (0.2776436781609195, 0.5893103448275864)
10      (0.2849137931034483, 0.5882594417077176)
11     (0.21041666666666667, 0.4623737373737373)
12      (0.2040784832451499, 0.4851851851851851)
13     (0.06889311837587703, 0.5417291137980791)
14     (0.10572390572390573, 0.3872895622895623)
15       (0.3153343782654127, 0.556896551724138)
16     (0.09999999999999999, 0.3666666666666667)
17     (0.5750000000000001, 0.33888888888888885)
18     (-0.6999999999999998, 0.6666666666666666)
19      (0.4028409090909091, 0.5136363636363637)
20     (0.1861111111

In [168]:
df['Polarity'] = df['lemmatized_review'].apply(lambda x : TextBlob(x).sentiment[0])

In [169]:
df['Subjectivity'] = df['lemmatized_review'].apply(lambda x : TextBlob(x).sentiment[1])

In [170]:
df.head()

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate,reviews_lowercased,non_punc_reviews,non_stopwords_reviews,cleaned_reviews,lemmatized_review,Polarity,Subjectivity
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762,195,0.422078,but its a pretty good film. a bit of a mess in...,but its a pretty good film a bit of a mess in ...,pretty good film bit mess parts lacking cohesi...,pretty good film bit mess parts lacking cohesi...,pretty good film bit mess part lacking cohesiv...,0.112519,0.54458
1,Rating: 8.6Not as good as Infinity war pacing-...,60,355,4.933333,22,0.366667,rating: 8.6not as good as infinity war pacing-...,rating 86not as good as infinity war pacingwis...,rating 86not good infinity war pacingwise sati...,rating 86not good infinity war pacingwise sati...,rating 86not good infinity war pacingwise sati...,0.263229,0.62375
2,The soon-to-be most successful movie of all ti...,212,1151,4.433962,80,0.377358,the soon-to-be most successful movie of all ti...,the soontobe most successful movie of all time...,soontobe successful movie times actually ranks...,soontobe successful movie times actually ranks...,soontobe successful movie time actually rank f...,0.315556,0.66
3,This film is an emotional rollercoaster with s...,60,324,4.416667,24,0.4,this film is an emotional rollercoaster with s...,this film is an emotional rollercoaster with s...,film emotional rollercoaster coolest superhero...,film emotional rollercoaster coolest superhero...,film emotional rollercoaster coolest superhero...,0.083333,0.402778
4,"After watching Infinity war, I was looking for...",17,98,4.823529,5,0.294118,"after watching infinity war, i was looking for...",after watching infinity war i was looking forw...,watching infinity war looking forward much tim...,watching infinity war looking forward time sti...,watching infinity war looking forward time sti...,1.0,1.0


In [171]:
df.describe()

Unnamed: 0,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate,Polarity,Subjectivity
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,227.24,1254.92,4.533736,98.8,0.402984,0.187977,0.552234
std,352.9063,1955.86577,0.317075,167.36736,0.066982,0.28677,0.149526
min,11.0,60.0,3.875,4.0,0.225,-0.7,0.338889
25%,42.0,228.0,4.404762,18.0,0.367347,0.1,0.454545
50%,136.0,746.0,4.479592,57.0,0.405172,0.186111,0.54458
75%,298.0,1616.0,4.609756,134.0,0.424051,0.29375,0.615774
max,1790.0,9901.0,5.566667,852.0,0.580645,1.0,1.0


Observations:<br>
1. Polarity has max value of 1 , so very good reviews are there
2. Polarity has min value of -0.7 , so some negetive reviews are also there
3. Subjectivity has max value of 1 , so some facts based reviews are present
4. Subjectivity has min value of 0.33 , so some user opinion reviews are also present

In [174]:
# Removing the columns that are no longer required
df = df.drop(['reviews_lowercased','non_punc_reviews','non_stopwords_reviews','lemmatized_review'],axis=1)

In [178]:
df.sort_values(by='Polarity')

Unnamed: 0,reviews,Word Count,Char count,Average Word Length,stopwords_count,stopwords_rate,cleaned_reviews,Polarity,Subjectivity
18,They've really scrapped the bottom with this o...,11,60,4.545455,4,0.363636,theyve scrapped bottom oh bad,-0.7,0.666667
23,"I had to take several breaks, walk the dogs, p...",126,691,4.492063,53,0.420635,take several breaks walk dogs play videogames ...,-0.126515,0.454545
6,Perhaps Infinity War was the film of Thanos in...,151,828,4.490066,75,0.496689,perhaps infinity war film thanos imposed seeki...,-0.083036,0.615774
24,Only a month or so back I was talking to a fri...,316,1680,4.31962,134,0.424051,month back talking friend hate serious movies ...,0.025806,0.569636
13,I've just come from watching Endgame and I mus...,1790,9901,4.531844,852,0.475978,ive come watching endgame must disappointedi l...,0.068893,0.541729
3,This film is an emotional rollercoaster with s...,60,324,4.416667,24,0.4,film emotional rollercoaster coolest superhero...,0.083333,0.402778
16,3 hours out of my life I will never get back.\...,40,194,3.875,14,0.35,3 hours life never get back worth hype think s...,0.1,0.366667
14,The film has too many references to previous M...,82,459,4.609756,35,0.426829,film many references previous marvel films fri...,0.105724,0.38729
0,But its a pretty good film. A bit of a mess in...,462,2496,4.404762,195,0.422078,pretty good film bit mess parts lacking cohesi...,0.112519,0.54458
5,"After Avengers Infinity War, we waited for the...",464,2714,4.851293,188,0.405172,avengers infinity war waited avengers endgame ...,0.147947,0.501555
