# **Practice Problem Set 2**

1. Create a dataset using an API with Python (use web scraping/web crawling to create
your own dataset) from any one of the following application domains (discussed in class).
* a. IMDB
* b. Flipkart
* c. Amazon
* d. Twitter

# **Creating a dataset of IMDb reviews for the top 500 movies using web scraping**

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# HTTP header asking for US-English content (with generic English as a lower-priority fallback);
# presumably meant to be sent with the IMDb requests in this notebook.
headers = {'Accept-Language':'en-US,en;q=0.5'}

In [3]:
# Start offsets for the search-result pages: 1, 51, ..., 451 (ten offsets, 50 apart).
page = 1 + 50 * np.arange(10)

In [4]:
# Build one search-results URL per start offset in `page`.
pages = [
    f"https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&start={start}&ref_=adv_nxt"
    for start in page
]

In [5]:
# Collect the reviews-page URL of every movie listed on the search-result pages.
links = []
for page_url in pages:
    # Send the Accept-Language header defined earlier, bound the wait with a timeout,
    # and stop on HTTP errors instead of silently parsing an error page into zero links.
    response = requests.get(page_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    for movie in soup.select(".lister-item-header"):
        anchor = movie.select_one("a")
        # select_one returns None when the header holds no link; skip such entries
        # rather than failing on None["href"].
        if anchor is None or not anchor.get("href"):
            continue
        links.append(f"https://www.imdb.com{anchor['href']}reviews?ref_=tt_urv")

In [6]:
# Number of collected review-page links (len counts list items directly;
# np.count_nonzero would first convert the list to an array and count non-empty strings).
len(links)

500

In [None]:
def _review_titles(soup):
    """Return the stripped text of every element matching the ``.title`` selector."""
    return [node.text.strip() for node in soup.select(".title")]


# Scrape the review titles of every movie, following the "load more" pagination.
reviews = []

# One session for the whole crawl (instead of a fresh, half-used one per movie),
# carrying the Accept-Language header defined earlier on every request.
session = requests.Session()
session.headers.update(headers)

for x in links:
    response = session.get(x, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    reviews.extend(_review_titles(soup))

    load_more = soup.find('div', class_='load-more-data')
    if load_more is None:
        # find() returns None when the page has no pagination marker;
        # there is nothing further to request for this movie.
        continue

    base_url = load_more['data-ajaxurl']
    # Continue from the pagination key the page itself provides.
    # NOTE(review): the original sent an empty paginationKey unless more than 25
    # titles matched on the first page; presumably that re-requested the first
    # batch and duplicated its reviews — confirm against the live endpoint.
    data_key = load_more.get('data-key')
    while data_key:
        load_more_url = f"https://m.imdb.com{base_url}?ref_=undefined&paginationKey={data_key}"
        response = session.post(load_more_url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        reviews.extend(_review_titles(soup))

        # Stop when the response carries no further pagination marker or key.
        load_more = soup.find('div', class_='load-more-data')
        data_key = load_more.get('data-key') if load_more is not None else None

In [None]:
# Number of scraped review titles; len matches the row count of the DataFrame
# built from this list even if a title is an empty string.
len(reviews)

In [None]:
# Wrap the scraped titles in a single-column frame named 'Review'.
review_data_frame = pd.DataFrame(reviews, columns=['Review'])

In [None]:
# index=False keeps 'Review' as the first CSV column. Writing the row index would add
# an unnamed leading column, so the later positional selection of the first column
# would pick the index instead of the review text. The classified export also omits the index.
review_data_frame.to_csv('/content/drive/MyDrive/Colab Notebooks/ML_LAB_PPS_2.0/IMDB_MOVIE_REVIEWS_DATASET_UNCLASSIFIED.csv', index=False)

In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np

In [8]:
# initialize the sentiment analyzer
# Download the 'vader_lexicon' NLTK resource, then create the analyzer instance
# that classify_review reads as the module-level name `sid`.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# define a function to classify a movie review as positive or negative
def classify_review(review):
    """Label ``review`` by the sign of its VADER compound score.

    Uses the module-level analyzer ``sid``. Returns 'positive' when the
    compound score is zero or above, otherwise 'negative'; a score of
    exactly 0 therefore counts as positive.
    """
    compound = sid.polarity_scores(review)['compound']
    return 'positive' if compound >= 0 else 'negative'

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [10]:
# Open CSV file
# Absolute path (presumably a Colab Google Drive mount); it is the same path
# the scraping section writes the unclassified reviews to.
file_path = "/content/drive/MyDrive/Colab Notebooks/ML_LAB_PPS_2.0/IMDB_MOVIE_REVIEWS_DATASET_UNCLASSIFIED.csv"
# Load the unlabelled review titles into a DataFrame.
df = pd.read_csv(file_path)

In [11]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Review,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,Some birds aren't meant to be caged.,,,,,,,,,,,,
1,An incredible movie. One that lives with you.,,,,,,,,,,,,
2,Don't Rent Shawshank.,,,,,,,,,,,,
3,This is How Movies Should Be Made,,,,,,,,,,,,
4,A classic piece of unforgettable film-making.,,,,,,,,,,,,


In [12]:
# Keep only the review text. Selecting by name (instead of "first column by position")
# stays correct even if the CSV carries a leading index column or other extra columns.
df = df[['Review']]
df

Unnamed: 0,Review
0,Some birds aren't meant to be caged.
1,An incredible movie. One that lives with you.
2,Don't Rent Shawshank.
3,This is How Movies Should Be Made
4,A classic piece of unforgettable film-making.
...,...
505671,"Very good technically, but depressing."
505672,Pointless nihilistic bullshit
505673,"good movie, but wished there had been less foc..."
505674,Splendid


In [13]:
# drop rows with null values
df = df.dropna()

# Drop rows in which any cell is purely numeric (e.g. "10"). str.isnumeric() is True
# only when every character is numeric, so text that merely contains digits is kept.
is_numeric_only = df.astype(str).apply(lambda col: col.str.isnumeric()).any(axis=1)
# .copy() makes the filtered frame independent of its parent, so adding a column
# to it later does not raise SettingWithCopyWarning.
df = df.loc[~is_numeric_only].copy()

In [14]:
# Label every review title with classify_review; results stay in row order.
Sentiments_data = [classify_review(text) for text in df['Review']]

In [15]:
# Number of labels produced; should equal the number of rows in df.
len(Sentiments_data)

505423

In [16]:
# Attach the labels as a new column. assign() returns a new frame instead of writing
# into one that came from filtering, which can raise SettingWithCopyWarning.
df = df.assign(Sentiment_Labels=Sentiments_data)

In [17]:
# Display the labelled frame (the notebook output abbreviates the middle rows).
df

Unnamed: 0,Review,Sentiment_Labels
0,Some birds aren't meant to be caged.,positive
1,An incredible movie. One that lives with you.,positive
2,Don't Rent Shawshank.,positive
3,This is How Movies Should Be Made,positive
4,A classic piece of unforgettable film-making.,positive
...,...,...
505671,"Very good technically, but depressing.",negative
505672,Pointless nihilistic bullshit,negative
505673,"good movie, but wished there had been less foc...",positive
505674,Splendid,positive


In [19]:
# Save the labelled reviews; index=False leaves the row index out of the CSV.
df.to_csv('/content/drive/MyDrive/Colab Notebooks/ML_LAB_PPS_2.0/IMDB_MOVIE_REVIEWS_DATASET_CLASSIFIED.csv',index=False)