# Method 1: Scrape the IMDb Top 250 chart with requests and BeautifulSoup

In [None]:
from bs4 import BeautifulSoup
import requests
import re

# Download IMDB's Top 250 chart page.
url = 'http://www.imdb.com/chart/top'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page into empty lists.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Each list below holds one value per chart row; they are combined by position later.
movies = soup.select('td.titleColumn')
# Look the title anchors up once: both the link and the crew tooltip live on the same <a>.
title_anchors = soup.select('td.titleColumn a')
links = [a.attrs.get('href') for a in title_anchors]
crew = [a.attrs.get('title') for a in title_anchors]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
# 'td.ratingColumn strong' has no data-value attribute (it produced None for every row),
# so the vote count is read from the poster column instead.
# NOTE(review): assumes the count sits in span[name=nv] beside span[name=ir] - confirm against the live markup.
votes = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=nv]')]

# The lists are indexed in lockstep, so a length mismatch would misalign or crash later on.
assert len(links) == len(crew) == len(ratings) == len(votes) == len(movies), \
    "scraped columns have different lengths; the page markup may have changed"

imdb = []

# Once whitespace is collapsed, a chart cell reads "<place>. <title> (<year>)".
# Anchoring the year at the end keeps parentheses and dots inside a title intact.
entry_pattern = re.compile(r'^(\d+)\.\s*(.*?)\s*\(([^()]*)\)$')

# Store each item into a dictionary (data), then put those into a list (imdb)
for index, cell in enumerate(movies):
    # Separate movie into: 'place', 'title', 'year'
    movie_string = cell.get_text()
    movie = ' '.join(movie_string.split())
    match = entry_pattern.match(movie)
    if match is None:
        raise ValueError("Unexpected chart entry at row {}: {!r}".format(index, movie))
    # Take the rank from the text itself: its width differs from the 0-based loop
    # index at rows 10 and 100, so slicing by len(str(index)) truncates it there.
    place, movie_title, year = match.groups()
    data = {"movie_title": movie_title,
            "year": year,
            "place": place,
            "star_cast": crew[index],
            "rating": ratings[index],
            "vote": votes[index],
            "link": links[index]}
    imdb.append(data)


In [None]:
# Preview only the first few records; displaying the whole list floods the notebook output.
imdb[:5]

[{'movie_title': 'The Shawshank Redemption',
  'year': '1994',
  'place': '1',
  'star_cast': 'Frank Darabont (dir.), Tim Robbins, Morgan Freeman',
  'rating': '9.22216705827553',
  'vote': None,
  'link': '/title/tt0111161/'},
 {'movie_title': 'The Godfather',
  'year': '1972',
  'place': '2',
  'star_cast': 'Francis Ford Coppola (dir.), Marlon Brando, Al Pacino',
  'rating': '9.148913507904668',
  'vote': None,
  'link': '/title/tt0068646/'},
 {'movie_title': 'The Godfather: Part II',
  'year': '1974',
  'place': '3',
  'star_cast': 'Francis Ford Coppola (dir.), Al Pacino, Robert De Niro',
  'rating': '8.981090710216321',
  'vote': None,
  'link': '/title/tt0071562/'},
 {'movie_title': 'The Dark Knight',
  'year': '2008',
  'place': '4',
  'star_cast': 'Christopher Nolan (dir.), Christian Bale, Heath Ledger',
  'rating': '8.967495342945185',
  'vote': None,
  'link': '/title/tt0468569/'},
 {'movie_title': '12 Angry Men',
  'year': '1957',
  'place': '5',
  'star_cast': 'Sidney Lumet 

In [None]:
import pandas as pd

In [None]:
# Build the table from the scraped records, fixing the column order explicitly.
imdb_df = pd.DataFrame(
    data=imdb,
    columns=["movie_title", "year", "place", "star_cast", "rating", "vote", "link"],
)
# Show the first rows as a quick sanity check.
imdb_df.head()

Unnamed: 0,movie_title,year,place,star_cast,rating,vote,link
0,The Shawshank Redemption,1994,1,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.22216705827553,,/title/tt0111161/
1,The Godfather,1972,2,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.148913507904668,,/title/tt0068646/
2,The Godfather: Part II,1974,3,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.981090710216321,,/title/tt0071562/
3,The Dark Knight,2008,4,"Christopher Nolan (dir.), Christian Bale, Heat...",8.967495342945185,,/title/tt0468569/
4,12 Angry Men,1957,5,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.921553062696638,,/title/tt0050083/


In [None]:
# Save the scraped table as movie.csv in the current working directory.
imdb_df.to_csv(path_or_buf='movie.csv')

# Method 2: Crawl IMDb search results with a Scrapy spider

In [None]:
import scrapy
from scrapy.loader import ItemLoader
from imbd.items import imdbItem
import mysql.connector
import csv


class BasicSpider(scrapy.Spider):
    """Crawl IMDB title-search result pages and yield one item per listed title.

    The start URL queries feature films for the years 2014-2018, sorted by
    "moviemeter" ascending. Result pages are followed through the "next" link
    until none is left.
    """

    name = 'basic'
    allowed_domains = ['www.imdb.com']
    start_urls = ['https://www.imdb.com/search/title/?year=2014,2018&title_type=feature&sort=moviemeter,asc/']

    def parse(self, response):
        """Extract every result row on the page, then queue the next page.

        Args:
            response: the downloaded search-result page.

        Yields:
            One loaded ``imdbItem`` per result row, followed by a request for
            the next result page when the page links to one.
        """
        rows = response.xpath('//*[@class="lister-item mode-advanced"]')

        for row in rows:
            # All XPaths below are relative (".//"), so each field is scoped to its own row.
            loader = ItemLoader(item=imdbItem(), selector=row)
            loader.add_xpath('title', './/*[@class="lister-item-header"]/a/text()')
            loader.add_xpath('genres', './/*[@class="genre"]/text()')
            loader.add_xpath('year', './/*[@class="lister-item-year text-muted unbold"]/text()')
            # Directors are the links that appear before the "Stars" label, stars the links after it.
            loader.add_xpath('director', './/*[@class="lister-item-content"]/p[contains(text(), "Director")]/a[following-sibling::text()[contains(., "Stars")]]/text()')
            loader.add_xpath('stars', './/*[@class="lister-item-content"]//a[preceding-sibling::text()[contains(., "Stars")]]/text()')
            loader.add_xpath('runtime', './/*[@class="runtime"]/text()')
            loader.add_xpath('rating', './/*[@class="inline-block ratings-imdb-rating"]/@data-value')
            loader.add_xpath('vote', './/*[@class="sort-num_votes-visible"]/span[2]/@data-value')
            loader.add_xpath('gross', './/*[@class="sort-num_votes-visible"]/span[5]/@data-value')
            loader.add_xpath('metascore', './/*[@class="inline-block ratings-metascore"]/span/text()')

            yield loader.load_item()

        next_page_url = response.xpath('//*[@class="lister-page-next next-page"]/@href').extract_first()
        # The last page has no "next" link; joining a missing href would only
        # re-request the page that was just parsed, so stop paginating instead.
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url, callback=self.parse)
                