Skip to content

Commit

Permalink
Included 2018 US movies via '2018 in film' page, formatted movies.jso…
Browse files Browse the repository at this point in the history
…n & included 1930-2018 movies
  • Loading branch information
prust committed Oct 5, 2018
1 parent f0e6229 commit f7b29fd
Show file tree
Hide file tree
Showing 3 changed files with 187,425 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

JSON data on American movies scraped from Wikipedia.

`movies.json` contains all the data from Wikipedia in convenient form: an array of objects, each representing a movie, with a `title` string, `year` integer, a `director` string, a `cast` string, a `genre` string and a `notes` string -- each representing the text content of those respective columns in the Wikipedia tables:
`movies.json` contains a list of American movies from 1930 to 2018, scraped from Wikipedia, in a convenient form: an array of objects, each representing a movie, with a `title` string, a `year` integer, a `director` string, a `cast` string, a `genre` string and a `notes` string -- each representing the text content of the respective column in the Wikipedia tables:

```javascript
[
Expand Down
27 changes: 22 additions & 5 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ var cheerio = require('cheerio');
var async = require('async');

var years = [];
for (var year = 1900; year < 2017; year++)
for (var year = 1930; year <= 2018; year++)
years.push(year);

async.mapSeries(years, scrapeMoviesForYear, function(err, results) {
Expand All @@ -21,7 +21,15 @@ async.mapSeries(years, scrapeMoviesForYear, function(err, results) {
function scrapeMoviesForYear(year, callback) {
// setTimeout() so wikipedia doesn't hate us for slamming their servers
setTimeout(function() {
request('https://en.wikipedia.org/wiki/List_of_American_films_of_' + year, function(err, res, body) {
console.log('loading movies from ' + year);

var url;
if (year == 2018)
url = 'https://en.wikipedia.org/wiki/2018_in_film';
else
url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_' + year;

request(url, function(err, res, body) {
if (err)
throw err;
if (res.statusCode != 200)
Expand Down Expand Up @@ -60,7 +68,16 @@ function scrapeMoviesForYear(year, callback) {
var director_cell = title_cell.next();
var cast_cell = director_cell.next();
var genre_cell = cast_cell.next();
var notes_cell = genre_cell.next();
if (year == 2018) {
var country_cell = genre_cell.next();

// filter to just US films, like the other years
if (country_cell.text().indexOf('US') == -1)
return;
}
else {
var notes_cell = genre_cell.next();
}

var movie_data = {
title: title_cell.text(),
Expand All @@ -73,7 +90,7 @@ function scrapeMoviesForYear(year, callback) {
movies.push(movie_data);

var m = movie_data;
console.log(m.title + ':', m.director, m.cast, m.genre, m.notes);
// console.log(m.title + ':', m.director, m.cast, m.genre, m.notes);
});
});

Expand All @@ -83,7 +100,7 @@ function scrapeMoviesForYear(year, callback) {
}

function toCommaDelimitedList(cell) {
var text = cell.text().trim();
var text = cell && cell.text().trim();
if (text)
return text.split('\n').join(', ');
else
Expand Down
187,403 changes: 187,402 additions & 1 deletion movies.json

Large diffs are not rendered by default.

0 comments on commit f7b29fd

Please sign in to comment.