Commit 24771f0
Updated for new version of psp.cz web
Libor Nenadál committed Nov 11, 2012
1 parent 4e50bf2 commit 24771f0
Showing 9 changed files with 88 additions and 147 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -3,4 +3,6 @@
.pydevproject
.scrapy
.settings
*.pyc
.idea
*.pyc
*.iml
1 change: 1 addition & 0 deletions psp_cz/__init__.py
@@ -0,0 +1 @@
from .database import Base
2 changes: 1 addition & 1 deletion psp_cz/database.py
@@ -15,5 +15,5 @@ def init_db():
# import all modules here that might define models so that
# they will be registered properly on the metadata. Otherwise
# you will have to import them first before calling init_db()
import models
import psp_cz_models
Base.metadata.create_all(bind=engine)
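
For orientation, a minimal sketch of the declarative scaffolding this init_db() belongs to, assuming the usual SQLAlchemy layout; the engine URL and factory arguments below are placeholders, not values taken from this commit:

    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import scoped_session, sessionmaker

    # Assumed setup: Base, db_session, engine and init_db() are the only names
    # actually visible in the diff and its importers.
    engine = create_engine('sqlite:///psp_cz.db')  # placeholder connection string
    db_session = scoped_session(sessionmaker(bind=engine))
    Base = declarative_base()

    def init_db():
        # Import every module that defines models before create_all() so the
        # tables get registered on Base.metadata; this commit renames that
        # module from models to psp_cz_models.
        import psp_cz_models  # Python 2 implicit relative import, as in the commit
        Base.metadata.create_all(bind=engine)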
34 changes: 21 additions & 13 deletions psp_cz/pipelines.py
@@ -1,17 +1,22 @@
import re

import hashlib
from sqlalchemy.orm.exc import NoResultFound
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals, log
from scrapy.exceptions import DropItem
from psp_cz.database import db_session, init_db
from psp_cz.items import ParlMembVote, Voting, Sitting, ParlMemb
from psp_cz.models import Voting as TVoting, \
ParlMemb as TParlMemb, \
ParlMembVoting as TParlMembVoting, \
Sitting as TSitting, \
Region as TRegion, \
PolitGroup as TPolitGroup

from .database import init_db
from .database import db_session
from .items import ParlMembVote
from .items import Voting
from .items import Sitting
from .items import ParlMemb
from .psp_cz_models import Voting as TVoting
from .psp_cz_models import ParlMemb as TParlMemb
from .psp_cz_models import ParlMembVoting as TParlMembVoting
from .psp_cz_models import Sitting as TSitting
from .psp_cz_models import Region as TRegion
from .psp_cz_models import PolitGroup as TPolitGroup

# Define your item pipelines here
#
@@ -121,7 +126,9 @@ def process_item(self, item, spider):
parl_memb = TParlMemb(url=item['url'],
name_full=item['name'],
born=item['born'],
picture_hash=item['images'][0]['checksum'],
# file is named by url hash - see
# ImagesPipeline.image_key()
picture_hash=hashlib.sha1(item['image_urls'][0]).hexdigest(),
gender=item['gender'],
region=region,
polit_group=polit_group)
@@ -131,7 +138,9 @@ def process_item(self, item, spider):
parl_memb.url = item['url']
parl_memb.name_full = item['name']
parl_memb.born = item['born']
parl_memb.picture_hash = item['images'][0]['checksum']
# file is named by url hash - see
# ImagesPipeline.image_key()
parl_memb.picture_hash = hashlib.sha1(item['image_urls'][0]).hexdigest()
parl_memb.gender = item['gender']
parl_memb.region = region
parl_memb.polit_group = polit_group
@@ -154,7 +163,6 @@ def get_db_voting(self, item):
def get_db_parl_memb(self, item):
"""Helper procedure that fetches DB ParlMemb entity based on ParlMembVote or ParlMemb Item"""
url = None
parlMemb = None
if isinstance(item, ParlMembVote):
url = item['parl_memb_url']
elif isinstance(item, ParlMemb):
@@ -164,7 +172,7 @@ def get_db_parl_memb(self, item):
parlMemb = db_session.query(TParlMemb).filter_by(url=url).one()
except NoResultFound:
# search for urls with further parameters
parlMemb = db_session.query(TParlMemb).filter(TParlMemb.url.like(url+'&%')).one()
parlMemb = db_session.query(TParlMemb).filter(TParlMemb.url.like(url+'&%')).first()
return parlMemb

def get_db_parl_memb_vote(self, item):
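
The two picture_hash changes above stop reading item['images'][0]['checksum'] and instead recompute the name under which scrapy.contrib.pipeline.images.ImagesPipeline stores the downloaded file: the SHA-1 of the image URL, as the new in-line comments note (see ImagesPipeline.image_key()). A small illustration with a made-up URL:

    import hashlib

    # Hypothetical image URL, only to show the naming scheme the pipeline relies on.
    image_url = 'http://www.psp.cz/eknih/cdrom/poslanci/i000.jpg'

    # The project runs on Python 2, where hashlib.sha1() accepts the URL string
    # directly; encoding it first gives the same digest for ASCII URLs and also
    # works on Python 3.
    file_stem = hashlib.sha1(image_url.encode('utf-8')).hexdigest()
    print('full/%s.jpg' % file_stem)  # relative path under IMAGES_STORE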
57 changes: 7 additions & 50 deletions psp_cz/models.py → psp_cz/psp_cz_models.py
@@ -1,13 +1,16 @@
# coding=utf-8
from sqlalchemy import func, Column, Integer, String, Date, BigInteger, DateTime
from sqlalchemy import func, Column, Integer, String, Date, DateTime
from sqlalchemy.orm import relationship, object_mapper, ColumnProperty
from sqlalchemy.schema import ForeignKey, UniqueConstraint, Index
from database import Base
import datetime

from . import Base

class BaseMixin(object):
id = Column(Integer, primary_key=True)

created = Column(DateTime, nullable=False, server_default=func.now())
last_modified = Column(DateTime, nullable=False, default=func.now())

def values(self):
"""JSON serialization of model attributes.
@@ -24,7 +27,7 @@ def values(self):
result[key] = value.isoformat()

return result

class Sitting(BaseMixin, Base):
__tablename__ = 'sitting'
__table_args__ = (
@@ -53,7 +56,6 @@ class Voting(BaseMixin, Base):
sitting_id = Column(Integer, ForeignKey('sitting.id'), nullable=False)

parlMembVotings = relationship('ParlMembVoting', backref='voting')
votingReviews = relationship('VotingReview', backref='voting')

class ParlMemb(BaseMixin, Base):
__tablename__ = 'parl_memb'
@@ -84,51 +86,6 @@ class ParlMembVoting(BaseMixin, Base):
parl_memb_id = Column(Integer, ForeignKey('parl_memb.id'), nullable=False)
voting_id = Column(Integer, ForeignKey('voting.id'), nullable=False)

class UserVoting(BaseMixin, Base):
__tablename__ = 'user_voting'
__table_args__ = (
UniqueConstraint('user_id', 'voting_review_id'),
Index('ix_uv_user_id', 'user_id'),
Index('ix_uv_voting_review_id', 'voting_review_id'),
)

vote = Column(String(1), nullable=False)
user_id = Column(Integer, ForeignKey('app_user.id'), nullable=False)
voting_review_id = Column(Integer, ForeignKey('voting_review.id'), nullable=False)
created = Column(DateTime, nullable=False, server_default=func.now())

class VotingReview(BaseMixin, Base):
__tablename__ = 'voting_review'
__table_args__ = (
UniqueConstraint('voting_id', 'user_id'),
Index('ix_vr_user_id', 'user_id'),
)
title = Column(String(160), nullable=False)
reasoning = Column(String)
voting_id = Column(Integer, ForeignKey('voting.id'), nullable=False)
user_id = Column(Integer, ForeignKey('app_user.id'), nullable=False)
vote_sugg = Column(String(1), nullable=False)
created = Column(DateTime, nullable=False, server_default=func.now())

userVotings = relationship('UserVoting', backref='votingReview')

class User(BaseMixin, Base):
__tablename__ = 'app_user'
__table_args__ = (
Index('ix_usr_fb_id', 'fb_id'),
)
fb_id = Column(BigInteger, unique=True)
name = Column(String(255))
first_name = Column(String(55))
last_name = Column(String(200))
url = Column(String(4000))
gender = Column(String(10))
created = Column(DateTime, nullable=False, server_default=func.now())
last_modified = Column(DateTime, nullable=False, default=func.now)

votingReviews = relationship('VotingReview', backref='user')
userVotings = relationship('UserVoting', backref='user')

class Region(BaseMixin, Base):
__tablename__ = 'region'

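
BaseMixin.values() is only partially visible in this diff; a rough reconstruction of what it appears to do, based on the visible fragment and the object_mapper/ColumnProperty imports (the exact body is an assumption, not shown by the commit):

    import datetime

    from sqlalchemy.orm import ColumnProperty, object_mapper

    def values(self):
        """JSON serialization of model attributes (assumed reconstruction)."""
        result = {}
        for prop in object_mapper(self).iterate_properties:
            if not isinstance(prop, ColumnProperty):
                continue
            key = prop.key
            value = getattr(self, key)
            if isinstance(value, (datetime.date, datetime.datetime)):
                # matches the visible line: result[key] = value.isoformat()
                value = value.isoformat()
            result[key] = value
        return result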
5 changes: 2 additions & 3 deletions psp_cz/settings.py
@@ -5,18 +5,17 @@
#
# http://doc.scrapy.org/topics/settings.html
#
import os

BOT_NAME = 'psp_cz'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['psp_cz.spiders']
NEWSPIDER_MODULE = 'psp_cz.spiders'
DEFAULT_ITEM_CLASS = 'scrapy.item.Item'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline', 'psp_cz.pipelines.DBStorePipeline']
WEBSERVICE_ENABLED = False
TELNETCONSOLE_ENABLED = False
IMAGES_STORE = 'd:/devel/moji-poslanci/static/images/mp'
IMAGES_STORE = os.environ['IMAGES_STORE']
IMAGES_THUMBS = {
'small': (50, 50),
}
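
One thing to keep in mind about the settings change: os.environ['IMAGES_STORE'] raises a KeyError at import time when the variable is not exported. A more forgiving variant (a sketch, not what the commit does) would fall back to a local directory:

    import os

    # Use IMAGES_STORE from the environment when set, otherwise a local ./images dir.
    IMAGES_STORE = os.environ.get('IMAGES_STORE', os.path.join(os.getcwd(), 'images'))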
33 changes: 17 additions & 16 deletions psp_cz/spiders/poslanci_psp_cz_spider.py
@@ -9,7 +9,6 @@
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc

from psp_cz.database import db_session
from psp_cz.items import ParlMemb

class PoslanciPspCzSpider(CrawlSpider):
@@ -24,7 +23,7 @@ class PoslanciPspCzSpider(CrawlSpider):

rules = (
# follow links to parliamentary political groups
Rule(SgmlLinkExtractor(allow=('\/snem.sqw',)), callback='parse_parl_polit_groups', follow=False),
Rule(SgmlLinkExtractor(allow=('\/snem.sqw\?.*id\=',)), callback='parse_parl_polit_groups', follow=False),
)

def parse_parl_polit_groups(self, response):
@@ -33,15 +32,19 @@ def parse_parl_polit_groups(self, response):
hxs = HtmlXPathSelector(response)
base_url = get_base_url(response)

memb_links = hxs.select('/html/body/div[3]/div[2]/div[2]/table/tbody/tr')
memb_links = hxs.select('/html/body/div[2]/div/div/table/tbody//tr')

for member_link in memb_links:
region = member_link.select('td[4]/a/text()').extract()[0]
region_url = urljoin_rfc(base_url, member_link.select('td[4]/a/@href').extract()[0])
group = member_link.select('td[6]/a/text()').extract()[0]
group_long = member_link.select('td[6]/a/@title').extract()[0]
group_url = urljoin_rfc(base_url, member_link.select('td[6]/a/@href').extract()[0])
request_url = urljoin_rfc(base_url, member_link.select('td[2]/a/@href').extract()[0])
region = member_link.select('td[1]/a/text()').extract()[0]
region_url = urljoin_rfc(base_url, member_link.select('td[1]/a/@href').extract()[0])
group = member_link.select('td[2]/a/text()').extract()[0]
group_long = member_link.select('td[2]/a/@title').extract()[0]
group_url = urljoin_rfc(base_url, member_link.select('td[2]/a/@href').extract()[0])
request_url = urljoin_rfc(base_url, member_link.select('th/a/@href').extract()[0])
# There is one exception: the Miroslava Němcová detail page has a
# totally different structure. This URL parameter forces the same
# structure for everyone, with no exception for the chairperson.
request_url += '&zk=7'

request = Request(request_url,
self.parse_parl_memb,
@@ -62,27 +65,25 @@ def parse_parl_memb(self, response):
hxs = HtmlXPathSelector(response)
base_url = get_base_url(response)

picture_relative_url = hxs.select('/html/body/div[3]/div[2]/div[2]/table/tr/td[2]/a/img/@src').extract()[0]
# There is one exception when Miroslava Němcová has her birth date not in 4th cell
# but in the 2nd. Unfortunate, but whatever, I can get over it...
born_n_gender = hxs.select('/html/body/div[3]/div[2]/div[2]/table/tr').select('//td/text()').re(r'(Narozen.*)')[0]
picture_relative_url = hxs.select('//*[@id="main-content"]/div/div/div/a/img/@src').extract()[0]
born_n_gender = hxs.select('//*[@id="main-content"]/div/div/div/div/p/strong/text()').re(r'(Narozen.*)')[0]
gender = None
if born_n_gender.find('Narozen:') != -1:
gender = 'M'
elif born_n_gender.find('Narozena:') != -1:
gender = 'F'

born = datetime.strptime(born_n_gender.split(' ', 1)[1], '%d.\xc2\xa0%m.\xc2\xa0%Y')
born = datetime.strptime(born_n_gender.split(' ', 1)[1], '%d.%m.%Y')

parl_memb = ParlMemb()
parl_memb['id'] = response.meta['id']
parl_memb['url'] = response.meta['url']
parl_memb['url'] = response.meta['url'][:-5] # remove &zk=7
parl_memb['region'] = response.meta['region']
parl_memb['region_url'] = response.meta['region_url']
parl_memb['group'] = response.meta['group']
parl_memb['group_long'] = response.meta['group_long']
parl_memb['group_url'] = response.meta['group_url']
parl_memb['name'] = hxs.select('/html/body/div[3]/div/h2/text()').extract()[0]
parl_memb['name'] = hxs.select('//*[@id="main-content"]/h1/text()').extract()[0]
parl_memb['born'] = born
parl_memb['gender'] = gender
parl_memb['image_urls'] = [urljoin_rfc(base_url, picture_relative_url)]
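
The reworked parse_parl_memb() keys gender off the 'Narozen:'/'Narozena:' prefix and parses the birth date with the plain '%d.%m.%Y' format, since the new psp.cz layout no longer pads the date with the non-breaking spaces ('\xc2\xa0') the old format string had to accommodate. A quick illustration with a made-up sample string:

    from datetime import datetime

    # Hypothetical text as extracted from a member detail page.
    born_n_gender = 'Narozena: 17.11.1952'

    gender = None
    if born_n_gender.find('Narozen:') != -1:
        gender = 'M'
    elif born_n_gender.find('Narozena:') != -1:
        gender = 'F'

    # split(' ', 1)[1] leaves just the date part: '17.11.1952'
    born = datetime.strptime(born_n_gender.split(' ', 1)[1], '%d.%m.%Y')
    # gender == 'F', born == datetime(1952, 11, 17, 0, 0)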
