Skip to content
Browse files

Crawling now takes days into account

* If a train overshoots a day, crawling is done for the next day
* Added relevant files
  • Loading branch information...
1 parent b6f3dbc commit db80071d7bcdfa44efa8f5f15db5947dffff5d6a @snktagarwal committed Apr 9, 2012
View
1,895 crawl_scripts/daily_delay/NewTrainStationDetail.txt
1,895 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
259 crawl_scripts/daily_delay/Segments.py
@@ -0,0 +1,259 @@
+##
+
+## Disambiguating stations: each list groups the station codes that belong to one city
+
+Delhi_stations = ['NDLS', 'DLI', 'DEC', 'DEE', 'DSA', 'NZM', 'ANVR', 'ANVT' ]  # presumably what the 'DELHI' placeholder in the segments below stands for -- TODO confirm
+Kolkata_stations = [ 'HWH', 'SDAH', 'KOAA' ]  # presumably the 'KOLKATA' placeholder -- TODO confirm
+Mumbai_stations = [ 'CSTM', 'BCT', 'BDTS', 'DR', 'DDR', 'LTT' ]  # presumably the 'MUMBAI' placeholder -- TODO confirm
+Chennai_stations = [ 'MAS', 'MS', 'TBM' ]  # presumably the 'CHENNAI' placeholder -- TODO confirm
+Bangalore_stations = [ 'SBC', 'BNC', 'YPR', 'BNCE' ]  # presumably the 'BLORE' placeholder -- TODO confirm
+Hyderabad_stations = [ 'HYB', 'SC', 'KCG', 'BMT' ]  # presumably the 'HYDBAD' placeholder -- TODO confirm
+
+## This file lists all the important route segments, each with its in-between stations
+
+
+################### Northern Belt #####################
+
+# Amritsar(ASR) - Ambala(UMB)
+asr_umb = ['ASR','JRC','JUC','LDH','UMB']
+
+#Ambala(UMB) - Panipat - Delhi
+umb_delhi = ['UMB','PNP','DELHI']
+
+
+# Ambala(UMB) - Moradabad(MB)
+umb_mb = ['UMB','MB']
+
+# Delhi- Jaipur(JP)
+delhi_jp = ['DELHI','RE','AWR','JP']
+
+# Belt info
+north_belt = [ asr_umb, umb_delhi, umb_mb, delhi_jp ]
+
+
+
+################## Eastern & Gangetic Belt #####################
+
+# Delhi- mathura- Agra(AF/AGC)
+delhi_agc = ['DELHI','MTJ','AGC']
+
+# Delhi- aligarh- tundla - Etawah- Kanpur(CNB)
+delhi_cnb= ['DELHI','ALJN','TDL','ETW','CNB'] # this gives better results than the following two individually
+#delhi_tundla= ['DELHI','ALJN','TDL']
+#tundla_cnb= ['TDL','ETW','CNB']
+
+
+# Delhi- Moradabad(MB)
+delhi_mb= ['DELHI','MB']
+
+# Moradabad(MB)- Lucknow(LKO)
+mb_lko= ['MB','LKO']
+
+# Kanpur(CNB)- Allahabad(ALD)
+cnb_ald= ['CNB','ALD']
+
+
+# Allahabad(ALD)- Mugalsarai(MGS)
+ald_mgs= ['ALD','MGS']
+
+# Kanpur(CNB) - Lucknow(LKO)
+cnb_lko = ['CNB', 'LKO']
+
+# Lucknow(LKO)- Varanasi(BSB)
+lko_bsb = ['LKO','SLN','BSB']
+
+#Varanasi(BSB)- Sonpur(SEE)
+bsb_see= ['BSB','SEE']
+
+# Mugalsarai(MGS)- Ara - Danapur - Patna(PNBE)
+mgs_pnbe= ['MGS','ARA','PNBE']
+
+# Sonpur(SEE)- Hajipur- Barauni- Katihar(KIR)
+see_kir= ['SEE','HJP','BJU','KIR']
+
+# Mugalsarai(MGS)- Gaya(GAYA)
+mgs_gaya= ['MGS','DOS','GAYA']
+
+# Gaya(GAYA)- Gomoh - Dhanbad(DHN)
+gaya_dhn= ['GAYA', 'GMO', 'DHN']
+
+# Garwa Road - Barka Kana - Gomoh
+garwa_gomoh = ['GHD', 'BRKA', 'GMO']
+
+# Patna(PNBE)- Asansol(ASN)
+pnbe_asn= ['PNBE','ASN']
+
+# Dhanbad - Asansol(ASN)- Durgapur - Kolkata
+dhanbad_kolkata= ['DHN', 'ASN','DGR','KOLKATA']
+
+# Kolkata-Malda
+kolkata_mldt= ['KOLKATA','MLDT']
+
+# New Jalpaiguri - New Cooch Behar - New Bongaigaon - Guwahati
+njp_guwahati = ['NJP', 'NCB', 'NBQ', 'GHY']
+
+# Belt info
+igp_belt = [delhi_agc, delhi_cnb, delhi_mb, cnb_ald, cnb_lko, ald_mgs, \
+ lko_bsb, mb_lko, bsb_see, mgs_pnbe, see_kir, mgs_gaya, \
+ gaya_dhn, garwa_gomoh, pnbe_asn, dhanbad_kolkata, kolkata_mldt, njp_guwahati]
+
+################# Western Belt #######################
+
+#Jaipur(JP)- Ajmer(AII)- Marwar(MJ)
+jp_mj=['JP','AII','MJ']
+
+#Marwar(MJ)- Abu Rd(ABR) - Ahmedabad(ADI)
+mj_adi= ['MJ','ABR','ADI']
+
+# Ahmedabad(ADI)- Anand(ANND) -Vadodara(BRC)- Surat (ST)
+#adi_surat = ['ADI','ANND','BRC', 'ST']
+adi_surat = ['ADI','BRC', 'ST']
+
+# Ahmedabad(ADI) - Vadodara(BRC)
+#adi_brc = ['ADI', 'BRC' ]
+
+# Vadodara (BRC) - Surat (ST)
+#brc_surat = ['BRC', 'ST']
+
+# Bhusaval(BSL)- Manmad - Kalyan
+bsl_kalyan = ['BSL','MMR','KYN' ]
+
+# Surat- Mumbai(CSTM/BCT)
+st_mumbai = ['ST','MUMBAI']
+
+# Mumbai-Pune(PUNE)
+mumbai_pune = ['MUMBAI','PUNE']
+
+# KOTA(KOTA)- Ratlam(RTM)- Vadodara(BRC)
+kota_brc= ['KOTA','RTM','BRC']
+
+# Agra(AF/AGC)- KOTA(KOTA): no segment is defined for this route in this file
+
+# Belt info
+west_belt = [jp_mj, mj_adi, adi_surat, bsl_kalyan, st_mumbai, mumbai_pune, kota_brc]
+
+
+##################### South-eastern Belt ################
+# Kolkata - Kharagpur(KGP)
+kolkata_kgp = ['KOLKATA','KGP']
+
+# Kharagpur(KGP)- Bhubaneswar(BBS)
+kgp_bbs = ['KGP','BBS']
+
+# Bhubaneswar(BBS)- Vizianagaram - Vizag(VSKP)
+bbs_vskp = ['BBS','VZM','VSKP']
+
+# Vizag(VSKP) - Rajahmundry - Vijayawada(BZA)
+vskp_bza = ['VSKP', 'RJY', 'BZA']
+
+# Vijayawada(BZA) - Guntur - Chennai(MAS/MS)
+bza_chennai = ['BZA', 'GNT', 'CHENNAI']
+
+# Belt info
+se_belt = [kolkata_kgp, kgp_bbs, bbs_vskp, vskp_bza, bza_chennai]
+
+
+##################### Central Belt #################
+
+# Agra(AF/AGC)- Gwalior(GWL) - Jhansi(JHS)
+agc_jhs= ['AGC','GWL','JHS']
+
+# Jhansi(JHS)- Bina - Bhopal(BPL)
+jhs_bpl= ['JHS','BINA','BPL']
+
+# Bina - Katni(KTE)
+bina_kte= ['BINA','KTE']
+
+# Ujjain-Bhopal(BPL)- Itarsi(ET)
+ujn_et= ['UJN', 'BPL', 'ET']
+
+# Katni(KTE)- Jabalpur(JBP)- Itarsi(ET)
+kte_et= ['KTE','JBP','ET']
+
+# Itarsi(ET)- Bhusaval(BSL)
+et_bsl= ['ET','BSL']
+
+
+# Itarsi(ET)- Amla(AMLA) - Nagpur(NGP)
+et_ngp= ['ET','AMLA','NGP']
+
+# Wardha - Kazipet - Secunderabad(SC) - Hyderabad(HYB)
+wardha_hydbad= ['WR', 'KZJ', 'HYDBAD']
+
+# Katni(KTE)- Bilaspur(BSP) - Raipur(R)
+kte_r= ['KTE','BSP','R']
+
+# Bilaspur - Rourkela
+bilaspur_rourkela = [ 'BSP', 'ROU' ]
+
+# Raipur(R) - Titlagarh - Vizianagaram - Visakhapatnam(VSKP)
+r_vskp= ['R', 'TIG', 'VZM', 'VSKP']
+
+# Bhusaval(BSL)-Wardha-Nagpur(NGP)
+bsl_ngp= ['BSL', 'WR', 'NGP']
+
+# Belt info
+central_belt = [ agc_jhs, jhs_bpl, bina_kte, ujn_et, kte_et, et_bsl, et_ngp, \
+ wardha_hydbad, kte_r, bilaspur_rourkela, r_vskp, bsl_ngp]
+
+
+##################### Southern Belt #################
+
+# Pune(PUNE) - Solapur(SUR) - Wadi(WADI) - Secunderabad/Hyderabad(HYB)
+pune_hydbad = ['PUNE','SUR','WADI','HYDBAD']
+
+# Wadi(WADI)- Guntakal(GTL) - Bangalore(SBC/BNC)
+wadi_blore = ['WADI','GTL','BLORE']
+
+# Mumbai- RATNAGIRI(RN)-MADGAON(MAO)
+mumbai_mao= ['MUMBAI','RN','MAO']
+
+# MADGAON(MAO)- MANGALORE(MAQ)
+mao_maq= ['MAO','MAQ']
+
+# MANGALORE(MAQ)-Cannanore-Calicut(CLT)-Ernakulum(ERS)
+maq_ers= ['MAQ', 'CAN', 'CLT','ERS']
+
+# Ernakulum(ERS)-Alleppey-Trivandrum(TVC)
+ers_tvc= ['ERS', 'ALLP', 'TVC']
+
+# Guntakal(GTL) - Cuddapah(HX) - Renigunta(RU) - Chennai(MAS)
+gtl_chennai = ['GTL','HX','RU','CHENNAI']
+
+# Coimbatore - Salem - Chennai
+coimbatore_chennai = [ 'CBE', 'SA', 'CHENNAI' ]
+
+# Belt info
+south_belt = [pune_hydbad, wadi_blore, mumbai_mao, mao_maq, maq_ers, \
+ ers_tvc, gtl_chennai, coimbatore_chennai]
+
+
+###############################################
+# Flat list of every route segment defined above; all_segments_str below must list the same segments, by name, in the same order
+
+all_segments = [ asr_umb, umb_delhi, umb_mb, delhi_jp, delhi_agc, delhi_cnb, delhi_mb, mb_lko, cnb_ald, ald_mgs, cnb_lko, lko_bsb, bsb_see, mgs_pnbe, see_kir, mgs_gaya, gaya_dhn, garwa_gomoh, pnbe_asn, dhanbad_kolkata, kolkata_mldt, njp_guwahati, jp_mj, mj_adi, adi_surat, bsl_kalyan, st_mumbai, kota_brc, kolkata_kgp, kgp_bbs, bbs_vskp, vskp_bza, bza_chennai, agc_jhs, jhs_bpl, bina_kte, ujn_et, kte_et, et_bsl, et_ngp, wardha_hydbad, kte_r, bilaspur_rourkela, r_vskp, bsl_ngp, mumbai_pune, pune_hydbad, wadi_blore, mumbai_mao, mao_maq, maq_ers, ers_tvc, gtl_chennai, coimbatore_chennai ]
+
+all_segments_str = ['asr_umb','umb_delhi','umb_mb','delhi_jp','delhi_agc','delhi_cnb','delhi_mb','mb_lko','cnb_ald','ald_mgs','cnb_lko','lko_bsb','bsb_see','mgs_pnbe','see_kir','mgs_gaya','gaya_dhn','garwa_gomoh','pnbe_asn','dhanbad_kolkata','kolkata_mldt','njp_guwahati','jp_mj','mj_adi','adi_surat','bsl_kalyan','st_mumbai','kota_brc','kolkata_kgp','kgp_bbs','bbs_vskp','vskp_bza','bza_chennai','agc_jhs','jhs_bpl','bina_kte','ujn_et','kte_et','et_bsl','et_ngp','wardha_hydbad','kte_r','bilaspur_rourkela','r_vskp','bsl_ngp','mumbai_pune','pune_hydbad','wadi_blore','mumbai_mao','mao_maq','maq_ers','ers_tvc','gtl_chennai','coimbatore_chennai']  # variable names of all_segments, index for index
+
+all_belts = [north_belt, south_belt, igp_belt, se_belt, west_belt, central_belt]
+all_belts_str = ["NORTH", "SOUTH", "IGP", "SOUTH EAST", "WEST", "CENTRAL"]  # display names of all_belts, index for index
+
+segments_with_max_traffic1 = [asr_umb, umb_delhi, delhi_cnb, delhi_agc, cnb_ald, ald_mgs, mgs_pnbe, see_kir, dhanbad_kolkata, adi_surat, bsl_kalyan, st_mumbai, kolkata_kgp, vskp_bza, agc_jhs, jhs_bpl, kte_et, et_bsl, wardha_hydbad, bza_chennai, lko_bsb, ujn_et ]  # hand-picked subset of the segments; selection criterion is not shown here
+
+segments_with_max_traffic = [ delhi_cnb, adi_surat, jhs_bpl, delhi_agc, bsl_kalyan, dhanbad_kolkata, asr_umb, agc_jhs, ujn_et, kte_et, cnb_ald, st_mumbai, bza_chennai, mgs_pnbe, umb_delhi, mb_lko, vskp_bza, see_kir, coimbatore_chennai, ald_mgs, et_bsl, kolkata_kgp, lko_bsb, mgs_gaya, wardha_hydbad, mumbai_pune, bbs_vskp, kota_brc, kte_r, gaya_dhn, pune_hydbad, delhi_mb, r_vskp, maq_ers, kgp_bbs, et_ngp, bsl_ngp, kolkata_mldt, njp_guwahati, cnb_lko, jp_mj, wadi_blore, delhi_jp, ers_tvc, mj_adi, umb_mb, gtl_chennai, pnbe_asn, bilaspur_rourkela, mumbai_mao, garwa_gomoh, bina_kte, bsb_see, mao_maq ]  # presumably ordered by decreasing traffic -- TODO confirm
+
+if __name__=='__main__':
+
+ # Check if the belts have been defined for all segments: a segment printed with no belt name after it is missing from every belt list
+
+ print len(all_segments)  # total number of segments
+ for s in all_segments:
+
+ print s  # the segment's station list, followed by one line per belt that contains it
+ if s in north_belt: print 'North'
+ if s in igp_belt: print 'IGP'
+ if s in west_belt: print 'WEST'
+ if s in se_belt: print 'SE BELT'
+ if s in central_belt: print 'CENTRAL'
+ if s in south_belt: print 'SOUTH'
+
View
62 crawl_scripts/daily_delay/Utilities.py
@@ -0,0 +1,62 @@
+import fileinput
+import sys
+import glob
+import numpy
+# File containing segments of Indian Railways
+import Segments
+
+class UtilitiesStat:  # loads the station / train lookup tables from datasets/ and offers small conversion helpers
+
+ def __init__(self):  # reads three '||'-delimited data files; the paths are relative to the working directory
+ self.stn_codes, self.stn_names = self.stnCodes('datasets/AllStationCodes.txt')
+ self.trn_stn = self.trainStnList('datasets/NewTrainStation.txt')
+ self.tr_conv= self.trainNoConv('datasets/TrainNoConv.txt')
+
+ def toMin(self, time):  # converts an 'HH:MM' string to hours*60 + minutes
+
+ try:
+ minutes = int(time.split(':')[0])*60 + int(time.split(':')[1])
+ except:  # NOTE(review): bare except -- any value that cannot be parsed (or is not a string) silently becomes 0
+ minutes = 0
+
+ return minutes
+
+ def stnCodes(self, filename):  # returns (station name -> code, code -> station name)
+
+ f = open(filename, 'r')  # the handle is never closed explicitly
+ stn_codes = {}
+ stn_names = {}
+
+ for l in f.readlines():
+
+ code, station = l.strip().split('||')  # each line is 'CODE||STATION'; any other field count raises ValueError
+ stn_codes[station] = code
+ stn_names[code] = station
+
+ return stn_codes, stn_names
+
+ def trainStnList(self, filename):  # returns {first field of each line: list of the remaining '||'-separated fields}
+
+ trn_stn = {}
+ f = file(filename, 'r')  # Python 2 built-in file(); the handle is never closed explicitly
+
+ for l in f.readlines():
+ p = l.strip().split('||')
+ trn_stn[p[0]] = p[1:]
+ return trn_stn
+
+ def trainNoConv(self, filename):  # returns {first field: second field} for every line
+ f = file(filename, 'r')  # Python 2 built-in file(); the handle is never closed explicitly
+ tr_conv = {}
+ for l in f.readlines():
+ p = l.strip().split('||')
+ tr_conv[p[0]] = p[1]  # a line without a '||' separator raises IndexError here
+ return tr_conv
+
+ def convTrNo(self, tr_no):
+ """ Currently we have a bit of a mess: the train numbers are prefixed
+ with a 1 and then used to query, which is very incorrect. Hence
+ drop that first character, and then look up the actual train it should coincide with"""
+ return self.tr_conv[tr_no[1:]]  # raises KeyError when the stripped number is not in the conversion table
+
+
View
98 crawl_scripts/daily_delay/crawlRunning.py
@@ -2,76 +2,77 @@
import sys
import os
import logging
-import datetime
from BeautifulSoup import BeautifulSoup
import re
from urlparse import urlparse
from html2text import html2text
from sets import Set
import time
+import Utilities
+from datetime import date
+from dateutil.relativedelta import relativedelta
+import copy
+
+util = Utilities.UtilitiesStat()  # constructed at import time; its __init__ reads the datasets/ files
+
+
+now = date.today() - relativedelta(days = 2)  # despite the name, this is the date two days before today
-now = datetime.datetime.now()
datestring = now.strftime("%Y-%m-%d")
-#datestring = '2012-02-28'
running_info_out = 'RunningInfo-'+datestring+'.out'
-logger = logging.getLogger('myapp')
-hdlr = logging.FileHandler('crawlRunningInfo.log')
-formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
-hdlr.setFormatter(formatter)
-logger.addHandler(hdlr)
-logger.setLevel(logging.INFO)
+print now
ri = file(running_info_out, 'w')
-def parseRunningInfoOutput(filename):
+def parseTrainListNew(filename):  # returns {train number: [[station code, scheduled arrival, scheduled departure], ...]}
- # parses and returns a list of train numbers already crawled
- crawled = Set()
- ri_out = file(filename, 'r')
+ """ Parses the list of trains and their stations, including information about
+ the scheduled timings """
- ri_lines = ri_out.readlines()
+ f = file(filename, 'r')
+ lines = f.readlines()
- for l in ri_lines:
+ trainDetails = {}
- parts = l.split(':')
+ for l in lines:
- if parts[0] == 'Train Number':
+ parts = l.split()  # whitespace-separated: the train number, then groups of four fields per station
+ tr_no = parts[0]  # a blank line raises IndexError here
+ parts = parts[1:]
+ trainDetails[tr_no] = []
- crawled.add(str(parts[1].strip()))
+ while(len(parts)>0):
+ stn_code, sch_arr, sch_dep, src_dist = parts[0:4]  # src_dist is read but not stored; a trailing group with fewer than 4 fields raises ValueError
+ if sch_arr == 'Source': sch_arr = -1  # the literal 'Source' in the arrival column becomes -1
+ else: sch_arr = util.toMin(sch_arr)
+ if sch_dep == 'Desitation': sch_dep = -1  # NOTE(review): 'Desitation' looks like a misspelling of 'Destination' -- confirm against NewTrainStationDetail.txt; if the data says 'Destination', this branch never fires and toMin() turns the value into 0
+ else: sch_dep = util.toMin(sch_dep)
+ trainDetails[tr_no].append([stn_code, sch_arr, sch_dep])
+ parts = parts[4:]
- return crawled
+ return trainDetails
-def parseTrainList(filename, type):
+def crawlDataWithDate(trainDetails, path):  # fetches one running-status page per (train, station) with wget into path, then parses it
- # Crawls the file based on whether it is old or new.
- # 'old' or 'new'
- f = file(filename, 'r')
- trainList = f.readlines()
- trainDetails = {}
- for t in trainList:
+ for (k,v) in trainDetails.iteritems():
- parts = t.strip().split('||')
- trainNo = parts[0]
- stnList = parts[1:]
+ d = date.today() - relativedelta(days = 2)  # every train restarts from the crawl start date (today minus two days)
+ prev_arr = -10  # lower than the -1 'Source' marker, so the first station never triggers a rollover
- if type == 'old':
+ for t in v:
- key = trainNo
- trainDetails[key] = stnList
+ # Check if the train changes day: a scheduled arrival earlier than the previous station's means the query date moves to the next day
+ if t[1] < prev_arr: d = d + relativedelta(days = +1)  # NOTE(review): util.toMin() yields 0 for unparsable times, which would also trigger this -- confirm the input is clean
+ prev_arr = t[1]
- return trainDetails
+ d1 = d.isoformat().split('-')  # ['YYYY', 'MM', 'DD']
-def crawlDataWithDate(trainDetails, d, path):
- d1 = d.isoformat().split('-')
- for (k,v) in trainDetails.iteritems():
- for station in v:
+ station = t[0]
crawlHTTP = "http://www.trainenquiry.com/o/RunningIslTrSt.aspx?tr="+str(k)+"&st="+str(station)+"+&dt="+d1[2]+"%2f"+d1[1]+"%2f"+d1[0]  # dt= is sent as DD/MM/YYYY ('%2f' is a URL-encoded '/')
- logger.info('Fetching: '+crawlHTTP)
- crawlOP = path+'/'+str(k)+'.'+str(station)+'.'+str(d)+'.html'
- logger.info('Putting: '+crawlOP)
+ crawlOP = path+'/'+str(k)+'.'+str(station)+'.'+str(now)+'.html'  # the file name carries the crawl start date (now), not the rolled-over date d
wgetCall = 'wget -o wget.log -O '+crawlOP+' \"'+crawlHTTP+'\"'  # NOTE(review): the command is assembled by string concatenation and run through the shell
- logger.info('Wget Call: '+wgetCall)
+ print wgetCall
os.system(wgetCall)
parseFile(crawlOP, str(k))
@@ -117,18 +118,13 @@ def backupRunningInfo(filename):
if __name__=='__main__':
+ trainDetails = parseTrainListNew('NewTrainStationDetail.txt')  # input path is relative to the working directory
filename = 'RunningInfo-'+datestring+'.out'
- # Always backup before doing any changes. Backup
- # to the current time
- #backupRunningInfo('RunningInfo-2011-08-25.out')
- #crawled = parseRunningInfoOutput('RunningInfo-2011-08-25.out')
- trainDetails = parseTrainList(sys.argv[1],sys.argv[2])
+
print trainDetails
+
print 'Total trains: '+str(len(trainDetails))
- #print 'Crawled Already: '+str(len(crawled))
- if sys.argv[3] == 'recover':
- trainDetails = pruneCrawledTrains(trainDetails, crawled)
- print 'List to crawl: '+str(len(trainDetails))
+
os.system('mkdir ./crawl-'+datestring)  # output directory is named after the crawl start date
- crawlDataWithDate(trainDetails, datetime.date(now.year,now.month,now.day),'./crawl-'+datestring)
+ crawlDataWithDate(trainDetails, './crawl-'+datestring)  # the crawl date is now computed inside crawlDataWithDate

0 comments on commit db80071

Please sign in to comment.
Something went wrong with that request. Please try again.