In [1]:
import pandas as pd
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

In [2]:
station_dict = {'松山機場': '松山機場_0',
                 '中山國中': '中山國中_1',
                 '南京復興': '南京復興_2',
                 '忠孝復興': '忠孝復興_3',
                 '大安': '大安_4',
                 '科技大樓': '科技大樓_5',
                 '六張犁': '六張犁_6',
                 '麟光': '麟光_7',
                 '辛亥': '辛亥_8',
                 '萬芳醫院': '萬芳醫院_9',
                 '萬芳社區': '萬芳社區_10',
                 '木柵': '木柵_11',
                 '動物園': '動物園_12',
                 '大直': '大直_13',
                 '劍南路': '劍南路_14',
                 '西湖': '西湖_15',
                 '港墘': '港墘_16',
                 '文德': '文德_17',
                 '內湖': '內湖_18',
                 '大湖公園': '大湖公園_19',
                 '葫洲': '葫洲_20',
                 '東湖': '東湖_21',
                 '南港軟體園區': '南港軟體園區_22',
                 '南港展覽館': '南港展覽館_23',
                 '小碧潭': '小碧潭_24',
                 '新店': '新店_25',
                 '新店區公所': '新店區公所_26',
                 '七張': '七張_27',
                 '大坪林': '大坪林_28',
                 '景美': '景美_29',
                 '萬隆': '萬隆_30',
                 '公館': '公館_31',
                 '台電大樓': '台電大樓_32',
                 '古亭': '古亭_33',
                 '中正紀念堂': '中正紀念堂_34',
                 '小南門': '小南門_35',
                 '頂溪': '頂溪_36',
                 '永安市場': '永安市場_37',
                 '景安': '景安_38',
                 '南勢角': '南勢角_39',
                 '台大醫院': '台大醫院_40',
                 '台北車站': '台北車站_41',
                 '中山': '中山_42',
                 '雙連': '雙連_43',
                 '民權西路': '民權西路_44',
                 '圓山': '圓山_45',
                 '劍潭': '劍潭_46',
                 '士林': '士林_47',
                 '芝山': '芝山_48',
                 '明德': '明德_49',
                 '石牌': '石牌_50',
                 '唭哩岸': '唭哩岸_51',
                 '奇岩': '奇岩_52',
                 '北投': '北投_53',
                 '新北投': '新北投_54',
                 '復興崗': '復興崗_55',
                 '忠義': '忠義_56',
                 '關渡': '關渡_57',
                 '竹圍': '竹圍_58',
                 '紅樹林': '紅樹林_59',
                 '淡水': '淡水_60',
                 '頂埔': '頂埔_61',
                 '永寧': '永寧_62',
                 '土城': '土城_63',
                 '海山': '海山_64',
                 '亞東醫院': '亞東醫院_65',
                 '府中': '府中_66',
                 '板橋': '板橋_67',
                 '新埔': '新埔_68',
                 '江子翠': '江子翠_69',
                 '龍山寺': '龍山寺_70',
                 '西門': '西門_71',
                 '善導寺': '善導寺_72',
                 '忠孝新生': '忠孝新生_73',
                 '忠孝敦化': '忠孝敦化_74',
                 '國父紀念館': '國父紀念館_75',
                 '市政府': '市政府_76',
                 '永春': '永春_77',
                 '後山埤': '後山埤_78',
                 '昆陽': '昆陽_79',
                 '南港': '南港_80',
                 '象山': '象山_81',
                 '台北101/世貿': '台北101/世貿_82',
                 '信義安和': '信義安和_83',
                 '大安森林公園': '大安森林公園_84',
                 '北門': '北門_85',
                 '松江南京': '松江南京_86',
                 '台北小巨蛋': '台北小巨蛋_87',
                 '南京三民': '南京三民_88',
                 '松山': '松山_89',
                 '輔大': '輔大_90',
                 '新莊': '新莊_91',
                 '頭前庄': '頭前庄_92',
                 '先嗇宮': '先嗇宮_93',
                 '三重': '三重_94',
                 '菜寮': '菜寮_95',
                 '台北橋': '台北橋_96',
                 '大橋頭站': '大橋頭站_97',
                 '中山國小': '中山國小_98',
                 '行天宮': '行天宮_99',
                 '東門': '東門_100',
                 '蘆洲': '蘆洲_101',
                 '三民高中': '三民高中_102',
                 '徐匯中學': '徐匯中學_103',
                 '三和國中': '三和國中_104',
                 '三重國小': '三重國小_105',
                 '迴龍': '迴龍_106',
                 '丹鳳': '丹鳳_107'}

In [3]:
def combine_files():
    """
    Returns a pandas Dataframe object containing all Taipei subway data in the
    /Data folder.
    
    Warning: make sure the current directory has the /Data folder
    """
    
    #creates a list object containg strings that match all the files name
    csv_ = []
    for i in range(201701, 201713):
        csv_.append(i)
    for i in range(201801, 201813):
        csv_.append(i)
    for i in range(201901, 201903):
        csv_.append(i)
    csv_files = []
    for i in csv_:
        i = str(i) + '.csv'
        csv_files.append(i)
    
    #Reads individual files and Concats them into one Dataframe
    csv_list = []
    for csv in csv_files:
        print('reading ' + csv)
        df = pd.read_csv('Data/' + csv, skiprows=[1], header=0, 
                         sep="\s+", error_bad_lines=False, low_memory=False)
        df = df.iloc[:-1, :]
        
        
        df = dd.from_pandas(df, npartitions=10)
        csv_list.append(df)
    print('Concating files...')
    mta = dd.concat(csv_list, axis=0, interleave_partitions=True)
    
    #Changes Chinese columns to English
    mta.columns = ['Date', 'Time', 'Entry', 'Exit', 'Rides']
    
#     #Appends each Chinese station names a unique number for readability

#     print('Add Numbers to Station Names...')
#     replace = mta.map_partitions(pd.DataFrame.replace, 
#                                  {'Entry': station_dict, "Exit": station_dict}, 
#                                  inplace=True)
#     with ProgressBar():
#         out = replace.compute()
#     print('Done!')
    
    return mta

In [4]:
taipei = combine_files()

reading 201701.csv


b'Skipping line 7581603: expected 5 fields, saw 6\n'


reading 201702.csv


b'Skipping line 6858435: expected 5 fields, saw 6\n'


reading 201703.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201704.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201705.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201706.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201707.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201708.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201709.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201710.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201711.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201712.csv


b'Skipping line 7639923: expected 5 fields, saw 6\n'


reading 201801.csv


b'Skipping line 7581603: expected 5 fields, saw 6\n'


reading 201802.csv


b'Skipping line 6858435: expected 5 fields, saw 6\n'


reading 201803.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201804.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201805.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201806.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201807.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201808.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201809.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201810.csv


b'Skipping line 7593267: expected 5 fields, saw 6\n'


reading 201811.csv


b'Skipping line 7348323: expected 5 fields, saw 6\n'


reading 201812.csv


b'Skipping line 7639923: expected 5 fields, saw 6\n'


reading 201901.csv


b'Skipping line 7581603: expected 5 fields, saw 6\n'


reading 201902.csv


b'Skipping line 6858435: expected 5 fields, saw 6\n'


Concating files...


In [5]:
taipei.describe().compute()  

Unnamed: 0,Rides,0
count,193319100.0,
mean,8.45242,
std,23.29365,
min,0.0,
25%,,1.0
50%,,3.0
75%,,17.0
max,5737.0,


In [6]:
taipei.columns

Index(['Date', 'Time', 'Entry', 'Exit', 'Rides'], dtype='object')

In [10]:
taipei[taipei['Rides'] == 5737].compute()

Unnamed: 0,Date,Time,Entry,Exit,Rides
7574446,2017-12-31,22,台北車站,台北101/世貿,5737.0


In [15]:
station_dict.keys()

AttributeError: 'dict_keys' object has no attribute 'tolist'

In [16]:
number = []
for i in range(0, 108):
    number.append(i)

In [18]:
sta = dict(zip(station_dict.keys(), number))

In [20]:
taipei['nEntry'] = taipei['Entry'].map(sta)

In [22]:
taipei['nExit'] = taipei['Exit'].map(sta)

In [23]:
taipei.head()

Unnamed: 0,Date,Time,Entry,Exit,Rides,nEntry,nExit
0,2017-01-01,0,松山機場,松山機場,0.0,0,0
1,2017-01-01,0,松山機場,中山國中,0.0,0,1
2,2017-01-01,0,松山機場,南京復興,0.0,0,2
3,2017-01-01,0,松山機場,忠孝復興,0.0,0,3
4,2017-01-01,0,松山機場,大安,0.0,0,4


In [28]:
dd.to_datetime(taipei['Date'], infer_datetime_format=True).compute()

KeyboardInterrupt: 

In [37]:
taipei.to_csv('Data/total/taipei*.csv')

['Data/total/taipei00.csv',
 'Data/total/taipei01.csv',
 'Data/total/taipei02.csv',
 'Data/total/taipei03.csv',
 'Data/total/taipei04.csv',
 'Data/total/taipei05.csv',
 'Data/total/taipei06.csv',
 'Data/total/taipei07.csv',
 'Data/total/taipei08.csv',
 'Data/total/taipei09.csv',
 'Data/total/taipei10.csv',
 'Data/total/taipei11.csv',
 'Data/total/taipei12.csv',
 'Data/total/taipei13.csv',
 'Data/total/taipei14.csv',
 'Data/total/taipei15.csv',
 'Data/total/taipei16.csv',
 'Data/total/taipei17.csv',
 'Data/total/taipei18.csv',
 'Data/total/taipei19.csv',
 'Data/total/taipei20.csv',
 'Data/total/taipei21.csv',
 'Data/total/taipei22.csv',
 'Data/total/taipei23.csv',
 'Data/total/taipei24.csv',
 'Data/total/taipei25.csv',
 'Data/total/taipei26.csv',
 'Data/total/taipei27.csv',
 'Data/total/taipei28.csv',
 'Data/total/taipei29.csv',
 'Data/total/taipei30.csv',
 'Data/total/taipei31.csv',
 'Data/total/taipei32.csv',
 'Data/total/taipei33.csv',
 'Data/total/taipei34.csv',
 'Data/total/taipei3

In [47]:
tp = dd.read_csv('Data/total/taipei*.csv', parse_dates=['Date'])

In [48]:
tp.head()

Unnamed: 0.1,Unnamed: 0,Date,Time,Entry,Exit,Rides,nEntry,nExit
0,0,2017-01-01,0,松山機場,松山機場,0.0,0,0
1,1,2017-01-01,0,松山機場,中山國中,0.0,0,1
2,2,2017-01-01,0,松山機場,南京復興,0.0,0,2
3,3,2017-01-01,0,松山機場,忠孝復興,0.0,0,3
4,4,2017-01-01,0,松山機場,大安,0.0,0,4


In [49]:
tp.dtypes

Unnamed: 0             int64
Date          datetime64[ns]
Time                   int64
Entry                 object
Exit                  object
Rides                float64
nEntry                 int64
nExit                  int64
dtype: object