# STAGE 3a: PROCESS - Save cleaned CSV files

Loop through the CSV files and save the cleaned CSVs in `./data/cleaned_csv`.

This notebook runs the code from Clean by month on every file. We iterate through the file list stored in <font color='red'>file_list_2020.csv</font>.

In [1]:
import datetime
import os

import pandas as pd
from scipy import stats
from scipy.stats import zscore

# Columns that are read with the pandas 'category' dtype.
_category_columns = [
    'rideable_type',
    'start_station_name',
    'start_station_id',
    'end_station_name',
    'end_station_id',
    'member_casual',
]

# dtype mapping passed to pd.read_csv for every trip file; ride_id is read as 'str'.
dtypes = {'ride_id': 'str', **dict.fromkeys(_category_columns, 'category')}

# file_list_2020.csv is read with header=None, so its first line is treated as data;
# the single column is named 'filename' and holds the trip files to process.
file_list_df = pd.read_csv('file_list_2020.csv', names=['filename'], header=None)
file_list = file_list_df['filename'].values

def read_csv_to_df(filename, dtype, data_dir='./Data/csv/', date_columns=('started_at', 'ended_at')):
    """Read one trip CSV into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``data_dir``.
    dtype : dict
        Column-name -> dtype mapping forwarded to ``pd.read_csv``.
    data_dir : str, optional
        Directory holding the raw CSV files. Defaults to ``'./Data/csv/'``,
        the location that was previously hard coded.
        NOTE(review): this default is spelled with a capital ``D`` while the
        notebook writes its outputs under ``./data/``; on a case-sensitive
        filesystem those are different directories -- confirm which is intended.
    date_columns : sequence of str, optional
        Columns parsed as datetimes. Defaults to the previously hard-coded
        ``started_at`` and ``ended_at``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with ``date_columns`` parsed as datetimes and
        ``dtype`` applied.
    """
    return pd.read_csv(
        os.path.join(data_dir, filename),
        parse_dates=list(date_columns),
        dtype=dtype,
    )

# Cleaning thresholds and output locations used by the loop below.
MIN_STATION_SAMPLES = 30   # stations with fewer coordinate samples are not z-scored
Z_SCORE_LIMIT = 3          # a z-score beyond +/- this value marks an outlier
DELETED_ROWS_DIR = './data/deleted_rows/'
CLEANED_CSV_DIR = './data/cleaned_csv/'


def _station_coordinate_outliers(df, coord, min_samples=MIN_STATION_SAMPLES, z_limit=Z_SCORE_LIMIT):
    """Return index labels of trips whose station coordinate is a per-station outlier.

    Start and end stations are stacked so every station id is evaluated on all
    of its coordinate samples at once. A trip is flagged when the z-score of
    its start or end coordinate, computed within its station id, falls outside
    +/- ``z_limit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``start_station_id``, ``end_station_id`` and the
        ``start_<coord>`` / ``end_<coord>`` columns.
    coord : str
        Coordinate suffix, ``'lat'`` or ``'lng'``.
    min_samples : int
        Station ids with fewer samples than this are excluded from the
        z-score calculation (and therefore never flagged).
    z_limit : float
        Absolute z-score beyond which a sample counts as an outlier.

    Returns
    -------
    list
        Labels from ``df.index``; a label appears twice when both ends of the
        same trip are outliers.
    """
    station_coord = 'station_' + coord
    start_obs = df[['start_station_id', 'start_' + coord]].rename(
        columns={'start_station_id': 'station_id', 'start_' + coord: station_coord})
    end_obs = df[['end_station_id', 'end_' + coord]].rename(
        columns={'end_station_id': 'station_id', 'end_' + coord: station_coord})

    # reset_index() turns the original trip index into a column so it survives the stacking.
    obs = pd.concat([start_obs, end_obs], axis=0).reset_index()
    obs.columns = ['index_orig', 'station_id', station_coord]

    # The number of samples should be large enough for a relevant z-score calculation.
    counts = obs.groupby(['station_id'])['station_id'].count()
    sparse_station_ids = counts[counts < min_samples].index
    obs = obs[~obs['station_id'].isin(sparse_station_ids)]
    if obs.empty:
        return []

    z = obs.groupby(['station_id'])[station_coord].transform(lambda x: zscore(x, ddof=1))
    # NaN z-scores (e.g. a station whose samples are all identical) compare False
    # on both sides and are therefore kept.
    is_outlier = (z < -z_limit) | (z > z_limit)
    return list(obs.loc[is_outlier, 'index_orig'])


# to_csv does not create missing directories, so make sure the targets exist.
os.makedirs(DELETED_ROWS_DIR, exist_ok=True)
os.makedirs(CLEANED_CSV_DIR, exist_ok=True)

## Here we make a loop with the file_list
# NOTE(review): the slice skips the first three entries of file_list; nothing
# visible here explains why -- confirm this is intended and not a leftover
# from testing on file_list[3].
for filename in file_list[3:]:

    df = read_csv_to_df(filename, dtypes)

    ### Drop rows with null data
    rows_to_delete_na = df[df.isnull().any(axis=1)].index
    df = df[~df.index.isin(rows_to_delete_na)]

    ### Drop rows with time end less than time start
    rows_to_delete_timenegative = df[df['ended_at'] < df['started_at']].index
    df = df[~df.index.isin(rows_to_delete_timenegative)]

    ### Lat/long consistency per station id: flag coordinates that are
    ### outliers among all samples recorded for the same station id.
    rows_to_delete_list_lat = _station_coordinate_outliers(df, 'lat')
    rows_to_delete_list_lng = _station_coordinate_outliers(df, 'lng')

    ### It is very possible the station names with (Temp) are for testing only.
    # Rows that contain a station name with Temp are deleted. Both ends of the
    # trip are checked: previously only start_station_name was inspected, so
    # trips ending at a Temp station stayed in the cleaned data.
    is_temp_station = (df['start_station_name'].str.contains("Temp")
                       | df['end_station_name'].str.contains("Temp"))
    rows_to_delete_temp = list(df[is_temp_station].index)

    ### Final clean up: delete rows with lat/lng outliers and with temp stations
    rows_to_drop = list(set(rows_to_delete_list_lat)
                        | set(rows_to_delete_list_lng)
                        | set(rows_to_delete_temp))
    df = df[~df.index.isin(rows_to_drop)]

    ### Calculate trip duration (seconds) and drop duration outliers
    duration = (df['ended_at'] - df['started_at']).dt.total_seconds()
    duration.name = 'duration'

    duration_zscore_calc = (duration - duration.mean()) / duration.std()
    duration_outliers = duration[(duration_zscore_calc > Z_SCORE_LIMIT)
                                 | (duration_zscore_calc < -Z_SCORE_LIMIT)]
    rows_to_delete_duration = duration_outliers.index
    df = df[~df.index.isin(rows_to_delete_duration)]

    ### Collect the index of all the deleted rows and output as csv for record
    rows_deleted = list(set(rows_to_delete_na)
                        | set(rows_to_delete_timenegative)
                        | set(rows_to_delete_duration)
                        | set(rows_to_delete_list_lat)
                        | set(rows_to_delete_list_lng)
                        | set(rows_to_delete_temp))

    file_dest = DELETED_ROWS_DIR + 'row_deleted_for_' + filename
    pd.DataFrame(rows_deleted).to_csv(file_dest)

    # The cleaned frame is written with its index (pandas default), i.e. the
    # same row labels that are recorded in the deleted-rows file.
    df.to_csv(CLEANED_CSV_DIR + filename)

