In [70]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import datetime
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [71]:
def get_month_name(argument):
    switcher = {
        1: "January",
        2: "February",
        3: "March",
        4: "April",
        5: "May",
        6: "June",
        7: "July",
        8: "August",
        9: "September",
        10: "October",
        11: "November",
        12: "December"
    }
    return switcher.get(argument, "Invalid month")

In [72]:
def check_missing_vals_and_process(data):
    print("Missing Values, since there is no missing values, not printing those")
    #print(data.isnull().sum())
    return data

In [73]:
def get_code_for_time(dateTime, consider30Min=True):
      hour = dateTime.hour
      code = 1
      if hour != 0:
        code = (hour * 2) + 1
      if consider30Min and dateTime.minute >= 30:
        code = code + 1
      return code

def add_date_info(data):
    print("Date Conversion has been started")
    data['pickup_datetime'] = pd.to_datetime(data.pickup_datetime)
    #data['drop_datetime'] = pd.to_datetime(data.drop_datetime)
    data['weekday'] = data.pickup_datetime.dt.day_name()
    data['pickup_time'] = data.pickup_datetime.dt.time
    #data['dropoff_time'] = data.drop_datetime.dt.time
    data['pickup_date'] = data.pickup_datetime.dt.date
    #data['drop_date'] = data.drop_datetime.dt.date
    data['month'] = data.pickup_datetime.dt.month
    data['pickup_time_code'] = data['pickup_datetime'].apply(get_code_for_time)
    #data['dropoff_time_code'] = data['drop_datetime'].apply(get_code_for_time)
    # Finding Trip DUration by deducting pickup time from Drop Time
    data['trip_duration'] = (pd.to_datetime(data.drop_datetime) - data['pickup_datetime']).dt.total_seconds() / 60


In [74]:
def last_day_of_month(any_day):
    # this will never fail
    # get close to the end of the month for any day, and add 4 days 'over'
    next_month = any_day.replace(day=28) + datetime.timedelta(days=4)
    # subtract the number of remaining 'overage' days to get last day of current month, 
    # or said programattically said, the previous day of the first of next month
    return next_month - datetime.timedelta(days=next_month.day)

def remove_ambiguous_dates(data, month, year):
    print("Removing ambiguous dates.")
    olen = data.shape[0]
    ambg_dates = data[(data['pickup_date'] < datetime.date(year, month, 1)) | 
                      (data['pickup_date'] > last_day_of_month(datetime.date(year, month, 1)))]
    print("Ambiguous dates for month ", get_month_name(month)," year ", year , " is ", len(ambg_dates.index))
    data.drop(ambg_dates.index, axis='rows', inplace=True) # drop dates out of range
    print("Number of rows dropped: {}".format(olen - data.shape[0]))

In [75]:
def removing_trip_duration_is_less_than_eq_0(data):
    # removing negative values
    print("Removing Trip duration is less than or equal to 0")
    olen = data.shape[0]
    data.drop(data[data['trip_duration'] < 0].index, inplace=True)
    print("Number of rows dropped: {}".format(olen - data.shape[0]))

def show_trip_duration_in_per(data):
    print("Trip duration in percentile:")
    for i in np.arange(99,100,0.1):
        var = data["trip_duration"].values
        var = np.sort(var,axis = None)
        print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
    print ("100 percentile value is ",var[-1])
    
def remove_rec_based_trip_duration_per(data, month, year):
    # looking further than 99.5 percentiles
    #suddenIncrease = calculate_sudden_increase_trip_duration(month, year)
    print("Removing outliers, as TLC regulation max trip duration can be 720 minutes")
    #print("Also there is sudden increase after {} minutes".format(suddenIncrease))
    olen = data.shape[0]
    data.drop(data[data['trip_duration'] > 720].index, inplace=True) 
    print('Number of rows dropped: {}'.format(olen - data.shape[0]))
    
def remove_rec_based_on_trip_duration(data, month, year):
    #show_trip_duration_in_per(data)
    removing_trip_duration_is_less_than_eq_0(data)
    #remove_rec_based_trip_duration_per(data, month, year)
    

In [76]:
def removing_trip_distance_is_less_than_eq_0(data):
    # removing negative values
    print("Removing Trip distance is less than or equal to 0")
    olen = data.shape[0]
    data.drop(data[data['trip_distance'] < 0].index, inplace=True)
    print("Number of rows dropped: {}".format(olen - data.shape[0]))

def show_trip_distance_in_per(data):
    lis=[]
    for i in np.arange(99.9999,99.99999,0.00001):
        var = data["trip_distance"].values
        var = np.sort(var,axis = None)
        inc = var[int(len(var)*(float(i)/100))]
        lis.append(inc)
        print("{} percentile value is {}".format(i,inc))
    print ("100 percentile value is ",var[-1])
    lis.append(var[-1])

def remove_rec_based_trip_distance_per(data, month, year):
    # looking further than 99.5 percentiles
    suddenIncrease = calculate_trip_distance_threshold(month, year)
    print("Removing outliers, large distance beyond threshold {}".format(suddenIncrease))
    olen = data.shape[0]
    data.drop(data[data['trip_distance'] > suddenIncrease].index, inplace=True) 
    print('Number of rows dropped: {}'.format(olen - data.shape[0]))

def remove_trip_distance_id_lower_than_avg(data, month, year):
    removing_trip_distance_is_less_than_eq_0(data)
    #show_trip_distance_in_per(data)
    #remove_rec_based_trip_distance_per(data, month, year)
    

In [77]:
def preprocess_data(data, month, year):
    #Keep required columns
    req_data = data[['tpep_pickup_datetime','tpep_dropoff_datetime', 'trip_distance', 
                     'PULocationID', 'DOLocationID']]
    #Rename columns
    req_data.rename(columns={'tpep_pickup_datetime':'pickup_datetime', 
                             'tpep_dropoff_datetime':'drop_datetime',  
                             'PULocationID':'PULID',
                             'DOLocationID':'DOLID'}, 
                    inplace=True) # rename columns
    req_data = check_missing_vals_and_process(req_data)
    add_date_info(req_data)
    #Removing no more required columns as we have converted it into required columns
    remove_ambiguous_dates(req_data, month, year)
    remove_rec_based_on_trip_duration(req_data, month, year)
    remove_trip_distance_id_lower_than_avg(req_data, month, year)
    return req_data

In [90]:
def add_weather_data(yellow_car_data, month, year):
    print("Calculating weather data")
    weather_dir_path = os.path.join(os.path.dirname(notebook_path), 'weather_data')
    file_path = os.path.join(weather_dir_path, 'wwo_{}_{}.csv'.format(year, month))
    wwo_data = pd.read_csv(file_path)
    wwo_data['date'] = pd.to_datetime(wwo_data.date).dt.date
    
    def filter_row_from_wwo(pickup_date, pickup_time_code):
        return wwo_data[(wwo_data['date'] == pickup_date) & (wwo_data['time_code'] == pickup_time_code)]

    def get_weather_data(pickup_datetime):
        wwo_row = filter_row_from_wwo(pickup_datetime.date, get_code_for_time(pickup_datetime, False))
        if wwo_row.empty:
           return 0,0,0
        else:
            return wwo_row.iloc[0]['totalSnow_cm'], wwo_row.iloc[0]['FeelsLikeC'], wwo_row.iloc[0]['precipMM']
    yellow_car_data['totalSnow_cm'],yellow_car_data['FeelsLikeC'],yellow_car_data['precipMM'] = yellow_car_data['pickup_datetime'].apply(get_weather_data)
    
    

In [91]:
def process_csv_file(filePath, month, year):
    print("**************************Processing Month({})**************************".format(get_month_name(month)))
    print("Processing:", filePath)
    #Load dataset
    data = pd.read_csv(filePath)
    data = preprocess_data(data, month, year)
    add_weather_data(data, month, year)
    data.drop(['pickup_datetime','drop_datetime'], inplace=True, axis=1)
    print("{} month's total records: {}".format(get_month_name(month), data.shape[0]))
    data.to_csv("{}_{}.csv".format(year, month))

In [92]:
def load_data_for_year(dir_path, year):
    [ process_csv_file(os.path.join(dir_path, str(month) + ".csv"), month, year) for month in range(1,2) ]
    

In [93]:
notebook_path = os.path.abspath("taxi_demand_prediction_pre.ipynb")
for year in range(2018, 2019):
    print()
    print("============================Processing Year({})============================".format(year))
    load_data_for_year(os.path.join(os.path.dirname(notebook_path), str(year)), year)


**************************Processing Month(January)**************************
Processing: /Users/archanapatil890/Documents/Machine Learning/python/Capstone/2018/1.csv
Missing Values, since there is no missing values, not printing those
Date Conversion has been started
Removing ambiguous dates.
Ambiguous dates for month  January  year  2018  is  342
Number of rows dropped: 342
Removing Trip duration is less than or equal to 0
Number of rows dropped: 2
Removing Trip distance is less than or equal to 0
Number of rows dropped: 0
Calculating weather data


KeyboardInterrupt: 