
# Accessing KDC Data from Python    

### Main Question: Does the proximity of taxi ride pickup locations to tourist locations affect total ride cost?
### 1.) Import Taxi Data from KDC
### 2.) Reformat Data to Answer Question
### 3.) Run a Linear Regression


### First, we load the Python packages/libraries needed to run our code.

In [1]:
import os
import pyodbc
#import textwrap
from getpass import getpass
#import pandas as pd
#import numpy as np
#import statsmodels.formula.api as smf
#import statsmodels.api as sm
#from geopy.distance import distance
#from geopy.geocoders import Nominatim
#import matplotlib.pyplot as plt
#import warnings
#warnings.filterwarnings("ignore", category=DeprecationWarning)

#%matplotlib inline

# Fall back to the .odbc directory below for ODBCSYSINI, but leave any value
# already present in the environment untouched.
os.environ.setdefault('ODBCSYSINI', '/home/awc6034/.odbc')

### You will need to install some of these packages on KLC.  You can do so with the following command:

In [None]:
# Template command: replace <package name> with the package you need to install.
pip install --user <package name>

### Next, please enter your KDC credentials here.

In [2]:
# Domain-qualified login; the raw string keeps the single backslash literal.
USER = r'kellogg\awc6034'
# Ask for the password interactively so it is never stored in the notebook.
PASSWORD = getpass(prompt='NetID password: ')

NetID password: ········


### Finally, we can establish a KDC connection.

In [3]:
# Build the ODBC connection string from its key=value parts and connect.
conn = pyodbc.connect(';'.join((
    'DSN=kdc-tds',
    'Database=TAXI_NYC',
    'UID=' + USER,
    'PWD=' + PASSWORD,
)))
# Cursor reused by the query cells below.
cursor = conn.cursor()

### Now we can count the number of observations in both Taxi data tables. 

In [4]:
# Row count of the fare table (first column of the single result row).
taxi_fare_count = cursor.execute(
    "SELECT COUNT(*) FROM TAXI_NYC.dbo.SRC_FareData"
).fetchone()[0]
print(taxi_fare_count)

# Row count of the trip table.
taxi_trip_count = cursor.execute(
    "SELECT COUNT(*) FROM TAXI_NYC.dbo.SRC_TripData"
).fetchone()[0]
print(taxi_trip_count)

173179759
173179759


### PLEASE FEEL FREE TO REVIEW THE REMAINDER OF THIS NOTEBOOK LATER.

In [None]:
# Pull a 1000-row sample from each table for a quick look at the data.

# Fare table sample.
taxi_fare = cursor.execute(
    "SELECT TOP 1000 * FROM TAXI_NYC.dbo.SRC_FareData"
).fetchall()
# Trip table sample.
taxi_trip = cursor.execute(
    "SELECT TOP 1000 * FROM TAXI_NYC.dbo.SRC_TripData"
).fetchall()

In [None]:
# Join the Fare and Trip Datasets
# textwrap is imported in this cell because the notebook's first cell leaves
# its import commented out; without it the dedent call raises NameError.
import textwrap

# The trip table is left-joined to the fare table on license, medallion,
# vendor and pickup time. Several FareData column names are bracket-quoted
# with a leading space (e.g. [ hack_license]); that space is part of the name.
sql = textwrap.dedent("""
  SELECT TOP 1000 * FROM TAXI_NYC.dbo.SRC_TripData as Trip
    LEFT OUTER JOIN TAXI_NYC.dbo.SRC_FareData as Fare
    ON (Trip.hack_license = Fare.[ hack_license])
    AND (Trip.medallion = Fare.medallion)
    AND (Trip.vendor_id = Fare.[ vendor_id])
    AND (Trip.pickup_datetime = Fare.[ pickup_datetime])
""")

In [None]:
# read results into pandas dataframe
# pandas is imported in this cell because the notebook's first cell leaves
# its import commented out; without it pd is undefined here.
import pandas as pd

# Runs the join query over the open connection and loads the rows.
taxi_df = pd.read_sql(sql, conn)

In [None]:
# Shorten the pickup/dropoff timestamp column names (index labels are passed
# through str), then trim surrounding whitespace from every column name.
taxi_df = taxi_df.rename(
    index=str,
    columns={"pickup_datetime": "pickup", "dropoff_datetime": "dropoff"},
).rename(columns=str.strip)

# Show the resulting column names.
print(taxi_df.columns.tolist())

In [None]:
# get geolocation for the Nasdaq sign in Times Square, NYC
# Imported in this cell because the notebook's first cell leaves the geopy
# import commented out.
from geopy.geocoders import Nominatim

# Nominatim asks clients to identify themselves; geopy 2.x raises a
# ConfigurationError when no custom user_agent is supplied.
geolocator = Nominatim(user_agent='kdc-taxi-tutorial')
location = geolocator.geocode('4 Times Square, NYC')
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the next line.
if location is None:
    raise RuntimeError("Nominatim returned no match for '4 Times Square, NYC'")
# (latitude, longitude) tuple used as the reference point for distances.
location_coord = (location.latitude, location.longitude)
print(location)
print(location_coord)

In [None]:
# Imported in this cell because the notebook's first cell leaves these
# imports commented out; without them pd, np and distance are undefined.
import numpy as np
import pandas as pd
from geopy.distance import distance

# create pickup_lat_long column
# Coordinates are converted with to_numeric before being paired up.
pickup_latitude = pd.to_numeric(taxi_df['pickup_latitude'])
pickup_longitude = pd.to_numeric(taxi_df['pickup_longitude'])
print('pickup_longitude')
print(pickup_longitude[0:15])

# Each row gets a (latitude, longitude) tuple, the order geopy expects.
taxi_df['pickup_lat_long'] = list(zip(pickup_latitude, pickup_longitude))

# create dropoff_lat_long column
dropoff_latitude = pd.to_numeric(taxi_df['dropoff_latitude'])
dropoff_longitude = pd.to_numeric(taxi_df['dropoff_longitude'])

taxi_df['dropoff_lat_long'] = list(zip(dropoff_latitude, dropoff_longitude))

# get geographic trip distance
# Point-to-point distance in miles between pickup and dropoff, printed next
# to the recorded trip_distance column for comparison.
taxi_df['geo_trip_distance'] = taxi_df.apply(lambda x: distance(x['pickup_lat_long'], x['dropoff_lat_long']).miles, axis=1)
print('geo_trip_distance')
print(taxi_df['geo_trip_distance'][0:15])
print('trip_distance')
print(taxi_df['trip_distance'][0:15])

# calculate miles from pickup location to Times Square
taxi_df['miles_to_location'] = taxi_df['pickup_lat_long'].apply(lambda x: distance(location_coord, x).miles)
#print(taxi_df['miles_to_location'][0:15])

# remove errant distance data
# Anything farther than 5000 miles from the reference point is treated as a
# bad coordinate and blanked out rather than dropped.
taxi_df.loc[taxi_df['miles_to_location'] > 5000, 'miles_to_location'] = np.nan
#print(taxi_df['miles_to_location'][0:15])

# convert total_amount to float
taxi_df['total_amount'] = taxi_df['total_amount'].astype(float)

In [None]:
# Perform regression
# Imported in this cell because the notebook's first cell leaves the
# statsmodels import commented out; without it smf is undefined.
import statsmodels.formula.api as smf

# Ordinary least squares of total fare on distance from the reference point.
lm = smf.ols(formula='total_amount ~ miles_to_location', data=taxi_df).fit()

In [None]:
# Report the fitted coefficients first, then the R-squared of the fit.
for fit_stat in (lm.params, lm.rsquared):
    print(fit_stat)

In [None]:
# plot data and least squares for data
# Imported in this cell because the notebook's first cell leaves the
# matplotlib import commented out; without it plt is undefined.
import matplotlib.pyplot as plt

taxi_df.plot(kind='scatter', x='miles_to_location', y='total_amount')
# Errant distances were set to NaN earlier; drop them so the predictor and
# the prediction stay the same length and the line has no gaps.
X = taxi_df['miles_to_location'].dropna()
# A formula-based model looks predictors up by column name, so pass a
# one-column DataFrame rather than a bare Series.
Y = lm.predict(X.to_frame())
plt.plot(X, Y, c='red', linewidth=2)

## To make this analysis meaningful you would need to examine trips with similar geospatial distance. 
## How would you limit the data to trips of a certain distance range?