## Make sure the data exists

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os.path
import matplotlib.pyplot as plt

# Unpack the Kaggle archive into the working directory only when train.csv
# is not already there; otherwise just report that the file is available.
if os.path.isfile('./train.csv'):
    print('train.csv file ready')
else:
    import zipfile

    with zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/train.zip', 'r') as archive:
        archive.extractall('./')


## Load the data

In [None]:
# Read the extracted training CSV into a DataFrame and preview its first rows.
data = pd.read_csv("./train.csv") 
data.head()

## Convert latitude and longitude to distance

In [None]:
# Geodesic pickup-to-dropoff distance in kilometres. The result is cached in
# distances.csv, so the per-row computation only runs when that file is absent.
if os.path.isfile('./distances.csv'):
    distances_f = pd.read_csv("./distances.csv")
    distances = distances_f["distance"].rename("distance")
else:
    from geopy.distance import geodesic

    def _trip_km(row):
        """Return the geodesic distance in km between a row's pickup and dropoff points."""
        pickup = (row["pickup_latitude"], row["pickup_longitude"])
        dropoff = (row["dropoff_latitude"], row["dropoff_longitude"])
        return geodesic(pickup, dropoff).kilometers

    distances = data.apply(_trip_km, axis=1)
    distances.rename("distance").to_csv('distances.csv')

data["distance"] = distances

## Check the data

In [None]:
# Print the column dtypes and non-null counts to sanity-check the loaded data.
data.info()

## Convert pickup_datetime and dropoff_datetime

In [None]:
# Parse both timestamp columns once, derive day-of-week / month / hour
# features from each, and replace the raw timestamp columns with them.
# NOTE(review): the dropoff_* features presumably encode the trip's end time,
# which may leak the target (trip_duration) into the model inputs -- confirm
# whether they should be used for training.
pickup_ts = pd.to_datetime(data["pickup_datetime"])
dropoff_ts = pd.to_datetime(data["dropoff_datetime"])

# Dict order fixes the order in which the new columns are appended.
calendar_features = {
    "pickup_day": pickup_ts.dt.dayofweek,
    "dropoff_day": dropoff_ts.dt.dayofweek,
    "pickup_month": pickup_ts.dt.month,
    "dropoff_month": dropoff_ts.dt.month,
    "pickup_hour": pickup_ts.dt.hour,
    "dropoff_hour": dropoff_ts.dt.hour,
}
data = data.drop(["pickup_datetime", "dropoff_datetime"], axis=1).assign(**calendar_features)

## Convert store_and_fwd_flag

In [None]:
# One-hot encode store_and_fwd_flag: one indicator column per distinct value
# is appended and the original column is removed.
encoded_store_and_fwd_flag = pd.get_dummies(data['store_and_fwd_flag'])
data = data.drop(columns=['store_and_fwd_flag']).join(encoded_store_and_fwd_flag)

In [None]:
# trip_duration divided by 60 -- presumably seconds converted to minutes
# (the unit of trip_duration is not shown here; confirm against the dataset).
data["trip_duration_min"] = data["trip_duration"] / 60

In [None]:
# Display the current state of the DataFrame.
data

## Prepare data

In [None]:
# Target vector: an independent copy of the trip_duration column.
trip_duration_labels = data["trip_duration"].copy()

# Columns removed from the feature matrix: the row id, the target itself and
# its rescaled copy, and the N / Y indicator columns.
# Column list kept from the original notebook (purpose unconfirmed):
# "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",  "id", "N", "Y", "vendor_id", "pickup_day", "dropoff_day", "pickup_month", "dropoff_month", "pickup_hour", "dropoff_hour"
excluded_columns = ["id", "trip_duration", "N", "Y", "trip_duration_min"]
prepared_data = data.drop(columns=excluded_columns)
prepared_data

In [None]:
# from scipy import stats
# z_scores = stats.zscore(data)
# z_scores.describe()

## 1. Separe el dataset en Train y Test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    prepared_data,
    trip_duration_labels,
    test_size=0.30,
    random_state=42,
)

## 2. Realice Análisis de los datos (EDA) con gráficos o tablas

In [None]:
# Two side-by-side scatter plots of trip duration (minutes) against distance.
# NOTE(review): both panels draw the same columns and differ only in colour --
# confirm whether the second panel was meant to show something else.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
for panel, colour in ((ax1, "blue"), (ax2, "red")):
    panel.scatter(
        data["distance"],
        data["trip_duration_min"],
        marker=".",
        s=60,
        c=colour,
    )

# Shared axis captions placed on the figure rather than on each subplot.
fig.text(0.5, 0.04, 'distance', ha='center')
fig.text(0.09, 0.5, 'duration', va='center', rotation='vertical')
plt.show()

## 3. ¿Cuál es la distribución de los datos?

In [None]:
# Draw a 20-bin histogram for each column of prepared_data to inspect the
# distribution of every feature.
prepared_data.hist(bins=20, figsize=(20,15))
plt.show()

## 4. Grafique la matriz de correlación

In [None]:
# Correlation matrix of every feature together with the target column.
corr = prepared_data.join(trip_duration_labels).corr()

# Render as a colour-graded table with two decimals.
# Styler.set_precision was deprecated in pandas 1.3 and removed in pandas 2.0;
# Styler.format(precision=...) is its documented replacement.
corr.style.background_gradient(cmap='plasma').format(precision=2)

## 5. Ajuste un modelo con scikit-learn para realizar la predicción

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split. fit() returns
# the estimator itself, so construction and fitting can be chained.
lrmodel = LinearRegression().fit(X_train, y_train)
# Keep the fitted estimator as the cell's displayed value.
lrmodel

In [None]:
# Spot-check the model: compare predictions with the real durations for the
# first five rows of the dataset.
first_five = slice(None, 5)
X = data.iloc[first_five]
y = trip_duration_labels.iloc[first_five]
y_pred = lrmodel.predict(prepared_data.iloc[first_five])

print("Predictions:\n", list(y_pred))
print("Real Data:\n", list(y))

## 6. Muestre sus resultados

In [None]:
from sklearn.metrics import mean_squared_error

# Error over the full dataset. This includes the rows the model was fitted on,
# so on its own it is an optimistic estimate of predictive quality.
trip_predictions = lrmodel.predict(prepared_data)
mse = mean_squared_error(trip_duration_labels, trip_predictions)
rmse = np.sqrt(mse)
print(f"mse: {round(mse, 2)}, rmse {round(rmse, 2)}")

# Error over the held-out test split only: rows the model never saw during
# fitting, which is what the train/test split was created for.
test_predictions = lrmodel.predict(X_test)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
print(f"test mse: {round(test_mse, 2)}, test rmse {round(test_rmse, 2)}")