In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from zipfile import ZipFile

# Unpack each competition archive, in order, into the current working directory.
for archive_path in (
    '/kaggle/input/nyc-taxi-trip-duration/train.zip',
    '/kaggle/input/nyc-taxi-trip-duration/test.zip',
    '/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip',
):
    with ZipFile(archive_path) as file:
        file.extractall()

# Load data

In [None]:
# Read train.csv from the working directory into a DataFrame.
train = pd.read_csv('./train.csv')
# Print the column / dtype / non-null summary, then show the first rows as the cell output.
train.info()
train.head()

In [None]:
# Read test.csv from the working directory into a DataFrame.
test = pd.read_csv('./test.csv')
# Print the column / dtype / non-null summary, then show the first rows as the cell output.
test.info()
test.head()

In [None]:
# Read sample_submission.csv from the working directory into a DataFrame.
sample_submission = pd.read_csv('./sample_submission.csv')
# Print the column / dtype / non-null summary, then show the first rows as the cell output.
sample_submission.info()
sample_submission.head()

# Clean data

In [None]:
# Count of missing values in each column of the training frame.
print(train.isna().sum())

In [None]:
# Count of missing values in each column of the test frame.
print(test.isna().sum())

In [None]:
# Display the dtype of every column in the training frame.
train.dtypes

In [None]:
# Display the dtype of every column in the test frame.
test.dtypes

In [None]:
# Display summary statistics for the training frame.
train.describe()

In [None]:
# Outlier visualization: box plot of the training frame on a wide figure.
plt.figure(figsize=(18, 6))
plt.title("Visualizacion de valores atipicos")
train.boxplot();

In [None]:
# Keep only the trips that carried at least one passenger.
train = train.loc[train['passenger_count'] > 0]
train['passenger_count'].describe()

In [None]:
# Drop trips with a very long duration (keep trip_duration below 6000).
train = train.loc[train['trip_duration'] < 6000]
train['trip_duration'].describe()

# Categorizacion

In [None]:
#utilizacion de encoder para asignar un valor numerico a la columna store_and_fwd_flag
from sklearn.preprocessing import OrdinalEncoder

# Encode store_and_fwd_flag numerically with ONE encoder fitted on the training
# data, so the same category -> number mapping is applied to train and test.
# Fitting a second encoder on the test set could silently assign different codes
# to the same flag value whenever the two sets do not contain the same categories.
encoder = OrdinalEncoder()
encoder.fit(train[["store_and_fwd_flag"]])

# Kept as an alias so any later code that refers to `encoderTest` still works.
encoderTest = encoder

train[["store_and_fwd_flag"]] = encoder.transform(train[["store_and_fwd_flag"]])
# With the encoder's default settings, a flag value never seen in training raises
# here instead of being mapped to an arbitrary code.
test[["store_and_fwd_flag"]] = encoder.transform(test[["store_and_fwd_flag"]])

In [None]:
def add_pickup_time_features(df):
    """Return a copy of ``df`` with time features derived from ``pickup_datetime``.

    Args:
        df: DataFrame with a ``pickup_datetime`` column that ``pd.to_datetime``
            can parse.

    Returns:
        A new DataFrame in which ``pickup_datetime`` is converted to datetime and
        the columns ``month``, ``week`` (ISO week number), ``weekday``, ``hour``,
        ``minute`` and ``minute_oftheday`` (``hour * 60 + minute``) are appended,
        in that order. ``df`` itself is not modified.
    """
    pickup = pd.to_datetime(df['pickup_datetime'])
    return df.assign(
        pickup_datetime=pickup,
        month=pickup.dt.month,
        week=pickup.dt.isocalendar().week,
        weekday=pickup.dt.weekday,
        hour=pickup.dt.hour,
        minute=pickup.dt.minute,
        minute_oftheday=pickup.dt.hour * 60 + pickup.dt.minute,
    )


# Convert the date columns and derive the new features. One helper is applied to
# both frames so train and test cannot drift apart.
train = add_pickup_time_features(train)
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

test = add_pickup_time_features(test)

# Preview a few rows rather than rendering the whole frame.
train.head()


# Distribucion de los datos

In [None]:
# Histogram of trip_duration over 100 bins on a wide figure.
plt.figure(figsize=(18, 6))
plt.hist(train['trip_duration'].to_numpy(), bins=100)
plt.xlabel('Trip duration')
plt.ylabel('Number of records')
plt.show()

# Matriz de correlacion

In [None]:
# Pairwise correlation of the numeric columns only. `train` also holds
# non-numeric columns (e.g. the datetime columns); recent pandas versions no
# longer drop those silently in DataFrame.corr(), so select numeric columns
# explicitly to keep the cell working across pandas versions.
corr = train.select_dtypes(include="number").corr()
corr

In [None]:
# Colour-coded correlation matrix shown with two decimals. Styler.set_precision
# was removed in pandas 2.0; Styler.format with a format string gives the same
# two-decimal display and also works on older versions.
corr.style.background_gradient(cmap='plasma').format("{:.2f}")

In [None]:
# Correlation of every column with trip_duration, largest value first.
corr["trip_duration"].sort_values(ascending=False)

### **Según la matriz de correlación se puede observar una correlación no lineal**

In [None]:
# Work with the longitude/latitude data to check whether distance correlates with duration.
# Distance computed from coordinates.
def ft_haversine_distance(lat1, lng1, lat2, lng2, radius_km=6371):
    """Great-circle (haversine) distance between two points or arrays of points.

    Args:
        lat1, lng1: latitude and longitude of the first point(s), in degrees.
        lat2, lng2: latitude and longitude of the second point(s), in degrees.
        radius_km: radius of the sphere; defaults to 6371, the average Earth
            radius in kilometres used by the original implementation.

    Returns:
        The distance in the unit of ``radius_km``. Scalars give a scalar;
        array inputs give an array following NumPy broadcasting.
    """
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    dlat = lat2 - lat1
    dlng = lng2 - lng1
    d = np.sin(dlat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng * 0.5) ** 2
    # Floating-point rounding can push d marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make arcsin return NaN; clamp it.
    d = np.clip(d, 0.0, 1.0)
    return 2 * radius_km * np.arcsin(np.sqrt(d))

# Add the haversine pickup-to-dropoff distance as a new column on both frames.
train['distance'] = ft_haversine_distance(
    train['pickup_latitude'].to_numpy(),
    train['pickup_longitude'].to_numpy(),
    train['dropoff_latitude'].to_numpy(),
    train['dropoff_longitude'].to_numpy(),
)
test['distance'] = ft_haversine_distance(
    test['pickup_latitude'].to_numpy(),
    test['pickup_longitude'].to_numpy(),
    test['dropoff_latitude'].to_numpy(),
    test['dropoff_longitude'].to_numpy(),
)

# Clean the distance: keep only training trips whose distance is below 80.
train = train.loc[train['distance'] < 80]

## Verificar correlacion nuevamente

In [None]:
# Recompute the correlation matrix now that `distance` exists, using the
# numeric columns only. `train` also holds non-numeric columns (e.g. the
# datetime columns); recent pandas versions no longer drop those silently in
# DataFrame.corr(), so select numeric columns explicitly.
corr = train.select_dtypes(include="number").corr()
corr

In [None]:
# Colour-coded correlation matrix shown with two decimals. Styler.set_precision
# was removed in pandas 2.0; Styler.format with a format string gives the same
# two-decimal display and also works on older versions.
corr.style.background_gradient(cmap='plasma').format("{:.2f}")

# Modelo scikit-learn para la predicción

## Train/Test Split

In [None]:
# Single-column feature (distance) and target (trip_duration), each as an (n, 1) array.
X = np.array(train[["distance"]])
Y = np.array(train[["trip_duration"]])

# Use sklearn's train_test_split to select the sets:
# training (70%) and test (30%), with a fixed random_state.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42
)

In [None]:
# Side-by-side scatter plots of the two splits: distance (x) against trip duration (y).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# ax1: the training split (blue).
ax1.scatter(x_train,y_train, marker = ".", s = 60, c = "blue")

# ax2: the test split (red).
ax2.scatter(x_test,y_test, marker = ".", s = 60, c = "red")

# Shared axis captions. The x values are the haversine distance (computed with a
# radius in km), so the x caption names the distance; it previously read
# 'Duracion', which describes the y axis instead.
fig.text(0.5, 0.04, 'Distancia (km)', ha='center')
fig.text(0.09, 0.5, 'Tiempo del viaje', va='center', rotation='vertical')

plt.show()

# Estimacion de coeficientes

In [None]:
# Coefficient estimation on the training split.
print("Train")
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train, y_train)
print("score:", model.score(x_train, y_train))

# Intercept (b_0), slope (b_1) and the residual sum of squares of the fitted line.
b_0, b_1 = model.intercept_, model.coef_[0]
rss = np.square(y_train - b_0 - b_1 * x_train).sum()

print("b0:",b_0, "b1:", b_1, "rss:", rss)

In [None]:
# Evaluation on the test split.
print("Test")
from sklearn.linear_model import LinearRegression

# Evaluate the model that was fitted on (x_train, y_train) against the held-out
# split. Fitting a fresh model on (x_test, y_test) and scoring it on the same
# data, as was done before, measures fit quality on data the model has already
# seen and says nothing about generalisation. `model_test` is kept as a name so
# later cells that reference it continue to work.
model_test = model
print("score:", model_test.score(x_test, y_test))

# Coefficients of the evaluated model and its residual sum of squares on the test split.
b_1_test = model_test.coef_[0]
b_0_test = model_test.intercept_
rss_test = np.sum((y_test - b_0_test - b_1_test * x_test)**2)

print("b0:",b_0_test, "b1:", b_1_test, "rss:", rss_test)

# MSE

In [None]:
# MSE estimate for the training split: the residual sum of squares averaged over the samples.

N = x_train.shape[0]
mse = rss * (1/N)

print("MSE train:", mse)

# Predicciones

In [None]:
# Predictions for both splits from the single model fitted on the training
# split. Test predictions must not come from a model fitted on the test split
# itself, otherwise they are in-sample fits rather than held-out predictions.
y_prima_train = model.predict(x_train)
y_prima_test = model.predict(x_test)


#for x, pred in zip(x_test, y_prima_test):
#    print(f"f({x}) = {pred}")