In [2]:
# Importación de librerías necesarias
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Configuración de estilo para gráficos
plt.style.use('ggplot')


In [3]:
# Cargar los datos desde los archivos CSV
file_2015 = "/mnt/data/yellow_tripdata_201501_reducido.csv"
file_2016 = "/mnt/data/yellow_tripdata_201601_reducido.csv"

# Cargar los datasets
df_2015 = pd.read_csv(file_2015)
df_2016 = pd.read_csv(file_2016)

# Mostrar las primeras filas de cada dataset
df_2015.head(), df_2016.head()


In [4]:
df_2015.rename(columns={'RateCodeID': 'RatecodeID'}, inplace=True)

In [5]:
df_2015['tpep_pickup_datetime'] = pd.to_datetime(df_2015['tpep_pickup_datetime'])
df_2015['tpep_dropoff_datetime'] = pd.to_datetime(df_2015['tpep_dropoff_datetime'])
df_2016['tpep_pickup_datetime'] = pd.to_datetime(df_2016['tpep_pickup_datetime'])
df_2016['tpep_dropoff_datetime'] = pd.to_datetime(df_2016['tpep_dropoff_datetime'])

In [6]:
df_final = pd.concat([df_2015, df_2016], ignore_index=True)

In [7]:
df_final['trip_duration'] = (df_final['tpep_dropoff_datetime'] - df_final['tpep_pickup_datetime']).dt.total_seconds() / 60
df_final['speed_mph'] = (df_final['trip_distance'] / df_final['trip_duration']) * 60
df_final['is_weekend'] = df_final['tpep_pickup_datetime'].dt.weekday.isin([5, 6]).astype(int)
df_final['rush_hour'] = df_final['tpep_pickup_datetime'].dt.hour.isin([7, 8, 9, 16, 17, 18]).astype(int)
df_final['fare_per_mile'] = df_final['fare_amount'] / df_final['trip_distance']

In [13]:
df_final["speed_mph"] = df_final["speed_mph"].fillna(df_final["speed_mph"].median())
df_final["fare_per_mile"] = df_final["fare_per_mile"].fillna(df_final["fare_per_mile"].median())

In [9]:
numeric_cols = df_final.select_dtypes(include=['float', 'int']).columns
df_final[numeric_cols] = df_final[numeric_cols].round(2)

In [10]:
df_final["pickup_date"] = df_final["tpep_pickup_datetime"].dt.date
df_final.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_duration,speed_mph,is_weekend,rush_hour,fare_per_mile,pickup_date
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.99,40.75,1,N,-73.97,...,3.25,0.0,0.3,17.05,18.05,5.29,0,0,7.55,2015-01-15
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.0,40.72,1,N,-73.99,...,2.0,0.0,0.3,17.8,19.83,9.98,1,0,4.39,2015-01-10
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.96,40.8,1,N,-73.95,...,0.0,0.0,0.3,10.8,10.05,10.75,1,0,5.28,2015-01-10
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.01,40.71,1,N,-74.0,...,0.0,0.0,0.3,4.8,1.87,16.07,1,0,7.0,2015-01-10
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.97,40.76,1,N,-74.0,...,0.0,0.0,0.3,16.3,19.32,9.32,1,0,5.0,2015-01-10


In [14]:
df_final.to_csv('final_taxi_data.csv', index=False)