In [1]:
import requests
import zipfile
import pandas as pd
import os
from io import BytesIO

def download_and_read_specific_csv(url, csv_filename, extract_to='extracted_data', encoding='ISO-8859-1', delimiter=';'):
    # Step 1: Download the zip file
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to download file from {url}")
    
    # Step 2: Extract the zip file
    with zipfile.ZipFile(BytesIO(response.content)) as thezip:
        thezip.extractall(extract_to)
    
    # Step 3: Read the specific CSV file
    csv_path = os.path.join(extract_to, csv_filename)
    if not os.path.exists(csv_path):
        raise Exception(f"{csv_filename} not found in the zip archive")
    
    try:
        df = pd.read_csv(csv_path, encoding=encoding, sep=";")
    except UnicodeDecodeError:
        print(f"Could not decode {csv_filename} with encoding {encoding}. Trying 'utf-8'.")
        df = pd.read_csv(csv_path, encoding='utf-8', sep=';', errors='ignore')

    return df

# Example usage
url = 'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/sbase/subte-viajes-molinetes/molinetes-2024.zip'
csv_filename = '202403_PAX15min-ABC.csv'
data = download_and_read_specific_csv(url, csv_filename)
data.head()


Unnamed: 0,FECHA;DESDE;HASTA;LINEA;MOLINETE;ESTACION;pax_pagos;pax_pases_pagos;pax_franq;pax_TOTAL
0,1/3/2024;05:15:00;05:30:00;LineaC;LineaC_Const...
1,1/3/2024;05:15:00;05:30:00;LineaB;LineaB_Alem_...
2,1/3/2024;05:15:00;05:30:00;LineaB;LineaB_Pelle...
3,1/3/2024;05:15:00;05:30:00;LineaB;LineaB_Alem_...
4,1/3/2024;05:15:00;05:30:00;LineaA;LineaA_SanPe...


In [2]:
cd extracted_data

/workspaces/Passenger-Flow-Prediction-CABA/extracted_data


In [3]:
import requests
import zipfile
import pandas as pd
import os
from io import BytesIO

def download_and_read_specific_csv(url, csv_filename, extract_to='extracted_data', encoding='ISO-8859-1', delimiter=';'):
    # Step 1: Download the zip file
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to download file from {url}")
    
    # Step 2: Extract the zip file
    with zipfile.ZipFile(BytesIO(response.content)) as thezip:
        thezip.extractall(extract_to)
    
    # Step 3: Read the specific CSV file
    csv_path = os.path.join(extract_to, csv_filename)
    if not os.path.exists(csv_path):
        raise Exception(f"{csv_filename} not found in the zip archive")
    
    try:
        df = pd.read_csv(csv_path, encoding=encoding, sep=";")
        # Split the single column into multiple columns
        df = df[df.columns[0]].str.split(';', expand=True)
        # Rename the columns based on the provided header
        df.columns = ['FECHA', 'DESDE', 'HASTA', 'LINEA', 'MOLINETE', 'ESTACION', 'pax_pagos', 'pax_pases_pagos', 'pax_franq', 'pax_TOTAL']
        # Rename the columns to lowercase
        df.columns = df.columns.str.strip().str.lower()
    except UnicodeDecodeError:
        print(f"Could not decode {csv_filename} with encoding {encoding}. Trying 'utf-8'.")
        df = pd.read_csv(csv_path, encoding='utf-8', sep=delimiter, errors='ignore')


    return df

# Example usage
url = 'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/sbase/subte-viajes-molinetes/molinetes-2024.zip'
csv_filename = '202403_PAX15min-ABC.csv'
df = download_and_read_specific_csv(url, csv_filename)
df.head(10)

Unnamed: 0,fecha,desde,hasta,linea,molinete,estacion,pax_pagos,pax_pases_pagos,pax_franq,pax_total
0,1/3/2024,05:15:00,05:30:00,LineaC,LineaC_Constitucion_Turn16,Constitucion,9,0,3,12
1,1/3/2024,05:15:00,05:30:00,LineaB,LineaB_Alem_S_Turn01,Leandro N. Alem,13,0,0,13
2,1/3/2024,05:15:00,05:30:00,LineaB,LineaB_Pellegrini_E_Turn03,Carlos Pellegrini,0,0,1,1
3,1/3/2024,05:15:00,05:30:00,LineaB,LineaB_Alem_S_Turn04,Leandro N. Alem,7,0,2,9
4,1/3/2024,05:15:00,05:30:00,LineaA,LineaA_SanPedrito_Este_Turn05,San Pedrito,4,0,0,4
5,1/3/2024,05:15:00,05:30:00,LineaA,LineaA_PJunta_N_Turn03,Primera Junta,2,0,0,2
6,1/3/2024,05:15:00,05:30:00,LineaC,LineaC_Lavalle_S_Turn03,Lavalle,0,0,1,1
7,1/3/2024,05:15:00,05:30:00,LineaA,LineaA_CBarros_N_Turn03,Castro Barros,1,0,0,1
8,1/3/2024,05:15:00,05:30:00,LineaC,LineaC_Indepen_Turn02,Independencia,1,0,0,1
9,1/3/2024,05:15:00,05:30:00,LineaB,LineaB_Medrano_N_Turn03,Medrano,4,0,1,5


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# Convert date and time columns to datetime
df['fecha'] = pd.to_datetime(df['fecha'])
df['start_time'] = pd.to_datetime(df['fecha'].astype(str) + ' ' + df['desde'])
df['end_time'] = pd.to_datetime(df['fecha'].astype(str) + ' ' + df['hasta'])
df.head(5)
df.isnull().sum()
# Fill missing values with the median
df.fillna(df.median(), inplace=True)

  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)


In [5]:
# Cap outliers at the 95th percentile
#for col in ['pax_pago', 'pax_pases_pagos', 'pax_franq', 'pax_total']:
#    df[col] = df[col].clip(upper=df[col].quantile(0.95))

In [6]:
# Extract time-related features
df['hour'] = df['start_time'].dt.hour
df['day_of_week'] = df['start_time'].dt.dayofweek
df['month'] = df['start_time'].dt.month
df

Unnamed: 0,fecha,desde,hasta,linea,molinete,estacion,pax_pagos,pax_pases_pagos,pax_franq,pax_total,start_time,end_time,hour,day_of_week,month
0,2024-01-03,05:15:00,05:30:00,LineaC,LineaC_Constitucion_Turn16,Constitucion,9,0,3,12,2024-01-03 05:15:00,2024-01-03 05:30:00,5,2,1
1,2024-01-03,05:15:00,05:30:00,LineaB,LineaB_Alem_S_Turn01,Leandro N. Alem,13,0,0,13,2024-01-03 05:15:00,2024-01-03 05:30:00,5,2,1
2,2024-01-03,05:15:00,05:30:00,LineaB,LineaB_Pellegrini_E_Turn03,Carlos Pellegrini,0,0,1,1,2024-01-03 05:15:00,2024-01-03 05:30:00,5,2,1
3,2024-01-03,05:15:00,05:30:00,LineaB,LineaB_Alem_S_Turn04,Leandro N. Alem,7,0,2,9,2024-01-03 05:15:00,2024-01-03 05:30:00,5,2,1
4,2024-01-03,05:15:00,05:30:00,LineaA,LineaA_SanPedrito_Este_Turn05,San Pedrito,4,0,0,4,2024-01-03 05:15:00,2024-01-03 05:30:00,5,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599339,2024-03-31,22:30:00,22:45:00,LineaA,LineaA_Congreso_N_Turn01,Congreso,0,0,3,3,2024-03-31 22:30:00,2024-03-31 22:45:00,22,6,3
599340,2024-03-31,22:30:00,22:45:00,LineaC,LineaC_Retiro_Turn11,Retiro,1,0,0,1,2024-03-31 22:30:00,2024-03-31 22:45:00,22,6,3
599341,2024-03-31,22:30:00,22:45:00,LineaB,LineaB_CallaoB_S_Turn01,Callao.B,3,0,0,3,2024-03-31 22:30:00,2024-03-31 22:45:00,22,6,3
599342,2024-03-31,22:30:00,22:45:00,LineaC,LineaC_Lavalle_S_Turn03,Lavalle,1,0,0,1,2024-03-31 22:30:00,2024-03-31 22:45:00,22,6,3


In [7]:
# Split data into features and target variable
df_c = df[df['linea']=='LineaC']
from sklearn.preprocessing import LabelEncoder
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder and transform the 'estacion' column
df_c['estacion_le'] = label_encoder.fit_transform(df_c['estacion'])
X = df_c[['estacion_le', 'hour', 'day_of_week', 'month']]
y = df_c['pax_total']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_c['estacion_le'] = label_encoder.fit_transform(df_c['estacion'])


Unnamed: 0,estacion_le,hour,day_of_week,month
0,1,5,2,1
6,5,5,2,1
8,4,5,2,1
10,0,5,2,1
17,1,5,2,1
24,7,5,2,1
26,1,5,2,1
28,8,5,2,1
29,3,5,2,1
35,1,5,2,1


In [8]:
df_c.month.max()

12

In [9]:
# Retrieve the encoding dictionary
encoding_dict = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
encoding_dict

{'Avenida de Mayo': 0,
 'Constitucion': 1,
 'Diagonal Norte': 2,
 'General San Martin': 3,
 'Independencia': 4,
 'Lavalle': 5,
 'Mariano Moreno': 6,
 'Retiro': 7,
 'San Juan': 8}

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 21.298949984424972


In [11]:
from sklearn.preprocessing import LabelEncoder
# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 15.085640664977143


In [12]:
from xgboost import XGBRegressor

# Initialize XGBoost model
model = XGBRegressor(max_depth=50)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 15.083787526859538
