In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
january_data = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
february_data = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [3]:
def model(data): 
    
    num_columns = len(data.columns)
    print(f'The number of the column : {num_columns}')
    
    data['duration'] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
    data['duration'] = data['duration'].apply(lambda td: td.total_seconds()/60)
    
    duration_list = january_data['duration'].to_list()
    print(f'The standard deviation of the trips duration : {np.std(duration_list)}')

    filtered_data = data[(january_data['duration']>=1) & (data['duration']<=60)]
    initial_len = len(data)
    secondary_length = len(filtered_data)
    
    fraction_retained = (secondary_length / initial_len) * 100
    print(f'The fraction of the records left : {fraction_retained}')

    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']
    filtered_data[categorical] = filtered_data[categorical].astype(str)

    train_dicts = filtered_data[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dicts)

    target = 'duration'
    y_train = filtered_data[target].values

    num_features = X_train.shape[1]

    print("Feature Matrix Shape:", X_train.shape)
    print("Number of Features (Columns):", num_features)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_train)

    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    print("RMSE:", rmse)
 


# January Results : 

In [4]:
model(january_data)   

The number of the column : 19
The standard deviation of the trips duration : 42.59434429744777
The fraction of the records left : 98.1220282212598


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data[categorical] = filtered_data[categorical].astype(str)


Feature Matrix Shape: (3009173, 516)
Number of Features (Columns): 516
RMSE: 7.649140464167203


# February results : 

In [5]:
model(february_data)

The number of the column : 19
The standard deviation of the trips duration : 42.59434429744777


  filtered_data = data[(january_data['duration']>=1) & (data['duration']<=60)]


The fraction of the records left : 97.99966025556331


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data[categorical] = filtered_data[categorical].astype(str)


Feature Matrix Shape: (2855666, 518)
Number of Features (Columns): 518
RMSE: 8.063359809055394
