## HOMEWORK-1

### IMPORTS

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
def read_dataframe(filename):
    """Load a trip file, add a ``duration`` column and drop outlier trips.

    Parameters
    ----------
    filename : str or path-like
        Path to a ``.csv`` or ``.parquet`` file containing the columns
        ``pickup_datetime`` and ``dropOff_datetime``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose ``duration`` (in minutes) lies
        in the closed interval [1, 60].

    Raises
    ------
    ValueError
        If the file extension is neither ``.csv`` nor ``.parquet``.

    Notes
    -----
    Prints the duration distribution and the frame shape before and after
    filtering as a side effect.
    """
    path = str(filename)
    if path.endswith('.csv'):
        df = pd.read_csv(filename)
        # CSV carries timestamps as text, so they must be parsed explicitly.
        df['dropOff_datetime'] = pd.to_datetime(df['dropOff_datetime'])
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    elif path.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail early with a clear message instead of hitting an unbound `df` below.
        raise ValueError(
            f"Unsupported file type for {path!r}: expected '.csv' or '.parquet'"
        )

    # Trip length in minutes, computed with the vectorised timedelta accessor.
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60
    print('Distribution of Duration:\n')
    print(df['duration'].describe(percentiles=[0.90, 0.95, 0.99]))
    # Filter the dataset: besides trips longer than 60 minutes, this also
    # drops trips shorter than 1 minute.
    print(f'Size of dataframe before removing trips greater than 60 mins: {df.shape}')
    # .copy() detaches the result from the unfiltered frame so that callers can
    # assign columns on it without triggering SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    print(f'Size of dataframe after removing trips greater than 60 mins: {df.shape}')
    return df

def fill_missing_values(df, col_list, fill):
    """Return a copy of ``df`` with missing values in ``col_list`` replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_list : list
        Names of the columns whose missing values should be replaced.
    fill : scalar
        Replacement value passed to ``Series.fillna``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.
    """
    # Work on a copy: the caller's frame may be a filtered slice of another
    # frame, and assigning columns on it in place would mutate shared state
    # (and emit SettingWithCopyWarning).
    filled = df.copy()
    for col in col_list:
        filled[col] = filled[col].fillna(fill)
    return filled


def convert_to_dict(df, col_list):
    """Build one ``{column: str(value)}`` record per row for ``col_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_list : list
        Names of the columns to include in each record.

    Returns
    -------
    list of dict
        One dictionary per row, with every value cast to ``str``.
    """
    # Selecting with a list yields a new frame, so the string cast stays local
    # and the caller's columns keep their original dtypes.
    return df[col_list].astype(str).to_dict(orient='records')

## MAIN

### TRAIN

In [4]:
# Categorical feature columns shared by the training and validation steps.
categorical = ['PUlocationID', 'DOlocationID']
# Training set: load the file and keep the trips retained by read_dataframe.
df_train = read_dataframe('data/fhv_tripdata_2021-01.parquet')

Distribution of Duration:

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
50%      1.340000e+01
90%      3.563333e+01
95%      4.725000e+01
99%      9.030000e+01
max      4.233710e+05
Name: duration, dtype: float64
Size of dataframe before removing trips greater than 60 mins: (1154112, 8)
Size of dataframe after removing trips greater than 60 mins: (1109826, 8)


In [5]:
# Share of the (already filtered) training rows with no pickup location ID.
missing_pickups = int(df_train['PUlocationID'].isnull().sum())
total_trips = df_train.shape[0]
print(missing_pickups / total_trips)

0.8352732770722617


In [6]:
# Replace missing location IDs with the -1 sentinel before building features.
df_train = fill_missing_values(df_train, categorical, -1)
y_train = df_train['duration'].values

# Fit the vectorizer on the training records; the same fitted `dv` is reused
# for the validation set.
train_dicts = convert_to_dict(df_train, categorical)
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# dimension of the matrix
X_train.shape #(1109826, 525)

(1109826, 525)

In [7]:
# Train and Predict on training data
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
# RMSE on the training set. The square root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; this form works on every version.
mean_squared_error(y_train, y_pred) ** 0.5

10.528519107206316

### VALIDATION

In [8]:
# Validation set: load the file, then replace missing location IDs with the
# same -1 sentinel used for the training data.
df_val = fill_missing_values(
    read_dataframe('data/fhv_tripdata_2021-02.parquet'),
    categorical,
    -1,
)

Distribution of Duration:

count    1.037692e+06
mean     2.070699e+01
std      1.611084e+02
min      1.666667e-02
50%      1.410000e+01
90%      3.870000e+01
95%      5.225000e+01
99%      1.050000e+02
max      1.109190e+05
Name: duration, dtype: float64
Size of dataframe before removing trips greater than 60 mins: (1037692, 8)
Size of dataframe after removing trips greater than 60 mins: (990113, 8)


In [9]:
# Target and features for validation; `dv` is only applied here (not refit),
# so the columns line up with the training matrix.
y_val = df_val['duration'].values
val_dicts = convert_to_dict(df_val, categorical)
X_val = dv.transform(val_dicts)
# dimension of the matrix
X_val.shape #(990113, 525)

(990113, 525)

In [10]:
# Score the model fitted on the training data against the validation set.
y_pred = lr.predict(X_val)
# RMSE on the validation set. The square root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; this form works on every version.
mean_squared_error(y_val, y_pred) ** 0.5

11.014283149347039