In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the January and February 2023 green-taxi trip files and stack them into one frame.
df_1 = pd.read_parquet('green_tripdata_2023-01.parquet')
df_2 = pd.read_parquet('green_tripdata_2023-02.parquet')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# monthly frame keeps its own index labels, so the same label can appear twice.
df = pd.concat([df_1, df_2], ignore_index=True)

In [None]:
df.head(5)

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.shape[0]

In [None]:
df.shape

## Target preparation

> Our task is to predict the duration of the cab ride. We derive it from two columns: duration = lpep_dropoff_datetime - lpep_pickup_datetime

In [10]:
# Trip length in seconds: drop-off timestamp minus pick-up timestamp.
df['duration'] = df['lpep_dropoff_datetime'].sub(df['lpep_pickup_datetime']).dt.total_seconds()

In [None]:
# Express the trip duration in minutes. It is recomputed from the two timestamps
# (instead of dividing the existing column by 60) so that running this cell
# more than once cannot divide the values again.
df['duration'] = (
    df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
).dt.total_seconds() / 60

df.head(5)


In [None]:
# Keep only trips whose duration lies between 1 and 60 (both ends inclusive).
# .copy() makes the result an independent frame, so columns added to it later
# are not assignments onto a slice of the unfiltered data (SettingWithCopyWarning).
df = df[df['duration'].between(1, 60)].copy()

df.head()

In [None]:
df.shape

In [None]:
df.isna().sum()

## Data Preparation

In [15]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer


def iterative_imputer(df:pd.DataFrame, subset_col:str, estimator = None, max_iter: int = 10, tol:float = 1e-3) -> pd.DataFrame:

    """
        Imputes missing values in a specified column of a DataFrame using IterativeImputer.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the column to impute. It is modified in place.
        subset_col (str): The name of the column in df to impute.
        estimator (optional): Forwarded to IterativeImputer as ``estimator``. Defaults to None.
        max_iter (int, optional): Forwarded to IterativeImputer as ``max_iter``. Defaults to 10.
        tol (float, optional): Forwarded to IterativeImputer as ``tol``. Defaults to 1e-3.

        Returns:
        pd.DataFrame: The same DataFrame object, with imputed values in the subset_col.

        Raises:
        ValueError: If subset_col contains no observed (non-missing) value.

        Note:
        This function assumes that subset_col is numerical. If subset_col is categorical, it should be encoded as numerical values before using this function.
        Only subset_col itself is handed to the imputer; no other column of df is used as a predictor.
        IterativeImputer can be computationally expensive for large datasets. If runtime is a concern, consider using other imputation methods or using a subset of your data.
    """

    # With nothing observed there is no value to impute from; fail with a clear
    # message instead of a shape mismatch when the result is written back.
    if df[subset_col].isna().all():
        raise ValueError(f"Column '{subset_col}' has no observed values to impute from.")

    imputer = IterativeImputer(estimator=estimator, max_iter=max_iter, tol=tol)
    imputed_values = imputer.fit_transform(df[[subset_col]])

    # fit_transform returns a 2-D array with a single column; write that column
    # back positionally so the frame's index plays no part in the assignment.
    df[subset_col] = imputed_values[:, 0]

    return df

def KNN_imputer(df:pd.DataFrame, subset_col:str, n:int = 5) -> pd.DataFrame:

    """
        Imputes missing values in a specified column of a DataFrame using K-Nearest Neighbors.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the column to impute. It is modified in place.
        subset_col (str): The name of the column in df to impute.
        n (int, optional): The number of neighbors to use for KNN imputation. Defaults to 5.

        Returns:
        pd.DataFrame: The same DataFrame object, with imputed values in the subset_col.

        Raises:
        ValueError: If subset_col contains no observed (non-missing) value.

        Note:
        This function assumes that subset_col is numerical. If subset_col is categorical, it should be encoded as numerical values before using this function.
        Only subset_col itself is handed to the imputer; no other column of df is used to find neighbors.
        KNN imputation can be computationally expensive for large datasets. If runtime is a concern, consider using other imputation methods or using a subset of your data.

    """

    # With nothing observed there is no value to impute from; fail with a clear
    # message instead of a shape mismatch when the result is written back.
    if df[subset_col].isna().all():
        raise ValueError(f"Column '{subset_col}' has no observed values to impute from.")

    imputer = KNNImputer(n_neighbors = n)
    imputed_values = imputer.fit_transform(df[[subset_col]])

    # fit_transform returns a 2-D array with a single column; write that column
    # back positionally. (Previously the result was wrapped in a DataFrame with
    # `columns=subset_col`, a bare string, which the constructor rejects.)
    df[subset_col] = imputed_values[:, 0]

    return df



>Several columns in our dataset contain missing values. To understand their impact on the 'duration' column, which is also our target column, we can visualize this relationship using various graphs. This will potentially reveal valuable insights, such as the importance of certain variables. It's worth noting that imputation methods, used to handle missing data, can be computationally expensive. Therefore, we need to approach this process with caution, as it will affect both the cost and time efficiency of our data analysis

In [1]:
# df = df.drop_duplicates()

In [17]:
# df = df.reset_index(drop=True)

In [None]:
df.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of trip duration for each store_and_fwd_flag value.
fig, ax = plt.subplots(figsize=(8, 8))

sns.boxplot(data=df, x='store_and_fwd_flag', y='duration', ax=ax)
ax.set_title('store_and_fwd_flag vs Duration')
ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()



### Inferences
    1. The distributions of duration for the Y and N categories are nearly identical, with a median around 10 minutes and almost identical interquartile ranges
    2. Outliers - similar outliers extending up to 60 minutes

### Conclusion 
    It does not give any meaningful information about the data. The trips seem to have very similar duration patterns

In [None]:
# Pairwise correlations between the target and the candidate numeric columns,
# drawn as an annotated heatmap.
numerical_column = ['duration','passenger_count', 'RatecodeID', 'congestion_surcharge', 'Airport_fee']
corr_matrix = df.loc[:, numerical_column].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
plt.show()

### Inferences

    1. Airport_fee is moderately correlated with duration -- 0.47 -- Need to keep this column
    2. congestion_surcharge is negatively correlated with duration -- -0.18 -- Can keep, but won't see any major impact
    3. RatecodeID has a correlation of 0.17 with duration. Better to remove this
    4. passenger_count has no relation to duration, so we can conveniently ignore this column
    5. congestion_surcharge to RatecodeID is -0.26, which tells about the relationship between pricing schemes and traffic conditions
    6. congestion_surcharge has a negative relation with Airport_fee



### Conclusion
    Columns to keep - Airport_fee and congestion_surcharge
    Columns to drop - passenger_count and RatecodeID


In [None]:
df.head()

In [23]:
# Remove the three columns listed below from the working frame.
df = df.drop(columns=['passenger_count', 'RatecodeID', 'store_and_fwd_flag'])


In [None]:
df.head(60000)

In [None]:
df.isna().sum()

In [26]:
def classify_time_of_day(time):
    """Label a timestamp with the part of day its hour falls in.

    ``time`` only needs an ``hour`` attribute. Hours 4-9 map to "Morning",
    10-15 to "Afternoon" and 16-21 to "Evening"; anything else is "Night".
    """
    hour = time.hour

    # (first hour, hour after the last one, label) - upper bounds are exclusive.
    bands = (
        (4, 10, "Morning"),
        (10, 16, "Afternoon"),
        (16, 22, "Evening"),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label

    return "Night"
    

In [27]:
df['pickuptime'] = df['tpep_pickup_datetime'].apply(classify_time_of_day)
df['droptime'] = df['tpep_dropoff_datetime'].apply(classify_time_of_day)

In [None]:
df.head()

In [None]:
df = iterative_imputer(df,subset_col='Airport_fee')
print("OneColumn Done")
df = iterative_imputer(df,subset_col='congestion_surcharge')

In [None]:
df.isna().sum()

In [31]:
df = df.drop([ 'tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis =1)

In [None]:
df.head()

In [None]:
df.info()

In [34]:
def convert_int32(df):
    """Widen every int32 column of ``df`` to int64.

    The columns are replaced on ``df`` itself, and that same object is returned.
    """
    narrow = df.select_dtypes(include=['int32'])

    for name in narrow.columns:
        df[name] = narrow[name].astype('int64')

    return df



df = convert_int32(df)

In [None]:
df.info()

## Exploratory Data Analysis


In [36]:
class EDA:
    """Small plotting helper around a single DataFrame.

    Attributes:
        df: The frame passed to the constructor (stored by reference, not copied).
        numeric_df: The numeric columns of ``df``; both plots are driven by it.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        # 'number' selects every numeric dtype (int32, float32, ...) rather than
        # only int64/float64, so narrower columns are not silently left out.
        self.numeric_df = df.select_dtypes(include='number')

    def create_heatmap(self, figsize: tuple = (10, 10)) -> pd.DataFrame:
        """Plot the correlation matrix of the numeric columns and return it.

        Parameters:
        figsize (tuple, optional): Figure size passed to matplotlib. Defaults to (10, 10).

        Returns:
        pd.DataFrame: The correlation matrix that was plotted.
        """
        corr_matrix = self.numeric_df.corr()
        plt.figure(figsize=figsize)
        # vmin/vmax pin the colour scale to the full [-1, 1] correlation range.
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1, linewidths=0.5)
        plt.title("Correlation Matrix")
        plt.show()
        return corr_matrix

    def create_histplot(self, figsize: tuple = (2, 2)) -> None:
        """Draw one histogram (with a KDE curve) per numeric column.

        Parameters:
        figsize (tuple, optional): Size of each figure. Defaults to (2, 2).
        """
        for col in self.numeric_df.columns:
            plt.figure(figsize=figsize)
            sns.histplot(data=self.df, x=col, kde=True)
            plt.title(f'Distribution of {col}')
            plt.xlabel(col)
            plt.ylabel('frequency')
            plt.show()


In [37]:
eda = EDA(df)


In [None]:
corr_matrix= eda.create_heatmap()


In [None]:
corr_matrix

### Inference from Heatmap

    1. A few features have a strong relation with our target column
        a. fare_amount to duration ~ 0.8
        b. tip_amount to duration ~ 0.52
        c. total_amount to duration ~ 0.78
    
    2. A few features have a good relation with our target column
        a. tolls_amount to duration ~ 0.45
        b. Airport_fee to duration ~ 0.46
    
    3. Other features have relation with Target but not so high
        a. extra  has positive relation with target column ~ [0.1,0.2]
        b. congestion_surcharge, PULocationID, DOLocationID has negative relation with target column ~ [-0.2,-0.1]
    
    4. All other have very less significant relation with target
        a. trip_distance, improvement_surcharge  < 0.1
        b. VendorID, payment_type, mta_tax <-0.1
    
     
    5. From the above heatmap we can identify a few highly correlated features
         a. fare_amount and tip_amount ~ 0.61
        *b. fare_amount and tolls_amount ~ 0.63
         c. fare_amount and total_amount ~ 0.98
        *d. fare_amount and airport_fee ~ 0.63
         e. tip_amount and tolls_amount ~ 0.49
         f. tip_amount and total_amount ~ 0.73
        *g. tip_amount and airport_fee ~ 0.43
        *h. tolls_amount and airport_fee ~ 0.48
        *i. tolls_amount and total_amount ~ 0.71
        *j. total_amount and airport_fee ~ 0.65


>Note : "*" marks the pairs where either or both of the features have a correlation with the target below 0.5, the threshold used here

#### 1. ***Handling the Highly Correlated Features***
    Upon analysing the correlation matrix, several pairs displayed high correlation coefficients, indicating possible multicollinearity, which has a negative impact on model performance. To mitigate the issue we created new features that capture the relationship between these correlated variables and dropped the original features.


> Before Applying PCA for feature reduction we need to apply Encoding, then as an additional step we use Boruta selection algorithm. Then PCA





In [None]:
df.info()

In [43]:
#One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the two part-of-day columns; every other column is passed through.
categorical_cols = ['pickuptime','droptime']

columntransformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), categorical_cols)],
    remainder='passthrough',
    # sparse_threshold=0 makes the transformer always return a dense array. Its
    # output is handed straight to the DataFrame constructor below, which would
    # not work with a scipy sparse matrix.
    sparse_threshold=0,
)

# Kept under its own name so `encoded_df` only ever refers to the DataFrame.
encoded_array = columntransformer.fit_transform(df)

# Column names come from the fitted transformer; passthrough columns get the
# 'remainder__' prefix (e.g. 'remainder__duration').
encoded_df = pd.DataFrame(encoded_array, columns=columntransformer.get_feature_names_out())

In [None]:
encoded_df.info()

In [None]:
encoded_df.head()

### Boruta Feature Selection

    Boruta is an all relevant feature selection method, while most other are minimal optimal; this means it tries to find all features carrying information usable for prediction, rather than finding a possibly compact subset of features on which some classifier has a minimal error.

In [None]:
print(encoded_df.shape)
print(df['duration'].shape)

encoded_df.isna().sum()

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor

# Depth-limited random forest handed to Boruta as its estimator.
rf = RandomForestRegressor(max_depth=5, n_jobs=-1)

# Feature matrix: every encoded column except the target. Target: the duration column.
feature_frame = encoded_df.drop(columns=['remainder__duration'])
X = feature_frame.values
y = encoded_df['remainder__duration'].values

boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    max_iter=100,
    random_state=42,
    verbose=0,
)


print("Satarting Fittin")
boruta_selector.fit(X,y)
print("Fitting Done")



In [62]:
# Keep the features whose Boruta ranking is 1 or 2. The ranking array is aligned
# with the feature columns in the order they were passed to fit (target excluded).
feature_columns = encoded_df.drop(columns=['remainder__duration']).columns
selected_feature = feature_columns[boruta_selector.ranking_ <=2]

# Pick the features and the target in one step and take a copy, so the result is
# an independent frame rather than a slice of encoded_df that is then written to
# (which triggers SettingWithCopyWarning).
selected_df = encoded_df[list(selected_feature) + ['remainder__duration']].copy()

In [None]:
selected_df.head()

In [None]:
selected_df.info()

In [65]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Standardise the selected features, then project them onto principal components.
features = selected_df.drop(columns=['remainder__duration'])

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# PCA cannot return more components than there are samples or features, so the
# requested 5 is capped by the shape of the feature matrix.
n_components = min(5, *features.shape)
# random_state makes the projection repeatable when a randomized solver is used.
pca = PCA(n_components=n_components, random_state=42)
pca_features = pca.fit_transform(scaled_features)

# Give the component frame selected_df's index explicitly so the column-wise
# concat below lines rows up by construction, not by coincidence of two indexes.
pca_df = pd.DataFrame(
    pca_features,
    columns=[f'PCA{i+1}' for i in range(pca.n_components_)],
    index=selected_df.index,
)
# NOTE(review): this rebinds `df` to the target + PCA components, while the
# modelling cells further down read `selected_df` - confirm which is intended.
df = pd.concat([selected_df[['remainder__duration']], pca_df], axis =1)

In [None]:
df.head(5)

## Modelling 
    It all comes down to the Modelling part. As the data is big and complex we were tempted to use a boosting algorithm


In [67]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix, classification_report


# Target is the duration column; features are every other selected column.
y = selected_df['remainder__duration']
X = selected_df.drop('remainder__duration', axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [68]:
# Gradient-boosting regressor: 100 trees of depth 3, learning rate 0.1, fixed seed.
gbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)


In [None]:
gbr.fit(X_train,y_train)

In [70]:
y_pred = gbr.predict(X_test)


In [71]:
mse = mean_squared_error(y_test,y_pred)

In [None]:
mse

In [73]:
r2 = r2_score(y_test, y_pred)

In [None]:
r2

In [None]:
from sklearn.metrics import root_mean_squared_error


root_mean_squared_error(y_test, y_pred)