In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  # Feature Scaling


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV


In [57]:
# Load the training data into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
df = pd.read_csv(
    "/home/shahabas/shahabas/hackathon/Metadata/train.csv"
)

In [58]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
0,748.0,2.75,1.0,75.0,24,6.3,105.3,0
1,1187.0,3.43,1.0,105.0,24,13.2,142.2,0
2,730.0,3.12,1.0,71.25,0,26.625,97.875,1
3,671.0,5.63,3.0,90.0,0,9.75,99.75,0
4,329.0,2.09,1.0,45.0,12,13.2,70.2,0


In [59]:
# Remove fully duplicated rows. Rebinding instead of `inplace=True` keeps the
# step explicit; the original row labels are retained either way.
df = df.drop_duplicates()

In [60]:
# Show column dtypes and non-null counts after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205348 entries, 0 to 209672
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   trip_duration       205348 non-null  float64
 1   distance_traveled   205348 non-null  float64
 2   num_of_passengers   205348 non-null  float64
 3   fare                205348 non-null  float64
 4   tip                 205348 non-null  int64  
 5   miscellaneous_fees  205348 non-null  float64
 6   total_fare          205348 non-null  float64
 7   surge_applied       205348 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 14.1 MB


In [61]:
# Summary statistics for the numeric columns.
# NOTE(review): the saved output shows extreme maxima (e.g. distance_traveled
# 57283.91, trip_duration 86387.0) and a negative miscellaneous_fees minimum;
# no outlier handling is visible in this notebook — confirm that is intended.
df.describe()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
count,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0
mean,1189.102226,5.123167,1.298654,100.656568,13.24756,15.294384,129.198512,0.285803
std,4824.30216,126.528223,0.939258,86.142902,20.507879,12.621609,99.283531,0.451797
min,0.0,0.02,0.0,0.0,0.0,-0.5,0.0,0.0
25%,454.0,1.98,1.0,52.5,0.0,6.0,73.125,0.0
50%,716.0,3.25,1.0,78.75,9.0,9.75,103.5,0.0
75%,1110.0,5.81,1.0,116.25,20.0,26.525,153.45,1.0
max,86387.0,57283.91,9.0,4466.25,2500.0,435.0,4472.25,1.0


In [62]:
# import ydata_profiling as pf
# report=pf.ProfileReport(df,dark_mode=True)
# report.to_file("report.html")

In [63]:
# Discard the "fare" column. In the rows previewed above, total_fare equals
# fare + tip + miscellaneous_fees, so keeping it would presumably leak the
# target — TODO confirm the relationship holds for the whole dataset.
df = df.drop(columns=["fare"])

In [64]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [65]:
# Frequency of each distinct passenger count.
df['num_of_passengers'].value_counts()

1.0    174959
2.0     17847
5.0      4712
6.0      3379
3.0      2757
4.0       908
0.0       751
8.0        25
9.0         8
7.0         2
Name: num_of_passengers, dtype: int64

In [66]:
# Recode trips recorded with zero passengers as single-passenger trips.
df["num_of_passengers"] = df["num_of_passengers"].replace({0.0: 1.0})


In [67]:
# Round the monetary columns to two decimal places in a single call.
df = df.round(
    {
        "tip": 2,
        "miscellaneous_fees": 2,
        "total_fare": 2,
    }
)

In [68]:
# Preview the frame again after the cleaning steps above.
df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,tip,miscellaneous_fees,total_fare,surge_applied
0,748.0,2.75,1.0,24,6.3,105.3,0
1,1187.0,3.43,1.0,24,13.2,142.2,0
2,730.0,3.12,1.0,0,26.62,97.88,1
3,671.0,5.63,3.0,0,9.75,99.75,0
4,329.0,2.09,1.0,12,13.2,70.2,0


In [69]:
# Inspect the distinct values of distance_traveled.
df['distance_traveled'].unique()

array([ 2.75,  3.43,  3.12, ..., 51.58, 59.88, 33.72])

In [70]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise each feature with StandardScaler.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [71]:
# List the column names currently present in the frame.
df.columns

Index(['trip_duration', 'distance_traveled', 'num_of_passengers', 'tip',
       'miscellaneous_fees', 'total_fare', 'surge_applied'],
      dtype='object')

In [72]:
# Feature columns routed through the numeric pipeline (everything except the
# target, total_fare).
num_col = [
    "trip_duration",
    "distance_traveled",
    "num_of_passengers",
    "tip",
    "miscellaneous_fees",
    "surge_applied",
]

In [73]:
# Feature matrix: every column except the target.
x = df.drop(columns=["total_fare"])

In [74]:
# Target vector: the total fare of each trip.
y = df["total_fare"]

In [75]:
# Apply the numeric pipeline to the columns listed in num_col.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", num_pipeline, num_col),
    ]
)

In [76]:
# Hold out 30% of the rows for testing; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [77]:
# Display the training target values.
y_train

21064     154.12
74499      92.70
207048    107.25
70857      52.20
22472      99.75
           ...  
121465     56.55
104856     43.50
133809     36.00
149160     94.95
123598    613.88
Name: total_fare, Length: 143743, dtype: float64

In [78]:
# Fit the preprocessor on the training split only and rebuild a labelled frame.
# The original row index is carried over so x_train stays aligned with y_train;
# without it the frame would get a fresh 0..n-1 index.
x_train = pd.DataFrame(
    preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
    index=x_train.index,
)


In [79]:
# Transform the test split with the already-fitted preprocessor (no refit).
# The original row index is carried over so x_test stays aligned with y_test.
x_test = pd.DataFrame(
    preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
    index=x_test.index,
)

In [80]:
# Candidate hyper-parameter values sampled by the randomised search.
params = dict(
    n_estimators=[50, 100, 200],
    criterion=["absolute_error", "poisson", "friedman_mse", "squared_error"],
    max_depth=[3, 5, 10],
)

In [1]:
# Randomised hyper-parameter search over `params` using cv=3.
# Both the forest and the parameter sampler are seeded (matching the seed used
# for the train/test split) so repeated runs pick the same candidates and
# produce the same scores.
cv = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=params,
    cv=3,
    verbose=3,
    random_state=42,
)

NameError: name 'RandomizedSearchCV' is not defined

In [2]:
# Run the hyper-parameter search on the preprocessed training split.
cv.fit(x_train,y_train)

NameError: name 'cv' is not defined

In [89]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so a fresh kernel resolves every dependency up front.
from sklearn.metrics import get_scorer_names

# Print the scorer names scikit-learn accepts for a `scoring=` argument.
valid_scoring_options = get_scorer_names()
print(valid_scoring_options)


['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weight

In [25]:
def eval_model(true,predict):
    """Score predictions against ground truth with four regression metrics.

    Parameters
    ----------
    true : observed target values.
    predict : predicted target values, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mse, mae, rmse, r2)`` — mean squared error, mean absolute error,
        the square root of the MSE, and the R^2 score, in that order.
    """
    squared_err = mean_squared_error(true, predict)
    return (
        squared_err,
        mean_absolute_error(true, predict),
        np.sqrt(squared_err),
        r2_score(true, predict),
    )

In [26]:
# Candidate regressors keyed by display name. The forest is seeded so repeated
# runs report the same scores.
models={
    "RandomForest":RandomForestRegressor(random_state=42),
    # "SVM_Regression":SVR()
}

# Names and R^2 scores collected in matching order, one entry per model.
model_list=[]
r2_list=[]

for name, model in models.items():
    # Fit on the training split, then score on the held-out test split.
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    mse,mae,rmse,r2=eval_model(y_test,y_pred)

    print(name)
    model_list.append(name)

    # These metrics come from x_test / y_test, so label them as test results.
    print('Model Test Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score:",r2*100)  # R^2 shown as a percentage

    r2_list.append(r2)

    print('='*35)
    print('\n')

