In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Lasso,RidgeCV,ElasticNet
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  # Feature Scaling


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV


In [3]:
# Load the training data into `df`.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
TRAIN_CSV_PATH = "/home/shahabas/shahabas/hackathon/Metadata/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
0,748.0,2.75,1.0,75.0,24,6.3,105.3,0
1,1187.0,3.43,1.0,105.0,24,13.2,142.2,0
2,730.0,3.12,1.0,71.25,0,26.625,97.875,1
3,671.0,5.63,3.0,90.0,0,9.75,99.75,0
4,329.0,2.09,1.0,45.0,12,13.2,70.2,0


In [5]:
# Remove fully duplicated rows, rebinding `df` rather than mutating it in place.
df = df.drop_duplicates()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205348 entries, 0 to 209672
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   trip_duration       205348 non-null  float64
 1   distance_traveled   205348 non-null  float64
 2   num_of_passengers   205348 non-null  float64
 3   fare                205348 non-null  float64
 4   tip                 205348 non-null  int64  
 5   miscellaneous_fees  205348 non-null  float64
 6   total_fare          205348 non-null  float64
 7   surge_applied       205348 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 14.1 MB


In [7]:
df.describe()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
count,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0,205348.0
mean,1189.102226,5.123167,1.298654,100.656568,13.24756,15.294384,129.198512,0.285803
std,4824.30216,126.528223,0.939258,86.142902,20.507879,12.621609,99.283531,0.451797
min,0.0,0.02,0.0,0.0,0.0,-0.5,0.0,0.0
25%,454.0,1.98,1.0,52.5,0.0,6.0,73.125,0.0
50%,716.0,3.25,1.0,78.75,9.0,9.75,103.5,0.0
75%,1110.0,5.81,1.0,116.25,20.0,26.525,153.45,1.0
max,86387.0,57283.91,9.0,4466.25,2500.0,435.0,4472.25,1.0


In [7]:
# import ydata_profiling as pf
# report=pf.ProfileReport(df,dark_mode=True)
# report.to_file("report.html")

In [8]:
# Remove the `fare` column from the working frame. `columns=` already selects
# the axis, so no separate `axis` argument is passed.
df = df.drop(columns=["fare"])

In [9]:
df.duplicated().sum()

0

In [10]:
df['num_of_passengers'].value_counts()

1.0    174959
2.0     17847
5.0      4712
6.0      3379
3.0      2757
4.0       908
0.0       751
8.0        25
9.0         8
7.0         2
Name: num_of_passengers, dtype: int64

In [11]:
# Rows recorded with 0 passengers are rewritten to 1 passenger; all other
# values are left untouched.
df["num_of_passengers"] = df["num_of_passengers"].replace({0.0: 1.0})


In [12]:
# Round the monetary columns to two decimal places, one column at a time.
for money_col in ("tip", "miscellaneous_fees", "total_fare"):
    df[money_col] = df[money_col].round(2)

In [13]:
df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,tip,miscellaneous_fees,total_fare,surge_applied
0,748.0,2.75,1.0,24,6.3,105.3,0
1,1187.0,3.43,1.0,24,13.2,142.2,0
2,730.0,3.12,1.0,0,26.62,97.88,1
3,671.0,5.63,3.0,0,9.75,99.75,0
4,329.0,2.09,1.0,12,13.2,70.2,0


In [14]:
df['distance_traveled'].unique()

array([ 2.75,  3.43,  3.12, ..., 51.58, 59.88, 33.72])

In [13]:
# Preprocessing for numeric features: fill missing values with the column
# median, then standardise with StandardScaler.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [14]:
df.columns

Index(['trip_duration', 'distance_traveled', 'num_of_passengers', 'tip',
       'miscellaneous_fees', 'total_fare', 'surge_applied'],
      dtype='object')

In [15]:
# Names of the columns routed through the numeric preprocessing pipeline
# (every remaining column except the target `total_fare`).
num_col = [
    "trip_duration",
    "distance_traveled",
    "num_of_passengers",
    "tip",
    "miscellaneous_fees",
    "surge_applied",
]

In [16]:
# Feature matrix: every column of `df` except the target `total_fare`.
x = df.drop(columns=["total_fare"])

In [17]:
# Regression target: the `total_fare` column.
y = df["total_fare"]

In [18]:
# Apply the numeric pipeline to the columns listed in `num_col`.
numeric_transformer = ("numerical_pipeline", num_pipeline, num_col)
preprocessor = ColumnTransformer(transformers=[numeric_transformer])

In [19]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [22]:
y_train

21064     154.12
74499      92.70
207048    107.25
70857      52.20
22472      99.75
           ...  
121465     56.55
104856     43.50
133809     36.00
149160     94.95
123598    613.88
Name: total_fare, Length: 143743, dtype: float64

In [20]:
# Fit the preprocessor on the training split only and rebuild a labelled
# DataFrame from the transformed values.
# The original row index is passed through so that x_train stays
# label-aligned with y_train (which keeps the index it had in `df`); without
# it the frame would get a fresh 0..n-1 index and any index-aligned pandas
# operation between features and target would silently mismatch rows.
# fit_transform() must be evaluated before get_feature_names_out(), which
# requires a fitted transformer, so the argument order matters.
x_train = pd.DataFrame(
    preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
    index=x_train.index,
)


In [21]:
# Transform the held-out split with the already-fitted preprocessor (no
# refitting, so no test-set statistics leak into the scaler) and rebuild a
# labelled DataFrame.
# The original row index is passed through so that x_test stays label-aligned
# with y_test instead of receiving a fresh 0..n-1 index.
x_test = pd.DataFrame(
    preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
    index=x_test.index,
)

In [29]:
# Hyper-parameter candidates for the random-forest search: 3 x 3 = 9
# combinations in total.
# (A "criterion" entry was previously considered and is currently disabled.)
params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
)

In [30]:
# Randomized hyper-parameter search over `params` for a random forest, scored
# with 3-fold cross-validation.
# Both the search and the forest are seeded: an unseeded forest gives
# different fold scores on every run, so the reported best parameters would
# not be reproducible. The seed matches the one used for the other estimators
# in this notebook.
cv = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=params,
    cv=3,
    verbose=3,
    random_state=42,
)

In [31]:
# Run the hyper-parameter search on the preprocessed training split.
cv.fit(X=x_train, y=y_train)



Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END ......max_depth=5, n_estimators=50;, score=0.748 total time=   4.6s
[CV 2/3] END ......max_depth=5, n_estimators=50;, score=0.765 total time=   4.3s
[CV 3/3] END ......max_depth=5, n_estimators=50;, score=0.834 total time=   4.3s
[CV 1/3] END .....max_depth=5, n_estimators=100;, score=0.749 total time=   8.6s
[CV 2/3] END .....max_depth=5, n_estimators=100;, score=0.761 total time=   8.6s
[CV 3/3] END .....max_depth=5, n_estimators=100;, score=0.838 total time=   8.6s
[CV 1/3] END .....max_depth=5, n_estimators=200;, score=0.750 total time=  17.2s
[CV 2/3] END .....max_depth=5, n_estimators=200;, score=0.761 total time=  17.2s
[CV 3/3] END .....max_depth=5, n_estimators=200;, score=0.835 total time=  17.1s
[CV 1/3] END .....max_depth=10, n_estimators=50;, score=0.786 total time=   8.0s
[CV 2/3] END .....max_depth=10, n_estimators=50;, score=0.777 total time=   8.0s
[CV 3/3] END .....max_depth=10, n_estimators=50;,

In [32]:
cv.best_params_

{'n_estimators': 100, 'max_depth': 10}

In [25]:
def eval_model(true, predict):
    """Compute regression metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predict: Predicted target values, given in the same order as ``true``.

    Returns:
        A 4-tuple ``(mse, mae, rmse, r2)``: mean squared error, mean absolute
        error, the square root of the mean squared error, and the R^2 score.
    """
    squared_err = mean_squared_error(true, predict)
    absolute_err = mean_absolute_error(true, predict)
    r_squared = r2_score(true, predict)
    # RMSE is derived from the MSE rather than recomputed from the inputs.
    return squared_err, absolute_err, np.sqrt(squared_err), r_squared

In [30]:
# Base learners for the stacking ensemble: a depth-limited random forest and
# a linear support-vector regressor, both seeded.
forest_learner = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
svr_learner = LinearSVR(random_state=42)
estimators = [
    ("random", forest_learner),
    ("svr", svr_learner),
]

In [31]:
# Candidate models keyed by display name. Currently only the stacking
# ensemble: the base learners in `estimators` feed a random-forest
# final estimator.
models = {
    "stackingRegressor": StackingRegressor(
        estimators=estimators,
        final_estimator=RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    ),
    # "SVM_Regression":SVR()
}

# Parallel lists: model names and their R^2 scores, in evaluation order.
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every access.
# `model` intentionally stays bound after the loop: later cells use the last
# fitted model for prediction.
for model_name, model in models.items():
    model.fit(x_train, y_train)

    # Score on the held-out split, not on the data the model was fitted on.
    y_pred = model.predict(x_test)
    mse, mae, rmse, accuracy = eval_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # These metrics come from x_test / y_test, so they are labelled as test
    # performance (the previous label said "Training").
    print('Model Test Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    # `accuracy` holds the R^2 score; it is printed as a percentage.
    print("R2 score:", accuracy * 100)

    r2_list.append(accuracy)

    print('=' * 35)
    print('\n')



stackingRegressor
Model Training Performance
RMSE: 40.86176924969857
MAE: 7.456754272716169
R2 score: 82.58774247944464




In [49]:
# Load the test data into `test_df`.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
test_data = "/home/shahabas/shahabas/hackathon/Metadata/test.csv"
test_df = pd.read_csv(test_data)

In [50]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89861 entries, 0 to 89860
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   trip_duration       89861 non-null  float64
 1   distance_traveled   89861 non-null  float64
 2   num_of_passengers   89861 non-null  float64
 3   fare                89861 non-null  int64  
 4   tip                 89861 non-null  int64  
 5   miscellaneous_fees  89861 non-null  float64
 6   total_fare          89861 non-null  int64  
 7   surge_applied       89861 non-null  int64  
dtypes: float64(4), int64(4)
memory usage: 5.5 MB


In [51]:
test_df.head()

Unnamed: 0,trip_duration,distance_traveled,num_of_passengers,fare,tip,miscellaneous_fees,total_fare,surge_applied
0,1076.0,4.18,1.0,0,0,13.5,0,0
1,429.0,1.48,4.0,0,0,13.5,0,0
2,856.0,4.15,1.0,0,24,6.0,0,0
3,622.0,3.22,1.0,0,15,5.625,0,0
4,507.0,3.98,1.0,0,0,2.25,0,0


In [52]:
# Run the test data through the already-fitted preprocessor and predict with
# the last model fitted in the evaluation loop.
# NOTE(review): `test_df` is overwritten with its transformed version, so this
# cell cannot be re-run without reloading the raw test data first; `y_pred`
# also replaces the predictions previously made on x_test.
test_features = preprocessor.transform(test_df)
test_df = pd.DataFrame(test_features, columns=preprocessor.get_feature_names_out())
y_pred = model.predict(test_df)

In [54]:
y_pred.shape

(89861,)

In [57]:
y_pred[2]

119.48647221732087

In [66]:
# Write one prediction per line, rounded to two decimals, with no header row.
# NOTE(review): absolute, machine-specific output path.
with open("/home/shahabas/shahabas/hackathon/Metadata/submission.csv","w") as file:
    file.writelines(str(round(pred, 2)) + '\n' for pred in y_pred)

In [61]:
# Print every prediction at full precision, one per line.
# The loop body was missing (a bare `for` header is a SyntaxError); the saved
# output below this cell lists the raw y_pred values, so the body is restored
# as a plain print.
# NOTE(review): this dumps every prediction; consider y_pred[:10] instead.
for i in y_pred:
    print(i)

107.90324074764021
58.37213908350786
119.48647221732087
90.13251847512245
106.04372263878724
180.61318943045535
53.904804448905324
79.52662907765269
69.55633374037862
199.2706511167922
86.8633179658752
49.32424350490971
87.39207310045515
103.29952486987928
101.55254499363467
84.70499988819543
172.22320942143688
102.87582372335393
134.02310317476852
399.75784352610503
250.76513434912434
77.74026222879883
608.3272515429518
156.7538200879225
74.90824464572056
78.30815698802277
213.1754631446416
197.15984140228102
194.28021460440323
142.7459412585772
238.70597645724908
41.70361085624334
417.8221504780361
79.00667169037335
97.18275039331513
211.26744680263852
121.62995336804572
65.4504208303799
310.8020323862082
107.28913035683802
52.29664118635218
156.0050241131596
121.38345543275193
63.2140038634027
55.15403601420806
162.94155076438983
51.68297611941034
62.71788459638907
48.05663688801006
60.578654610403945
101.27624197410502
104.41953415100159
168.76648293807114
132.28642329570994
80.266