In [55]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [34]:
# load your dataset into a pandas DataFrame
df = pd.read_excel('C:/Users/Soumya Ranjan/Desktop/Workspace/UCI/Winter/211/Flight_Price_Prediction/t.xlsx')
# define the columns containing string values
string_cols = ['Airline', 'Date_of_Journey', 'Source','Destination','Route','Dep_Time','Arrival_Time','Duration','Total_Stops','Additional_Info']

# create a LabelEncoder object for each string column
label_encoders = {}
for col in string_cols:
    le = LabelEncoder()
    le.fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le

# save the label encoders for future use
import pickle
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [35]:
# separate the price column from the rest of the data
X = df.drop('Price', axis=1)
y = df['Price']

In [36]:
X

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,3,24,0,5,18,211,233,240,4,8
1,1,6,3,0,84,31,906,336,1,8
2,4,43,2,1,118,70,413,106,1,8
3,3,10,3,0,91,164,1324,311,0,8
4,3,0,0,5,29,149,1237,303,0,8
...,...,...,...,...,...,...,...,...,...,...
10678,0,41,3,0,64,183,1259,236,4,8
10679,1,29,3,0,64,193,1305,237,4,8
10680,4,29,0,2,18,58,824,280,4,8
10681,10,0,0,5,18,92,938,238,4,8


In [32]:
y

0         3897
1         7662
2        13882
3         6218
4        13302
         ...  
10678     4107
10679     4145
10680     7229
10681    12648
10682    11753
Name: Price, Length: 10683, dtype: int64

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [46]:
rf.fit(X_train,y_train)

RandomForestRegressor(random_state=42)

In [47]:
y_pred = rf.predict(X_test)

In [53]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean squared error:', mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R2 score:", r2)

Mean squared error: 2415641.287413655
RMSE: 1554.2333439395948
MAE: 786.2844852578529
R2 score: 0.8858606519657815


In [56]:
# Define the hyperparameters and their possible values
param_grid = {
    'n_estimators': [10, 50, 100, 500],
    'max_depth': [2, 4, 8, None],
    'max_features': ['auto', 'sqrt']
}

# Create a Random Forest Classifier object
rfr = RandomForestRegressor()

# Use Grid Search to find the best hyperparameters
grid_search = GridSearchCV(rfr, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)


# print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# evaluate the best model on the test data
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test score:", test_score)

Best parameters: {'max_depth': None, 'max_features': 'auto', 'n_estimators': 500}
Best score: 0.8520823326624569
Test score: 0.8802168754795228


In [77]:
#Let's stick with these parameters for our training model and import our test dataset now.
df2 = pd.read_excel('C:/Users/Soumya Ranjan/Desktop/Workspace/UCI/Winter/211/Flight_Price_Prediction/v.xlsx')
string_cols = ['Airline', 'Date_of_Journey', 'Source','Destination','Route','Dep_Time','Arrival_Time','Duration','Total_Stops','Additional_Info']

# create a LabelEncoder object for each string column
label_encoders = {}
for col in string_cols:
    le = LabelEncoder()
    le.fit(df2[col])
    df2[col] = le.transform(df2[col])
    label_encoders[col] = le

# save the label encoders for future use
import pickle
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [78]:
X_pred=df2

In [79]:
rf = RandomForestRegressor(n_estimators=500, random_state=42)

In [80]:
rf.fit(X,y)

RandomForestRegressor(n_estimators=500, random_state=42)

In [81]:
y_pred=rf.predict(X_pred)

In [82]:
y_pred

array([ 8171.326     ,  5876.124     ,  9030.5641    , ...,
        8451.422     , 11514.714     ,  9524.09916667])