<a href="https://colab.research.google.com/github/sardorbekhasan/air-ticket-price-prediction/blob/main/first_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [179]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler,
)

In [180]:
# Load the training set from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='train_data.csv')
df.head(n=5)

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


In [181]:
# Columns fed to the model and the column it is trained to predict.
feature_columns = ['stops', 'class', 'duration', 'days_left']
target_column = 'price'


def _as_single_column(values):
    """Reshape an array to shape (n, 1).

    Defined as a named module-level function rather than a lambda so that a
    FunctionTransformer wrapping it (and any pipeline containing it) can be
    pickled.
    """
    return values.reshape(-1, 1)


# validate=True makes FunctionTransformer check the input and convert it to a
# 2-D NumPy array before calling the function, so `.reshape` is always available.
transformer = FunctionTransformer(_as_single_column, validate=True)

In [182]:
# Column-wise preprocessing for the four model features.
#
# The two categorical columns are encoded with OrdinalEncoder and an explicit
# category order, which yields the codes zero=0, one=1, two_or_more=2 and
# Economy=0, Business=1. Compared with wrapping DataFrame.replace in lambdas,
# this keeps the preprocessor picklable, raises a clear error on a category
# that is not listed, and always emits a numeric 2-D array.
preprocessor = ColumnTransformer(
    transformers=[
        # Number of stops as an ordered integer-valued code.
        ('stops', OrdinalEncoder(categories=[['zero', 'one', 'two_or_more']]), ['stops']),
        # Cabin class as a 0/1 code.
        ('class', OrdinalEncoder(categories=[['Economy', 'Business']]), ['class']),
        # Standardize 'duration' and 'days_left' (zero mean, unit variance on the fit data).
        ('duration_scaler', StandardScaler(), ['duration']),
        ('days_left_scaler', StandardScaler(), ['days_left']),
    ],
    # Any column not named above is forwarded unchanged.
    remainder='passthrough',
)

In [183]:
# Fixed seed so the random forest (bootstrap samples and feature sub-sampling)
# produces the same fitted model and predictions on every Restart & Run All.
RANDOM_STATE = 42

# Full modelling pipeline: column preprocessing -> polynomial expansion of the
# preprocessed columns (all monomials up to degree 4, no constant term) ->
# random forest regressor.
# NOTE(review): tree ensembles are insensitive to feature scaling and can model
# interactions on their own, so the scaler and polynomial steps may be
# unnecessary cost - confirm with a held-out comparison before removing them.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial_features', PolynomialFeatures(degree=4, include_bias=False)),
    ('RF_model', RandomForestRegressor(random_state=RANDOM_STATE)),
])

In [184]:
# Separate the training frame into the regression target and the model inputs.
y = df[target_column]
X = df[feature_columns]

In [185]:
# Fit every pipeline step on the full training set; the fitted pipeline is the
# cell's displayed value.
model.fit(X=X, y=y)

In [186]:
# Predict on the same rows the model was fitted on (in-sample predictions).
y_predict = model.predict(X=X)

In [187]:
# Side-by-side comparison of the true prices and the predictions rounded to
# whole numbers.
rounded_predictions = y_predict.round().astype(int)
df1 = pd.DataFrame({'Actual': y, 'Predicted': rounded_predictions})
df1.head(n=10)

Unnamed: 0,Actual,Predicted
0,7212,6473
1,5292,5318
2,60553,57357
3,5760,5381
4,10712,8873
5,45257,46280
6,5054,5315
7,32923,32699
8,11383,11007
9,4357,6076


In [188]:
# In-sample metrics: y_predict was computed on the rows the model was fitted
# on, so these figures describe fit quality on the training data, not
# generalization to unseen flights.
print('Mean absolute error:', np.mean(np.abs(y_predict - y)))
# r2_score on the cached predictions is the same quantity model.score(X, y)
# reports for a regressor, without running a second prediction pass and
# without depending on what the global X currently refers to.
print('R2-score:', r2_score(y, y_predict))

Mean absolute error: 1898.9449691967611
R2-score: 0.9793986508401379


In [189]:
# Root-mean-squared error of the in-sample predictions.
# mean_squared_error comes from the notebook's import cell; it is not
# re-imported here so that all dependencies stay visible in one place.
# NOTE(review): the `lin_` prefix is misleading - the fitted model is a random
# forest pipeline, not a linear model. Names are kept for compatibility.
lin_mse = mean_squared_error(y, y_predict)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

3268.918719600272


In [190]:
# Load the unlabeled test set from the working directory and preview its first rows.
test_df = pd.read_csv(filepath_or_buffer='test_data.csv')
test_df.head(n=5)

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
1,2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
2,3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.0,30
3,4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
4,5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35


In [191]:
# Keep the test features under their own name instead of rebinding X: the
# earlier metric cells pair X with the training target y, so overwriting X with
# test rows would make re-running them fail or report nonsense.
X_test = test_df[feature_columns]
# y_predict now holds the test-set predictions; the next cell reads it to build
# the result table.
y_predict = model.predict(X_test)

In [192]:
# Build the result table from the ids that actually appear in the test file
# rather than a synthetic 1..n range, so every prediction stays attached to its
# own row even if the ids are not consecutive or not sorted. The DataFrame
# constructor also raises if y_predict does not have one value per test row.
result_df = pd.DataFrame({
    'id': test_df['id'].to_numpy(),
    'Price': y_predict.round().astype(int),
})
result_df.head(10)

Unnamed: 0,id,Price
0,1,68455
1,2,50509
2,3,29189
3,4,4307
4,5,3844
5,6,6171
6,7,6088
7,8,14017
8,9,4711
9,10,14867


In [193]:
# Write the predictions to disk; index=False leaves the row index out of the file.
result_df.to_csv(path_or_buf='result.csv', index=False)