In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the raw table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV;
# confirm against the data source). Rebinding `df` instead of using
# inplace=True, together with errors='ignore', makes this cell idempotent:
# re-running it once the column is gone is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Turn the textual stop labels into integers; 'two_or_more' is collapsed to 2.
stops_encoding = {'zero': 0, 'one': 1, 'two_or_more': 2}
df['stops'] = df['stops'].replace(stops_encoding).astype('int64')

In [None]:
# Preview the frame again after the cleaning steps above.
df.head(n=5)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
summary_stats = df.describe(include='all')
summary_stats

In [None]:
# Frequency table for each listed column, separated by a blank line.
count_columns = ('airline', 'flight', 'source_city', 'destination_city', 'class')
for name in count_columns:
    print(df[name].value_counts(), end='\n\n')

In [None]:
# Average ticket price per airline, rounded to two decimals.
df.groupby('airline')['price'].agg('mean').round(2)

In [None]:
# NOTE(review): this cell computes the same per-airline mean price as the
# cell directly before it; it looks like a leftover duplicate.
df['price'].groupby(df['airline']).mean().round(2)

In [None]:
# Histogram of log(1 + price), the same transform later applied to the target.
log_price = np.log1p(df['price'])
sns.histplot(data=log_price)

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
# Quick look at the columns before scoring them against the price.
df.head(n=5)

In [None]:
# Mutual information between the price and every categorical column.
#
# The previous hand-written list named 'source_city' twice, so one column was
# scored twice and (most likely) another categorical column was left out.
# Deriving the list from the frame's dtypes removes the duplicate and picks
# up every non-numeric column that is actually present.
categorical = df.select_dtypes(exclude='number').columns.tolist()
for col in categorical:
    score = mutual_info_score(df['price'], df[col])
    # Fixed precision keeps the score column aligned; a raw float repr is
    # wider than the 10-character field and defeated the alignment.
    print(f"{col:<20}{score:>10.4f}")

In [None]:
# Correlation of each numeric feature with the price.
numeric_features = ['stops', 'duration', 'days_left']
df[numeric_features].corrwith(df['price'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Hold out 20% for testing, then take 25% of the remainder for validation,
# which yields a 60/20/20 train/validation/test partition.
df_full_train, df_test = train_test_split(df, random_state=1, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, random_state=1, test_size=0.25)

In [None]:
# Targets are log(1 + price) for each partition, as plain NumPy arrays.
y_train, y_val, y_test = (
    np.log1p(part['price'].values)
    for part in (df_train, df_val, df_test)
)

In [None]:
# Give every partition a fresh 0..n-1 index, discarding the shuffled one.
df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_train, df_val, df_test)
)

In [None]:
# Remove the target column from each feature frame, in place, so it cannot
# be fed to the models as a feature.
for _partition in (df_train, df_val, df_test):
    del _partition['price']

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error as rmse

In [None]:
# One dict per row: the record format DictVectorizer consumes.
train_dict, val_dict = (
    part.to_dict(orient='records')
    for part in (df_train, df_val)
)

In [None]:
# Learn the feature vocabulary from the training records only, then encode
# both partitions into dense matrices with that vocabulary.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

In [None]:
# Linear-regression baseline: fit on train, report RMSE (log scale) on val.
model = LinearRegression().fit(X_train, y_train)

y_pred = model.predict(X_val)
rmse(y_true=y_val, y_pred=y_pred)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision tree with default hyperparameters. random_state is pinned so that
# ties between equally good splits are resolved the same way on every run,
# making the reported validation score reproducible.
dt = DecisionTreeRegressor(random_state=1)
dt.fit(X_train, y_train)

In [None]:
# Validation RMSE (on the log scale) of the tree fitted above.
y_pred = dt.predict(X_val)
rmse(y_true=y_val, y_pred=y_pred)

In [None]:
# Sweep max_depth and print the validation RMSE for each setting.
# random_state is fixed so that differences between depths reflect the
# hyperparameter and not run-to-run tie-breaking noise.
for m in [20, 22, 25, 27, 30, 35, 37, 40]:
    dt = DecisionTreeRegressor(max_depth=m, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    print(f'Max Depth: {m} Score: {rmse(y_val, y_pred)}')

In [None]:
# Sweep min_samples_leaf at max_depth=35 and print the validation RMSE.
# The grid now covers 1..10 without a gap (the original list skipped 5,
# which looks accidental), and random_state is fixed so that the scores are
# comparable across settings and across runs.
for m in range(1, 11):
    dt = DecisionTreeRegressor(max_depth=35, min_samples_leaf=m, random_state=1)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    print(f'Sample: {m} Score: {rmse(y_val, y_pred)}')

In [None]:
# Tree with the hyperparameters chosen from the sweeps above. random_state
# is pinned so this score can be reproduced and compared with the sweeps.
dt = DecisionTreeRegressor(max_depth=35, min_samples_leaf=4, random_state=1)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_val)
rmse(y_val, y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default hyperparameters as a reference point.
# random_state makes the bootstrap samples (and therefore the score)
# reproducible; n_jobs=-1 builds the trees on all available cores, which
# does not change the result for a fixed seed.
rf = RandomForestRegressor(random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse(y_val, y_pred)

In [None]:
# Sweep the number of trees (10, 20, 30, 40) and print the validation RMSE.
# n_jobs=-1 fits the trees in parallel; with random_state fixed the fitted
# forest, and hence the score, is the same as in a single-threaded run.
for n in range(10, 41, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=2, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    print(f'Est: {n} Score: {rmse(y_val, y_pred)}')

In [None]:
# Forest with 10 trees and a fixed seed; report its validation RMSE.
rf = RandomForestRegressor(random_state=2, n_estimators=10).fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse(y_true=y_val, y_pred=y_pred)

In [None]:
import xgboost as xgb

In [None]:
# Wrap the train/validation matrices in DMatrix objects, carrying the
# vectorizer's feature names along.
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=X_val, label=y_val, feature_names=features)

In [None]:
# (DMatrix, label) pairs whose metric xgboost reports after each round.
watchlist = list(zip((dtrain, dval), ('train', 'val')))

In [None]:
# Booster configuration shared by both xgboost training runs below.
xgb_params = dict(
    # tree-growing settings
    eta=0.3,
    max_depth=20,
    min_child_weight=5,
    # learning task and parallelism
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)

In [None]:
# Train for 100 boosting rounds, logging the metric on train and validation.
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
)

In [None]:
# log(1 + price) target for the combined train + validation partition.
full_train_prices = df_full_train['price'].values
y_full_train = np.log1p(full_train_prices)

In [None]:
# Fresh 0..n-1 index and no target column in the feature frame.
df_full_train = (
    df_full_train
    .reset_index(drop=True)
    .drop(columns='price')
)

In [None]:
# Record dicts for the combined training partition and the held-out test set.
full_train_dict, test_dict = (
    part.to_dict(orient='records')
    for part in (df_full_train, df_test)
)

# Refit the existing vectorizer on the combined partition (this replaces the
# vocabulary it learned earlier), then encode both sets with it.
dv.fit(full_train_dict)
X_full_train = dv.transform(full_train_dict)
X_test = dv.transform(test_dict)

In [None]:
# DMatrix wrappers for the final fit, using the refitted vectorizer's
# feature names.
features = list(dv.get_feature_names_out())
d_full_train = xgb.DMatrix(data=X_full_train, label=y_full_train, feature_names=features)
dtest = xgb.DMatrix(data=X_test, label=y_test, feature_names=features)

In [None]:
# Report the metric on the combined training data and on the test set.
watchlist = list(zip((d_full_train, dtest), ('full_train', 'test')))

In [None]:
# Final model: 100 boosting rounds on train + validation, with the metric
# logged on the full training data and the test set.
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=d_full_train,
    num_boost_round=100,
    evals=watchlist,
)