# 1. Importing Libraries

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the imports below resolve against the packages installed here.
%pip install feature-engine matplotlib xgboost


import numpy as np
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from feature_engine.datetime import DatetimeFeatures

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import learning_curve
from xgboost import XGBRegressor

import joblib
import matplotlib.pyplot as plt

In [2]:
#import sys
#print(sys.executable)


# 2. Display Settings

In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [4]:
# Ask scikit-learn transformers to use their "default" output container.
# NOTE(review): "default" is presumably scikit-learn's stock setting — confirm this call is still needed.
sklearn.set_config(transform_output="default")

# 3. Getting Data


In [5]:
# Load the training split; the path is relative to the notebook's working directory.
train_df = pd.read_csv("train.csv")

In [6]:
# Display the loaded training frame.
train_df

In [7]:
# Load the validation split; the path is relative to the notebook's working directory.
val_df = pd.read_csv("val.csv")

In [8]:
# Display the loaded validation frame.
val_df

In [9]:
# Load the held-out test split; the path is relative to the notebook's working directory.
test_df = pd.read_csv("test.csv")

In [10]:
# Display the loaded test frame.
test_df

# 3.1 Splitting the data

In [11]:
def split_data(data):
	"""Separate a frame into its feature columns and the ``price`` target.

	Parameters
	----------
	data : pandas.DataFrame
		Frame that contains a ``price`` column.

	Returns
	-------
	tuple
		``(X, y)`` where ``X`` is ``data`` without the ``price`` column and
		``y`` is an independent copy of the ``price`` column.
	"""
	features = data.drop(columns=["price"])
	# Copy the target so later edits to it cannot touch the source frame.
	target = data["price"].copy()
	return features, target

In [12]:
# Features and price target of the training split.
X_train, y_train = split_data(train_df)

In [13]:
# Display the training features.
X_train

In [14]:
# Display the training target.
y_train

In [15]:
# Features and price target of the validation split.
X_val, y_val = split_data(val_df)

In [16]:
# Print the shapes of the validation features and target.
print(X_val.shape, y_val.shape)

In [17]:
# Features and price target of the test split.
X_test, y_test = split_data(test_df)

In [18]:
# Print the shapes of the test features and target.
print(X_test.shape, y_test.shape)

# 3.2 Meta Info

In [19]:
# Summary of the training features (columns, dtypes, non-null counts).
X_train.info()

# 4. Data Preprocessing

In [20]:
# Numerical columns: routed to `num_transformer` (median imputation + standard scaling).

num_cols = ["duration", "total_stops"]

In [21]:
# Date/time columns: excluded from the categorical set and handled by the
# date-of-journey and time pipelines defined further down.

dt_cols = ["date_of_journey", "dep_time", "arrival_time"]

In [22]:
# Categorical columns: every feature that is neither numerical nor date/time,
# kept in the column order of X_train.

cat_cols = [
	col
	for col in X_train.columns
	if not (col in dt_cols or col in num_cols)
]

In [23]:
# Display the derived categorical column names.
cat_cols

In [24]:
# Numerical pipeline: fill missing values with the column median, then standardize.
num_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="median")),
	("scaler", StandardScaler())
])

In [25]:
# Categorical pipeline: fill missing values with the most frequent category, then
# one-hot encode into a dense array. handle_unknown="ignore" keeps transform from
# failing on categories that were not present when the encoder was fitted.
cat_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

In [26]:
# Date-of-journey pipeline: fill missing dates with the most frequent value,
# extract month / week / day-of-week / day-of-month features, then standardize them.
# NOTE(review): format="mixed" presumably lets each value be parsed with its own
# date format — confirm against the feature-engine / pandas documentation.
doj_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("extractor", DatetimeFeatures(features_to_extract=["month", "week", "day_of_week", "day_of_month"], format="mixed")),
	("scaler", StandardScaler())
])

In [27]:
# Time-of-day pipeline: fill missing times with the most frequent value,
# extract hour and minute features, then standardize them.
time_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("extractor", DatetimeFeatures(features_to_extract=["hour", "minute"], format="mixed")),
	("scaler", StandardScaler())
])

In [28]:
# Route each group of columns to its own pipeline. The date/time branches name
# their columns explicitly; together they cover the same three columns as dt_cols.
preprocessor = ColumnTransformer(transformers=[
	("num", num_transformer, num_cols),
	("cat", cat_transformer, cat_cols),
	("doj", doj_transformer, ["date_of_journey"]),
	("time", time_transformer, ["dep_time", "arrival_time"])
])

In [29]:
# Fit the preprocessor on the training features and display the transformed output.
preprocessor.fit_transform(X_train)



In [30]:
# Shape of the transformed training matrix (note: this fits the preprocessor again).
preprocessor.fit_transform(X_train).shape

# 5. Model Selection

In [31]:
# Candidate regressors compared through learning curves, keyed by display name.
# The randomized models are seeded (same seed value as the learning_curve call
# in this notebook) so that re-running the notebook reproduces the comparison.
algorithms = {
	"Linear Regression": LinearRegression(),
	"Support Vector Machine": SVR(),
	"Random Forest": RandomForestRegressor(n_estimators=10, random_state=42),
	"XG Boost": XGBRegressor(n_estimators=10, random_state=42)
}

In [32]:
# Stack the training and validation rows into one frame for model selection.
# The original row labels are kept (no ignore_index), so index values may repeat.
data = pd.concat([train_df, val_df], axis=0)
data

In [33]:
# Features and price target of the combined train + validation frame.
X_data, y_data = split_data(data)
print(X_data.shape, y_data.shape)

In [34]:
def plot_curves(sizes, mean_scores, std_scores, label, ax):
	"""Draw one score curve with a +/- one standard deviation band on ``ax``.

	Parameters
	----------
	sizes : array-like
		X coordinates (training-set sizes).
	mean_scores : array-like
		Mean score at each size; drawn as a line with circular markers.
	std_scores : array-like
		Spread at each size; the shaded band spans ``mean - std`` to ``mean + std``.
	label : str
		Legend label of the line.
	ax : matplotlib Axes (or any object exposing ``plot`` and ``fill_between``)
		Target axes.
	"""
	# Line through the mean scores.
	ax.plot(sizes, mean_scores, marker="o", label=label)

	# Semi-transparent band of one standard deviation around the mean.
	band_low = mean_scores - std_scores
	band_high = mean_scores + std_scores
	ax.fill_between(x=sizes, y1=band_low, y2=band_high, alpha=0.5)

In [35]:
def plot_learning_curves(name, algorithm, figsize=(12, 4)):
	"""Plot training and cross-validated R-square learning curves for one algorithm.

	The algorithm is chained after the notebook-level ``preprocessor`` in a
	``Pipeline`` and evaluated with 3-fold cross-validation on the notebook-level
	``X_data`` / ``y_data``. The legend shows the mean +/- standard deviation of
	the scores at the largest training-set size.

	Parameters
	----------
	name : str
		Used as the figure title.
	algorithm : estimator
		Regressor placed as the final step of the pipeline.
	figsize : tuple, default=(12, 4)
		Figure size forwarded to ``plt.subplots``.

	Returns
	-------
	None
		The figure is shown with ``plt.show()``.
	"""
	model = Pipeline(steps=[
		("pre", preprocessor),
		("alg", algorithm)
	])

	# learning_curve only uses random_state when shuffle=True. Without it the
	# seed is silently ignored and each training subset is a plain prefix of the
	# fold's rows, so shuffling is enabled explicitly to make the seed effective.
	train_sizes, train_scores, test_scores = learning_curve(
		estimator=model,
		X=X_data,
		y=y_data,
		cv=3,
		scoring="r2",
		n_jobs=-1,
		shuffle=True,
		random_state=42
	)

	# Aggregate across axis 1 (the CV folds); index -1 is the largest training size.
	mean_train_scores = np.mean(train_scores, axis=1)
	std_train_scores = np.std(train_scores, axis=1)
	train_score = f"{mean_train_scores[-1]:.2f} +/- {std_train_scores[-1]:.2f}"

	mean_test_scores = np.mean(test_scores, axis=1)
	std_test_scores = np.std(test_scores, axis=1)
	test_score = f"{mean_test_scores[-1]:.2f} +/- {std_test_scores[-1]:.2f}"

	fig, ax = plt.subplots(figsize=figsize)

	# training curve
	plot_curves(
		train_sizes,
		mean_train_scores,
		std_train_scores,
		f"Train ({train_score})",
		ax
	)

	# cross-validation curve (labelled "Test" in the legend)
	plot_curves(
		train_sizes,
		mean_test_scores,
		std_test_scores,
		f"Test ({test_score})",
		ax
	)

	ax.set(xlabel="Training Set Size", ylabel="R-square", title=name)

	ax.legend(loc="lower right")

	plt.show()

In [36]:
# Draw one learning-curve figure per candidate algorithm.
for name, alg in algorithms.items():
	plot_learning_curves(name, alg)
    
# The learning curves for each candidate algorithm are displayed below:

# 6. Model Training

In [37]:
# Final pipeline: shared preprocessing followed by a random forest regressor.
# The forest is seeded so that re-running the notebook rebuilds the same model
# before it is evaluated and persisted.
model = Pipeline(steps=[
	("pre", preprocessor),
	("rf", RandomForestRegressor(n_estimators=10, random_state=42))
])

In [38]:
# Fit the full pipeline on the combined train + validation data.
model.fit(X_data, y_data)

# 7. Model Evaluation

In [39]:
def evaluate_model(X, y):
	"""Return the R-square score of the notebook-level ``model`` on ``(X, y)``.

	Parameters
	----------
	X : features passed to ``model.predict``.
	y : true target values compared against the predictions.
	"""
	predictions = model.predict(X)
	score = r2_score(y, predictions)
	return score

In [40]:
# Score on the data the model was fitted on (train + validation).
print(f"R2 score on Training data is = {evaluate_model(X_data, y_data)}")

In [41]:
# Score on the held-out test split, which was not part of X_data / y_data.
print(f"R2 score on Test data is = {evaluate_model(X_test, y_test)}")

# 8. Model Persistence

In [42]:
# Persist the fitted pipeline (preprocessing + regressor) to model.joblib in the working directory.
joblib.dump(model, "model.joblib")

In [43]:
# Reload the persisted pipeline and display it.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
saved_model = joblib.load("model.joblib")
saved_model

In [44]:
# Round-trip check: predict on the test split with the reloaded pipeline and
# score it with the same metric that evaluate_model uses.
y_pred = saved_model.predict(X_test)

r2_score(y_test, y_pred)