In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('Clean_Dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
300148,300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.0,49,81585
300152,300152,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,81585


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [6]:
df.dtypes

Unnamed: 0            int64
airline              object
flight               object
source_city          object
departure_time       object
stops                object
arrival_time         object
destination_city     object
class                object
duration            float64
days_left             int64
price                 int64
dtype: object

In [7]:
from sklearn.model_selection import train_test_split

In [11]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

In [10]:
class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        
    def mse(self, y):
        return np.mean((y - np.mean(y)) ** 2)
    
    def split(self, X, y, idx, split_val):
        left_idx = np.argwhere(X[:, idx] <= split_val).flatten()
        right_idx = np.argwhere(X[:, idx] > split_val).flatten()
        
        if len(left_idx) == 0 or len(right_idx) == 0:
            return None, None
        
        left_X, left_y = X[left_idx], y[left_idx]
        right_X, right_y = X[right_idx], y[right_idx]
        
        left_mse, right_mse = self.mse(left_y), self.mse(right_y)
        return left_X, left_y, right_X, right_y, left_mse + right_mse
    
    def find_best_split(self, X, y):
        best_idx, best_val, best_mse = None, None, np.inf
        
        for idx in range(X.shape[1]):
            for val in np.unique(X[:, idx]):
                left_X, left_y, right_X, right_y, mse = self.split(X, y, idx, val)
                if mse is not None and mse < best_mse:
                    best_idx, best_val, best_mse = idx, val, mse
        
        return best_idx, best_val, best_mse
    
    def fit(self, X, y, depth=0):
        if self.max_depth is not None and depth >= self.max_depth:
            self.left, self.right = None, None
            self.prediction = np.mean(y)
            return
        
        idx, val, mse = self.find_best_split(X, y)
        if idx is None:
            self.left, self.right = None, None
            self.prediction = np.mean(y)
            return
        
        left_X, left_y, right_X, right_y, _ = self.split(X, y, idx, val)
        self.feature_idx = idx
        self.split_val = val
        self.left = DecisionTreeRegressor(max_depth=self.max_depth)
        self.right = DecisionTreeRegressor(max_depth=self.max_depth)
        self.left.fit(left_X, left_y, depth=depth+1)
        self.right.fit(right_X, right_y, depth=depth+1)
    
    def predict(self, X):
        if self.left is None and self.right is None:
            return self.prediction
        
        if X[self.feature_idx] <= self.split_val:
            return self.left.predict(X)
        else:
            return self.right.predict(X)

In [12]:
class RandomForestRegressor:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        for i in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X, y)
            self.trees.append(tree)

    def predict(self, X):
        predictions = []
        for tree in self.trees:
            predictions.append(tree.predict(X))
        return np.mean(predictions, axis=0)

In [13]:
df.dtypes

Unnamed: 0            int64
airline              object
flight               object
source_city          object
departure_time       object
stops                object
arrival_time         object
destination_city     object
class                object
duration            float64
days_left             int64
price                 int64
dtype: object

In [14]:
from sklearn.preprocessing import LabelEncoder

#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['airline'] = lab.fit_transform(df['airline'])

In [15]:
#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['flight'] = lab.fit_transform(df['flight'])

In [16]:
#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['source_city'] = lab.fit_transform(df['source_city'])

In [17]:
#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['departure_time'] = lab.fit_transform(df['departure_time'])

In [18]:
#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['stops'] = lab.fit_transform(df['stops'])

In [19]:
#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['arrival_time'] = lab.fit_transform(df['arrival_time'])

In [20]:
#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['destination_city'] = lab.fit_transform(df['destination_city'])

In [21]:
#create instance of label encoder
lab = LabelEncoder()

#perform label encoding on 'team' column
df['class'] = lab.fit_transform(df['class'])

In [22]:
df.dtypes

Unnamed: 0            int64
airline               int32
flight                int32
source_city           int32
departure_time        int32
stops                 int32
arrival_time          int32
destination_city      int32
class                 int32
duration            float64
days_left             int64
price                 int64
dtype: object

In [24]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [25]:
from sklearn.tree import DecisionTreeRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [26]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE: ", mse)
print("R-squared: ", r2)

MSE:  6613899.686907307
R-squared:  0.9870650088823884
