In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training CSV from the notebook's input directory and preview the first rows.
data_path = '../input/flight-prices/Data_Train.csv'
train_data = pd.read_csv(data_path)
train_data.head()

In [None]:
# Count the missing (NaN) values in every column of train_data.
train_data.isna().sum()

In [None]:
# (rows, columns) of train_data as loaded.
train_data.shape

In [None]:
# Drop every row that has at least one missing value (mutates train_data in place).
train_data.dropna(inplace = True)

In [None]:
# Per-column missing-value counts again, to compare with the counts shown before the drop.
train_data.isna().sum()

In [None]:
# Show the dtype pandas assigned to each column of train_data.
train_data.dtypes

In [None]:
# Helper that re-types one column of the global train_data frame as datetime.
def change_into_datetime(col):
    """Parse ``train_data[col]`` with ``pd.to_datetime`` and write it back in place.

    Parameters
    ----------
    col : column label in the module-level ``train_data`` frame.

    Returns
    -------
    None. The global ``train_data`` frame is modified.
    """
    # NOTE(review): no format/dayfirst is given, so pandas infers the layout;
    # if the date text is day-first, confirm the parsed days/months are right.
    parsed = pd.to_datetime(train_data[col])
    train_data[col] = parsed

# The journey date plus the departure and arrival times are parsed into datetimes.
datetime_columns = ['Date_of_Journey', 'Dep_Time', 'Arrival_Time']
for column_name in datetime_columns:
    change_into_datetime(column_name)

train_data.head()

In [None]:
# Break the journey date into numeric day and month columns, then discard the
# original datetime column.
journey_date = train_data['Date_of_Journey']
train_data['Journey_day'] = journey_date.dt.day
train_data['Journey_month'] = journey_date.dt.month

train_data.drop(labels='Date_of_Journey', axis='columns', inplace=True)
train_data.head()

In [None]:
# Add "<col>_hour" holding the hour component of a datetime column.
def extract_hour(df, col):
    """Write the hour of ``df[col]`` into a new column named ``col + '_hour'``.

    ``df`` is modified in place; nothing is returned.
    """
    hour_values = df[col].dt.hour
    df[col + '_hour'] = hour_values

# Add "<col>_minute" holding the minute component of a datetime column.
def extract_minute(df, col):
    """Write the minute of ``df[col]`` into a new column named ``col + '_minute'``.

    ``df`` is modified in place; nothing is returned.
    """
    minute_values = df[col].dt.minute
    df[col + '_minute'] = minute_values

# Remove a column from a frame without creating a new frame.
def drop_column(df, col):
    """Drop column ``col`` from ``df`` in place; returns None."""
    df.drop(labels=col, axis='columns', inplace=True)
    
# Replace each time column with separate hour and minute columns.
for time_col in ('Dep_Time', 'Arrival_Time'):
    extract_hour(train_data, time_col)
    extract_minute(train_data, time_col)
    drop_column(train_data, time_col)

train_data.head()

In [None]:
# Pull the Duration strings out into a plain Python list.
duration = list(train_data['Duration'])

# Sanity check: a complete entry such as '2h 50m' splits into exactly two tokens.
x = '2h 50m'
print(len(x.split()))

# Bring every entry to the form '<x>h <y>m': an entry with only hours gets ' 0m'
# appended, an entry with only minutes gets '0h ' prepended.
normalised = []
for raw in duration:
    if len(raw.split(' ')) == 2:
        normalised.append(raw)
    elif 'h' in raw:
        normalised.append(raw + ' 0m')
    else:
        normalised.append('0h ' + raw)
duration = normalised

# Write the normalised strings back over the original column.
train_data['Duration'] = duration
train_data.tail()

In [None]:
print('2h 50m'.split(' ')[1][0:-1]) # demo: take the second token ('50m') and strip its trailing unit letter

def hour(x):
    """Return the hour digits of a '<x>h <y>m' string, as a string (e.g. '2h 50m' -> '2')."""
    tokens = x.split(' ')
    hours_token = tokens[0]
    # Strip the final character (the unit letter).
    return hours_token[:-1]

def minute(x):
    """Return the minute digits of a '<x>h <y>m' string, as a string (e.g. '2h 50m' -> '50')."""
    tokens = x.split(' ')
    minutes_token = tokens[1]
    # Strip the final character (the unit letter).
    return minutes_token[:-1]

# Derive the hour and minute parts as new columns (string values at this point).
duration_text = train_data['Duration']
train_data['Duration_hours'] = duration_text.apply(hour)
train_data['Duration_minutes'] = duration_text.apply(minute)

# The combined text column is no longer needed.
drop_column(train_data, 'Duration')
train_data.head()

In [None]:
# Cast the extracted duration parts to integers.
for duration_col in ('Duration_hours', 'Duration_minutes'):
    train_data[duration_col] = train_data[duration_col].astype(int)

# Partition the column labels: object-dtype columns are treated as categorical,
# everything else as numeric.
object_cols = []
numeric_cols = []
for col in train_data.columns:
    bucket = object_cols if train_data[col].dtype == 'object' else numeric_cols
    bucket.append(col)

print(train_data.dtypes)
print(object_cols)
print(numeric_cols)

In [None]:
# Frame holding only the object-dtype (categorical) columns.
# An explicit copy is taken because later cells add, overwrite and drop columns
# on `categorical`; writing into a bare slice of train_data triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is being modified.
categorical = train_data[object_cols].copy()

# Number of rows per airline.
print(train_data['Airline'].value_counts())

# Box plot of Price per Airline; rows are sorted by Price (highest first) before plotting.
plt.figure(figsize=(15, 15))
sns.boxplot(x='Airline', y='Price', data=train_data.sort_values('Price', ascending=False))

In [None]:
# Box plot of Price per Total_Stops category; rows are sorted by Price (highest first) first.
price_desc = train_data.sort_values('Price', ascending=False)
plt.figure(figsize=(15, 15))
sns.boxplot(x='Total_Stops', y='Price', data=price_desc)

In [None]:
# One-hot encode Airline; drop_first leaves out the first category's indicator column.
airline_values = categorical['Airline']
Airline = pd.get_dummies(airline_values, drop_first=True)
Airline.head()

In [None]:
# Number of rows per source location.
source_counts = categorical['Source'].value_counts()
print(source_counts)

# Box plot of Price per Source; rows are sorted by Price (highest first) first.
price_desc = train_data.sort_values('Price', ascending=False)
plt.figure(figsize=(15, 15))
sns.boxplot(x='Source', y='Price', data=price_desc)

In [None]:
# One-hot encode Source (first category dropped).
# Source and Destination both hold location names, so un-prefixed indicator
# columns from the two encodings can end up with identical labels once they are
# concatenated into one frame. The "Source" prefix keeps these labels distinct.
Source = pd.get_dummies(categorical['Source'], prefix='Source', drop_first=True)
Source.head()

In [None]:
# Number of rows per destination location.
print(categorical['Destination'].value_counts())

# Box plot of Price per Destination; rows are sorted by Price (highest first) first.
plt.figure(figsize=(15, 15))
sns.boxplot(x='Destination', y='Price', data=train_data.sort_values('Price', ascending=False))

# One-hot encode Destination (first category dropped).
# Source and Destination both hold location names, so un-prefixed indicator
# columns from the two encodings can end up with identical labels once they are
# concatenated into one frame. The "Destination" prefix keeps these labels distinct.
Destination = pd.get_dummies(categorical['Destination'], prefix='Destination', drop_first=True)
Destination.head()

In [None]:
# Split Route on '→' once and spread the first five pieces over Route_1..Route_5.
# Rows with fewer pieces get missing values in the higher-numbered columns.
route_parts = categorical['Route'].str.split('→')
for position in range(5):
    categorical['Route_' + str(position + 1)] = route_parts.str[position]

# The combined Route text is no longer needed.
drop_column(categorical, 'Route')
categorical.head()

In [None]:
# Missing-value count per categorical column.
print(categorical.isnull().sum())
print('\n')

# Fill the gaps in the later route legs with the literal label 'None'.
# The filled columns are assigned back explicitly: calling
# fillna(inplace=True) on `categorical[col]` operates on a temporary Series,
# which pandas warns about and which does not update the frame under Copy-on-Write.
route_gap_cols = ['Route_3', 'Route_4', 'Route_5']
categorical[route_gap_cols] = categorical[route_gap_cols].fillna('None')

print(categorical.columns)
print('\n')

# Number of distinct categories in each categorical column.
for i in categorical.columns:
    print('{} has total {} categories'.format(i, categorical[i].nunique()))

In [None]:
from sklearn.preprocessing import LabelEncoder # to label encode the route columns
# Integer-encode each route leg; the encoder is refitted on every column in turn.
encoder = LabelEncoder()
route_columns = ['Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5']
for route_col in route_columns:
    encoded_values = encoder.fit_transform(categorical[route_col])
    categorical[route_col] = encoded_values


# Show how Additional_Info is distributed, then remove the column.
additional_info_counts = categorical['Additional_Info'].value_counts()
print(additional_info_counts)
drop_column(categorical, 'Additional_Info')
categorical.head()

In [None]:
# Map each Total_Stops label to its number of stops.
# The mapping has a descriptive name rather than `dict`, which would shadow the
# builtin type for the rest of the notebook.
# Series.map turns any label that is not a key here into NaN, so this cell must
# run exactly once on the original text labels.
stops_to_count = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
categorical['Total_Stops'] = categorical['Total_Stops'].map(stops_to_count)
categorical.head()

In [None]:
# Assemble the modelling frame side by side: the encoded categorical columns,
# the three one-hot frames, and the numeric columns of train_data.
feature_frames = [categorical, Airline, Source, Destination, train_data[numeric_cols]]
data_train = pd.concat(feature_frames, axis=1)

# The original text columns are redundant next to their one-hot encodings.
for encoded_source in ('Airline', 'Source', 'Destination'):
    drop_column(data_train, encoded_source)

# Allow up to 35 columns in rendered output.
pd.set_option('display.max_columns', 35)
data_train.head()

In [None]:
def plot(df, col):
    """Draw a distribution plot and a box plot of ``df[col]``, stacked in one figure.

    Parameters
    ----------
    df : DataFrame containing the column.
    col : label of the numeric column to visualise.

    Returns
    -------
    None. The figure is created as a side effect.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the notebook still runs on the version it was written for.
    sns.distplot(df[col], ax=ax1)
    # Pass the values as `x` by keyword: a bare positional argument is read
    # differently by different seaborn versions (as `x` or as `data`).
    sns.boxplot(x=df[col], ax=ax2)

# Inspect the Price distribution to look for outliers.
plot(data_train, 'Price')

In [None]:
# Treat prices of 40000 or more as outliers and replace them with the median
# price; every other price is left unchanged.
price_median = data_train['Price'].median()
is_outlier = data_train['Price'] >= 40000
data_train['Price'] = np.where(is_outlier, price_median, data_train['Price'])

# Re-draw the distribution after the replacement.
plot(data_train, 'Price')



In [None]:
# Separate the independent features (X) from the dependent variable (Y).
X = data_train.drop('Price', axis=1)
Y = data_train['Price']

Y.head()
# Price is a continuous target (the models below are regressors), so the
# regression estimator is the right one: mutual_info_classif would treat every
# distinct price as its own class label.
# mutual_info_classif stays imported because a later cell of the notebook refers to it.
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# A fixed random_state makes the estimate repeatable between runs.
mutual_info_regression(X, Y, random_state=42)

In [None]:
# Price is continuous, so feature dependency is estimated with the regression
# variant of mutual information (the classification variant would treat each
# distinct price as a class). Imported here so this cell is self-contained.
from sklearn.feature_selection import mutual_info_regression

# One row per feature, holding its estimated mutual information with Price.
imp = pd.DataFrame(
    mutual_info_regression(X, Y, random_state=42),
    index=X.columns,
    columns=['Importance'],
)
imp.sort_values(by='Importance', ascending=False)
# Features at the top of this ranking share the most information with Price.
# NOTE: the models trained later in the notebook still use every column of X.

In [None]:
import pickle
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and every score computed from it, repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

def predict(ml_model, dump, model_path='C:/Users/abhin/OneDrive - BITS Pilani K K Birla Goa Campus/BITS Goa academic material/Coursera and Udemy documents/Udemy/All ML material/1..Flight_Price--_ Machine Learning/model.pkl'):
    """Fit a model on the training split, report its scores, and optionally pickle it.

    Reads the module-level ``X_train``, ``Y_train``, ``X_test`` and ``Y_test``.

    Parameters
    ----------
    ml_model : unfitted estimator exposing ``fit``, ``score`` and ``predict``.
    dump : when equal to 1, the fitted model is pickled to ``model_path``.
    model_path : destination file for the pickled model. The default is the
        location the notebook originally wrote to; pass another path when
        running on a different machine.

    Returns
    -------
    None. Scores are printed and a residual distribution plot is drawn.
    """
    model = ml_model.fit(X_train, Y_train)
    print('Training score: {}'.format(model.score(X_train, Y_train)))
    predictions = model.predict(X_test)
    print('Predictions are {}'.format(predictions))
    print('\n')
    r2score = metrics.r2_score(Y_test, predictions)
    print('R2 score is {}'.format(r2score))

    mae = metrics.mean_absolute_error(Y_test, predictions)
    mse = metrics.mean_squared_error(Y_test, predictions)
    print('MAE: ', mae)
    print('MSE: ', mse)
    # RMSE is the square root of the mean *squared* error, not of the MAE.
    print('RMSE: ', np.sqrt(mse))
    # Distribution of the residuals on the held-out split.
    sns.distplot(Y_test - predictions)

    if dump == 1:
        # The context manager closes the file even if pickling fails.
        with open(model_path, 'wb') as file:
            pickle.dump(model, file)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit and score a default RandomForestRegressor; dump=1 also pickles the fitted model.
predict(RandomForestRegressor(),1)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fit and score a default LinearRegression; dump=0 means the model is not pickled.
predict(LinearRegression(),0)


In [None]:
# Fit and score a default KNeighborsRegressor; dump=0 means the model is not pickled.
predict(KNeighborsRegressor(),0)


In [None]:
# Fit and score a default DecisionTreeRegressor; dump=0 means the model is not pickled.
predict(DecisionTreeRegressor(),0)

In [None]:
from sklearn.model_selection import RandomizedSearchCV #Hypertuning approach

# Base estimator whose hyper-parameters are searched.
reg_rf = RandomForestRegressor()

# Candidate values: 6 evenly spaced tree counts in [100, 1200] and 4 evenly
# spaced depths in [5, 30], truncated to integers.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=6)]
max_depth = [int(x) for x in np.linspace(start=5, stop=30, num=4)]

random_grid = {
    'n_estimators': n_estimators,
    # Number of features considered at each split. 'auto' was deprecated for
    # RandomForestRegressor in scikit-learn 1.1 and removed in 1.3; None selects
    # all features, which is what 'auto' meant for the regressor.
    'max_features': [None, 'sqrt'],
    # Maximum depth of each tree.
    'max_depth': max_depth,
    # Minimum number of samples required to split a node.
    'min_samples_split': [5, 10, 15, 100],
}

# Randomized search with 3-fold cross-validation, using all available cores.
rf_random = RandomizedSearchCV(estimator=reg_rf, param_distributions=random_grid, cv=3, verbose=2, n_jobs=-1)
rf_random.fit(X_train, Y_train)

In [None]:
# Report the hyper-parameter combination the search selected.
print(rf_random.best_params_)

# Predict on the held-out split with the fitted search object, plot the
# residual distribution, and print the R2 score.
preds = rf_random.predict(X_test)
residuals = Y_test - preds
plt.figure()
sns.distplot(residuals)
print(metrics.r2_score(Y_test, preds))