In [125]:
# Base libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Sklearn models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier

# Sklearn base
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

In [126]:
airlines_df = pd.read_excel('Data_Train.xlsx')

In [346]:
prue = pd.read_excel('Data_Train.xlsx')

In [128]:
airlines_df.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [129]:
def change_into_datetime(df):
    """
        Function to convert variables to datetime
    """
    lst = ['Date_of_Journey', 'Dep_Time', 'Arrival_Time']
    for x in lst:
        df[x] = pd.to_datetime(df[x])

In [130]:
change_into_datetime(airlines_df)

In [131]:
airlines_df.dtypes

Airline                    object
Date_of_Journey    datetime64[ns]
Source                     object
Destination                object
Route                      object
Dep_Time           datetime64[ns]
Arrival_Time       datetime64[ns]
Duration                   object
Total_Stops                object
Additional_Info            object
Price                       int64
dtype: object

In [132]:
def extract_hour(df, cols):
    for x in cols:
        df[x+'_hour'] = df[x].dt.hour
        df[x+'_minute'] = df[x].dt.minute

In [133]:
extract_hour(airlines_df, ['Dep_Time', 'Arrival_Time'])

In [134]:
airlines_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Dep_Time_hour,Dep_Time_minute,Arrival_Time_hour,Arrival_Time_minute
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2021-07-27 22:20:00,2021-03-22 01:10:00,2h 50m,non-stop,No info,3897,22,20,1,10
1,Air India,2019-01-05,Kolkata,Banglore,CCU → IXR → BBI → BLR,2021-07-27 05:50:00,2021-07-27 13:15:00,7h 25m,2 stops,No info,7662,5,50,13,15
2,Jet Airways,2019-09-06,Delhi,Cochin,DEL → LKO → BOM → COK,2021-07-27 09:25:00,2021-06-10 04:25:00,19h,2 stops,No info,13882,9,25,4,25
3,IndiGo,2019-12-05,Kolkata,Banglore,CCU → NAG → BLR,2021-07-27 18:05:00,2021-07-27 23:30:00,5h 25m,1 stop,No info,6218,18,5,23,30
4,IndiGo,2019-01-03,Banglore,New Delhi,BLR → NAG → DEL,2021-07-27 16:50:00,2021-07-27 21:35:00,4h 45m,1 stop,No info,13302,16,50,21,35


In [135]:
x = ['2h 30m','30m', '1h']

In [178]:
def prueba(df, col):
    """
    This function performs these actions:
        First: create a list from the indicated dataframe and its column or columns
        
        Second: iterates over the elements of that list and performs a split if the 
                length is two passes but if it is not two add 0h if there is one m 
                or 0m if there is one h
        Third: replace column elements with new ones
        
        Quarter: create two new columns for the hours and for the minutes 
        
    Return: returns a dataframe
    """
    # First 
    lst = list(df[col])
    
    # Second
    for i in range(len(lst)):
        if len(lst[i].split(' ')) == 2:
            pass
        else:
            lst[i] = lst[i] + ' 0m' if 'h' in lst[i] else '0h ' + lst[i]
    
    # Third
    df[col] = lst
    
    # Quarter
    df[col  + '_hours'] = df[col].apply(lambda x: x.split(' ')[0][0:-1]).astype('int64')
    df[col  + '_mins'] = df[col].apply(lambda x: x.split(' ')[1][0:-1]).astype('int64')


In [185]:
prueba(prueba3, 'Duration')

In [186]:
prueba3.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
Duration_hours      int64
Duration_mins       int64
dtype: object

In [140]:
new = airlines_df['Duration'].apply(lambda x: x.split(' ')[1][0:-1])

In [141]:
airlines_df['Journey_day'] = airlines_df['Date_of_Journey'].dt.day
airlines_df['Journey_month'] = airlines_df['Date_of_Journey'].dt.month

In [143]:
airlines_df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Dep_Time_hour,Dep_Time_minute,Arrival_Time_hour,Arrival_Time_minute,Duration_hours,Duration_mins,Journey_day,Journey_month
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2021-07-27 22:20:00,2021-03-22 01:10:00,2h 50m,non-stop,No info,3897,22,20,1,10,2,50,24,3
1,Air India,2019-01-05,Kolkata,Banglore,CCU → IXR → BBI → BLR,2021-07-27 05:50:00,2021-07-27 13:15:00,7h 25m,2 stops,No info,7662,5,50,13,15,7,25,5,1
2,Jet Airways,2019-09-06,Delhi,Cochin,DEL → LKO → BOM → COK,2021-07-27 09:25:00,2021-06-10 04:25:00,19h 0m,2 stops,No info,13882,9,25,4,25,19,0,6,9
3,IndiGo,2019-12-05,Kolkata,Banglore,CCU → NAG → BLR,2021-07-27 18:05:00,2021-07-27 23:30:00,5h 25m,1 stop,No info,6218,18,5,23,30,5,25,5,12
4,IndiGo,2019-01-03,Banglore,New Delhi,BLR → NAG → DEL,2021-07-27 16:50:00,2021-07-27 21:35:00,4h 45m,1 stop,No info,13302,16,50,21,35,4,45,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,2019-09-04,Kolkata,Banglore,CCU → BLR,2021-07-27 19:55:00,2021-07-27 22:25:00,2h 30m,non-stop,No info,4107,19,55,22,25,2,30,4,9
10679,Air India,2019-04-27,Kolkata,Banglore,CCU → BLR,2021-07-27 20:45:00,2021-07-27 23:20:00,2h 35m,non-stop,No info,4145,20,45,23,20,2,35,27,4
10680,Jet Airways,2019-04-27,Banglore,Delhi,BLR → DEL,2021-07-27 08:20:00,2021-07-27 11:20:00,3h 0m,non-stop,No info,7229,8,20,11,20,3,0,27,4
10681,Vistara,2019-01-03,Banglore,New Delhi,BLR → DEL,2021-07-27 11:30:00,2021-07-27 14:10:00,2h 40m,non-stop,No info,12648,11,30,14,10,2,40,3,1


### Handle Categorical Data & Feature Encoding

In [144]:
categorical_features = [col for col in airlines_df.select_dtypes(exclude=np.number)]
numeric_features = [col for col in airlines_df.select_dtypes(exclude=object).columns]

In [145]:
numeric_features

['Date_of_Journey',
 'Dep_Time',
 'Arrival_Time',
 'Price',
 'Dep_Time_hour',
 'Dep_Time_minute',
 'Arrival_Time_hour',
 'Arrival_Time_minute',
 'Duration_hours',
 'Duration_mins',
 'Journey_day',
 'Journey_month']

In [146]:
categorical_features

['Airline',
 'Date_of_Journey',
 'Source',
 'Destination',
 'Route',
 'Dep_Time',
 'Arrival_Time',
 'Duration',
 'Total_Stops',
 'Additional_Info']

In [147]:
airlines_df.shape

(10683, 19)

In [None]:
## Nominal Data -- onehot
## Ordinal Data -- LabelEncoder

In [156]:
airlines_df['Additional_Info'].value_counts()

No info                         8345
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
2 Long layover                     1
1 Short layover                    1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [167]:
airlines_df['Additional_Info'] = 
airlines_df['Additional_Info'].apply(lambda x: x.replace('No Info', 'No info') if x == 'No Info' else x)

In [168]:
airlines_df['Additional_Info'].value_counts()

No info                         8348
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
2 Long layover                     1
1 Short layover                    1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [200]:
def additional_info_transform(df, col):
    """
    this function transforms the elements of the column that were repeated but were written differently
    
    """
    
    df[col] = df[col].apply(lambda x: x.replace('No Info', 'No info') if x == 'No Info' else x)
    

In [201]:
additional_info_transform(prueba2, 'Additional_Info')

In [202]:
prueba2['Additional_Info'].value_counts()

No info                         8348
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
2 Long layover                     1
1 Short layover                    1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [173]:
airlines_df = additional_info_transform(airlines_df, 'Additional_Info')

In [177]:
airlines_df['Additional_Info'].value_counts()

No info                         8348
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
2 Long layover                     1
1 Short layover                    1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [260]:
def route_transform(df, col):
    """
    
    """

    for x in range(5):
        df[col+f'_{x+1}'] = df[col].str.split('→').str[x]
        
        
    #df[col+'_1'] = df['Route'].str.split('→').str[0]
    #df[col+'_2'] = df['Route'].str.split('→').str[1]
    #df[col+'_3'] = df['Route'].str.split('→').str[2]
    #df[col+'_4'] = df['Route'].str.split('→').str[3]
    #df[col+'_5'] = df['Route'].str.split('→').str[4]
    
    #for x in df[col+'_3', col+'_4', col+'_5']:
     #   df[x].fillna('None')

In [259]:
for x in range(3):
    print(x)

0
1
2


In [261]:
route_transform(prueba2, 'Route')

In [262]:
prueba2

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Duration_hours,Duration_mins,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897,2,50,BLR,DEL,,,
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,7,25,CCU,IXR,BBI,BLR,
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h 0m,2,No info,13882,19,0,DEL,LKO,BOM,COK,
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,5,25,CCU,NAG,BLR,,
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,4,45,BLR,NAG,DEL,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,0,No info,4107,2,30,CCU,BLR,,,
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,0,No info,4145,2,35,CCU,BLR,,,
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h 0m,0,No info,7229,3,0,BLR,DEL,,,
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,0,No info,12648,2,40,BLR,DEL,,,


In [203]:
prueba2.isnull().sum()

Airline                0
Date_of_Journey        0
Source                 0
Destination            0
Route                  1
Dep_Time               0
Arrival_Time           0
Duration               0
Total_Stops            1
Additional_Info        0
Price                  0
Duration_hours         0
Duration_mins          0
Route_1                1
Route_2                1
Route_3             3492
Route_4             9117
Route_5            10637
dtype: int64

In [209]:
prueba2.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'Duration_hours', 'Duration_mins',
       'Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5'],
      dtype='object')

In [211]:
prueba2[['Route_3', 'Route_4', 'Route_5']].fillna('None')

Unnamed: 0,Route_3,Route_4,Route_5
0,,,
1,BBI,BLR,
2,BOM,COK,
3,BLR,,
4,DEL,,
...,...,...,...
10678,,,
10679,,,
10680,,,
10681,,,


In [215]:
for x in ['Route_3', 'Route_4', 'Route_5']:
    prueba2[x].fillna('None')

In [221]:
prueba2[['Route_3', 'Route_4', 'Route_5']] = prueba2[['Route_3', 'Route_4', 'Route_5']].fillna(value='None')

In [222]:
prueba2.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
Duration_hours     0
Duration_mins      0
Route_1            1
Route_2            1
Route_3            0
Route_4            0
Route_5            0
dtype: int64

In [236]:
def clean_df(df):
    return df.drop(columns=['Date_of_Journey','Dep_Time','Arrival_Time','Duration','Route']).dropna()

In [237]:
clean_df(prueba2)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Duration_hours,Duration_mins,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,Banglore,New Delhi,non-stop,No info,3897,2,50,BLR,DEL,,,
1,Air India,Kolkata,Banglore,2 stops,No info,7662,7,25,CCU,IXR,BBI,BLR,
2,Jet Airways,Delhi,Cochin,2 stops,No info,13882,19,0,DEL,LKO,BOM,COK,
3,IndiGo,Kolkata,Banglore,1 stop,No info,6218,5,25,CCU,NAG,BLR,,
4,IndiGo,Banglore,New Delhi,1 stop,No info,13302,4,45,BLR,NAG,DEL,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,non-stop,No info,4107,2,30,CCU,BLR,,,
10679,Air India,Kolkata,Banglore,non-stop,No info,4145,2,35,CCU,BLR,,,
10680,Jet Airways,Banglore,Delhi,non-stop,No info,7229,3,0,BLR,DEL,,,
10681,Vistara,Banglore,New Delhi,non-stop,No info,12648,2,40,BLR,DEL,,,


In [238]:
prueba2.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Duration_hours,Duration_mins,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,2,50,BLR,DEL,,,
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,7,25,CCU,IXR,BBI,BLR,
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h 0m,2 stops,No info,13882,19,0,DEL,LKO,BOM,COK,
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,5,25,CCU,NAG,BLR,,
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,4,45,BLR,NAG,DEL,,


In [242]:
prueba2['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [244]:
def total_stops_transform(df, col):
    dic = {'non-stop':0, '2 stops':2, '1 stop':1, '3 stops':3, '4 stops':4}
    df[col] = df[col].map(dic)

In [246]:
total_stops_transform(prueba2, 'Total_Stops')

In [247]:
prueba2

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Duration_hours,Duration_mins,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897,2,50,BLR,DEL,,,
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,7,25,CCU,IXR,BBI,BLR,
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h 0m,2,No info,13882,19,0,DEL,LKO,BOM,COK,
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,5,25,CCU,NAG,BLR,,
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,4,45,BLR,NAG,DEL,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,0,No info,4107,2,30,CCU,BLR,,,
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,0,No info,4145,2,35,CCU,BLR,,,
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h 0m,0,No info,7229,3,0,BLR,DEL,,,
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,0,No info,12648,2,40,BLR,DEL,,,


In [249]:
from sklearn.preprocessing import LabelEncoder

In [252]:
def label_encoder(df):
    encoder = LabelEncoder()
    for x in ['Route_1','Route_2','Route_3','Route_4','Route_5']:
        df[x] = encoder.fit_transform(df[x])

In [253]:
label_encoder(prueba2)

In [254]:
prueba2

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Duration_hours,Duration_mins,Route_1,Route_2,Route_3,Route_4,Route_5
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897,2,50,0,13,29,13,5
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,7,25,2,25,1,3,5
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h 0m,2,No info,13882,19,0,3,32,4,5,5
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,5,25,2,34,3,13,5
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,4,45,0,34,8,13,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,0,No info,4107,2,30,2,5,29,13,5
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,0,No info,4145,2,35,2,5,29,13,5
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h 0m,0,No info,7229,3,0,0,13,29,13,5
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,0,No info,12648,2,40,0,13,29,13,5


In [263]:
from sklearn.preprocessing import OneHotEncoder

In [285]:
def one_hot(df):
    enc = OneHotEncoder(handle_unknown='ignore')# passing bridge-types-cat column (label encoded values of bridge_types)
    enc_df = pd.DataFrame(enc.fit_transform(prueba2[['Airline','Source','Destination']]).toarray())# merge with main df bridge_df on key values
    prueba2 = prueba2.join(enc_df)

In [269]:
sub = prueba2[['Airline','Source','Destination']]

In [270]:
sub.head(3)

Unnamed: 0,Airline,Source,Destination
0,IndiGo,Banglore,New Delhi
1,Air India,Kolkata,Banglore
2,Jet Airways,Delhi,Cochin


In [276]:
enc = OneHotEncoder(sparse=False)

In [277]:
enc_df = pd.DataFrame(enc.fit_transform(sub[['Airline','Source','Destination']]).toarray())

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [279]:
enc = OneHotEncoder(handle_unknown='ignore')# passing bridge-types-cat column (label encoded values of bridge_types)
enc_df = pd.DataFrame(enc.fit_transform(prueba2[['Airline','Source','Destination']]).toarray())# merge with main df bridge_df on key values
prueba2 = prueba2.join(enc_df)
prueba2

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,...,13,14,15,16,17,18,19,20,21,22
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h 0m,2,No info,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,0,No info,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,0,No info,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h 0m,0,No info,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,0,No info,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [292]:
def one_hot(df, cols):
    enc = OneHotEncoder(handle_unknown='ignore')# passing bridge-types-cat column (label encoded values of bridge_types)
    enc_df = pd.DataFrame(enc.fit_transform(df[cols]).toarray())# merge with main df bridge_df on key values
    df = df.join(enc_df)

In [295]:
pruebass = one_hot(df=prueba3, cols=['Airline','Source','Destination'])

In [296]:
pruebass

In [299]:
dum_df = pd.get_dummies(prueba3, columns=['Airline','Source','Destination'], prefix=["Type_is"] )# merge with main df bridge_df on key values
prueba4 = prueba4.join(dum_df)
prueba4

ValueError: Length of 'prefix' (1) did not match the length of the columns being encoded (3).

In [300]:
dum_df = pd.get_dummies(bridge_df, columns=["Bridge_Types"], prefix=["Type_is"] )# merge with main df bridge_df on key values
bridge_df = bridge_df.join(dum_df)
bridge_df

NameError: name 'bridge_df' is not defined

In [315]:
def one_hot(df, cols):
    for x in cols:
        dumm = pd.get_dummies(df[x], drop_first=True)
        df = df.join(dumm)

In [316]:
one_hot(prueba1, cols=['Airline','Source','Destination'])

ValueError: columns overlap but no suffix specified: Index(['Delhi', 'Kolkata'], dtype='object')

In [314]:
prueba1

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [321]:
def one_hot(df):
    dummie_airline = pd.get_dummies(df['Airline'], drop_first=True)
    dummie_source = pd.get_dummies(df['Source'], drop_first=True)
    dummie_destination = pd.get_dummies(df['Destination'], drop_first=True)
    
    df = df.join(dummie_airline)

In [317]:
prueba3 = prueba3.join(dummie_airline)

In [322]:
one_hot(prueba1)

In [323]:
prueba1

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [325]:
pru.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [None]:
class clean():
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        X_ = X.copy()
        
        
        
    def transform(self, X):
        

In [326]:
import numpy as np
from sklearn.impute import SimpleImputer

In [328]:

imp=SimpleImputer(missing_values='None', strategy=None)

idf=pd.DataFrame(imp.fit_transform(pru))

idf.columns=pru.columns

idf.index=pru.index

ValueError: Can only use these strategies: ['mean', 'median', 'most_frequent', 'constant']  got strategy=None

In [334]:
class Clean_df():
        
    def __init__(self):
        self.column_means = np.nan
        pass
    
    def fit(self, X, y=None):
        X_ = X.copy()
        for column in X_.select_dtypes(exclude=object):
            self.column_means[column] = X_[column].isna()
            
        return self
        
    
    def transform(self, X):
        X_ = X.copy()
        for column in X_.select_dtypes(exclude=np.number):
            X_[column].fillna(self.column_means[column])
            
        return X_

In [335]:
Clean_df().fit(X_train, y_train).transform(X_test)

TypeError: string indices must be integers

In [331]:
data_wrangling().fit(X_train, y_train).transform(X_test)

NameError: name 'data_wrangling' is not defined

In [332]:
X, y = pru.drop(columns="Price"), pru['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [338]:
#pru.fillna('None')

In [339]:
class clean():
    
    def __init__(self):
        self.fill = 'None'
    
    def fit(self, X, y=None):
        pass
    
    def transform(self, X):
        X_ = X.copy()
        
        for column in X_.select_dtypes(exclude=np.number):
            X_[column].fillna(self.fill[column])
            
        return X_

In [340]:
clean().fit(X_train, y_train).transform(X_test)

AttributeError: 'NoneType' object has no attribute 'transform'

In [344]:
pd.set_option('display.max_columns', 35)

In [345]:
prueba2.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Duration_hours,Duration_mins,Route_1,Route_2,Route_3,Route_4,...,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897,2,50,BLR,DEL,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,7,25,CCU,IXR,BBI,BLR,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h 0m,2,No info,13882,19,0,DEL,LKO,BOM,COK,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,5,25,CCU,NAG,BLR,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,4,45,BLR,NAG,DEL,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [347]:
prue.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [349]:
len(prue.columns)

11