In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

**Importing dataset**


1. Since the data is stored as a CSV file, we use pandas `read_csv` to load it.
2. After loading, it is important to check for null values in every column and row.
3. If null values are present, the following can be done:
4. Fill the NaN values with the mean, median or mode using the `fillna()` method.
5. If only a few values are missing, we can drop them instead.

In [None]:
train_data=pd.read_csv('../input/flight-p/Data_Train.csv')
train_data.head()

In [None]:
train_data.info()

In [None]:
train_data.shape

# Dealing With Missing Values

In [None]:
train_data.isnull().sum()

In [None]:
train_data.dropna(inplace=True) #drop missing value in my data, pass inplace parameter to update my dataframe

In [None]:
train_data.isna().sum() #to cross chheck the missing values

# Data Cleaning to make our Data more ready for analysis as well as modeling purpose

In [None]:
train_data.dtypes #check data type of each and evey variable/each and every column available in dataframe

# From the dtypes above we can see that Date_of_Journey has the object data type.
 Therefore, we have to convert this column into a timestamp so that we can use it properly for prediction, because our
 model will not be able to understand these string values; it only understands timestamps.
For this we use pandas `to_datetime` to convert the object dtype to a datetime dtype.


dt.day method will extract only day of that date

dt.month method will extract only month of that date

In [None]:
def change_into_datetime(col):
    """Convert column `col` of the global `train_data` frame to datetime, in place.

    The parsed values overwrite the original column; nothing is returned.
    """
    # NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
    # layout -- confirm the day/month order is inferred correctly for these columns.
    parsed = pd.to_datetime(train_data[col])
    train_data[col] = parsed

In [None]:
train_data.columns

In [None]:
for i in ['Date_of_Journey','Dep_Time', 'Arrival_Time']:
    change_into_datetime(i)

In [None]:
train_data.dtypes

In [None]:
train_data['Journey_day']=train_data['Date_of_Journey'].dt.day #Access day from Date_of_Journey
train_data['journey_month']=train_data['Date_of_Journey'].dt.month #Access month from Date_of_Journey

In [None]:
train_data.head()

In [None]:
# Journey_day and journey_month now carry the information from Date_of_Journey
# as integers, so the original column is no longer needed and is dropped.

train_data.drop(columns='Date_of_Journey', inplace=True)  # columns= removes a column rather than rows

In [None]:
train_data.head()

In [None]:
def extract_hour(df,col):
    """Add a ``<col>_hour`` column to `df` with the hour of each value in `df[col]`.

    `df` is modified in place and nothing is returned. `df[col]` must support
    the ``.dt`` accessor (i.e. hold datetime-like values).
    """
    hours = df[col].dt.hour
    target = col + "_hour"
    df[target] = hours

In [None]:
def extract_min(df,col):
    """Add a ``<col>_minute`` column to `df` with the minute of each value in `df[col]`.

    `df` is modified in place and nothing is returned. `df[col]` must support
    the ``.dt`` accessor (i.e. hold datetime-like values).
    """
    minutes = df[col].dt.minute
    target = col + "_minute"
    df[target] = minutes

In [None]:
def drop_column(df,col):
    """Remove the column (or columns) named by `col` from `df` in place; returns None."""
    df.drop(columns=col, inplace=True)

In [None]:
# Departure time is when a plane leaves the gate. 
# Similar to Date_of_Journey we can extract values from Dep_Time
extract_hour(train_data,'Dep_Time')

In [None]:
# Extracting Minutes
extract_min(train_data,'Dep_Time')

In [None]:
# Now we can drop Dep_Time as it is of no use
drop_column(train_data,'Dep_Time')

In [None]:
train_data.head()

In [None]:
# Arrival time is when the plane pulls up to the gate.
# Similar to Date_of_Journey we can extract values from Arrival_Time

# Extracting Hours
extract_hour(train_data,'Arrival_Time')

# Extracting minutes
extract_min(train_data,'Arrival_Time')

# Now we can drop Arrival_Time as it is of no use
drop_column(train_data,'Arrival_Time')

In [None]:
train_data.head()

In [None]:
x='2h 50m'
x.split(' ') #o/p is in the form of list

In [None]:
x='2h 50m'
len(x.split(' '))

In [None]:
# Normalise every duration string to the two-token form '<H>h <M>m' so that
# hours and minutes can later be read by position.
duration = [
    entry if len(entry.split(' ')) == 2      # already has both an hour and a minute part
    else entry + ' 0m' if 'h' in entry       # hours only -> append zero minutes
    else '0h ' + entry                       # minutes only -> prepend zero hours
    for entry in train_data['Duration']
]

In [None]:
train_data['Duration']=duration
train_data.head()

In [None]:
'2h 50m'.split(' ')[0]

In [None]:
'2h 50m'.split(' ')[1]

In [None]:
'2h 50m'.split(' ')[1][0:-1]

In [None]:
def hour(x):
    """Return the hour count of a '<H>h <M>m' duration string, as a string without the unit letter."""
    hours_token, *_ = x.split(' ')
    return hours_token[:-1]

In [None]:
def min(x):
    """Return the minute count of a '<H>h <M>m' duration string, as a string without the unit letter.

    NOTE(review): this name shadows the built-in ``min``; it is kept because the
    cell that builds Duration_mins applies the function under this name.
    """
    minutes_token = x.split(' ')[1]
    return minutes_token[:-1]

In [None]:
train_data['Duration_hours']=train_data['Duration'].apply(hour)
train_data['Duration_mins']=train_data['Duration'].apply(min)

In [None]:
train_data.drop('Duration',axis=1,inplace=True)
train_data.head()

In [None]:
train_data.dtypes

In [None]:
train_data['Duration_hours']=train_data['Duration_hours'].astype(int)
train_data['Duration_mins']=train_data['Duration_mins'].astype(int)

In [None]:
train_data.dtypes

In [None]:
cat_col=[col for col in train_data.columns if train_data[col].dtype=='O'] #Categorical Data
cat_col

In [None]:
cont_col=[col for col in train_data.columns if train_data[col].dtype!='O'] #continuous feature
cont_col 

# Handling Categorical Data

**We are using 2 main Encoding Techniques to convert Categorical data into some numerical format**

**Nominal data --> data are not in any order --> OneHotEncoder is used in this case**

**Ordinal data --> data are in order --> LabelEncoder is used in this case**

In [None]:
# Work on an independent copy of the categorical columns: later cells add and
# overwrite columns on `categorical`, and doing that on a slice of train_data
# triggers SettingWithCopyWarning and leaves it unclear which frame is modified.
categorical=train_data[cat_col].copy()
categorical.head()

In [None]:
categorical['Airline'].value_counts() #count of each and every feature available in Airline

# Airline vs price analysis

In [None]:
plt.figure(figsize=(15,5))
sns.boxplot(x='Airline', y='Price', data=train_data.sort_values('Price',ascending=False))

# Conclusion --> From the graph we can see that Jet Airways Business has the highest price. Apart from this first airline, almost all the others have a similar median.

# Perform Total_Stops vs Price Analysis

In [None]:
train_data.head()

In [None]:
plt.figure(figsize=(15,5))
sns.boxplot(y='Price',x='Total_Stops',data=train_data.sort_values('Price',ascending=False))

In [None]:
len(categorical['Airline'].unique())

In [None]:
# As Airline is Nominal Categorical data we will perform OneHotEncoding
Airline=pd.get_dummies(categorical['Airline'], drop_first=True)
Airline.head()

In [None]:
categorical['Source'].value_counts()

In [None]:
# Source vs Price

# catplot is a figure-level function that creates its own figure, so the size is
# set through height/aspect (5 x 3 -> 15 wide, 5 high) instead of a preceding
# plt.figure() call, which would only leave an empty extra figure behind.
sns.catplot(y='Price',x='Source',data=train_data.sort_values('Price',ascending=False),kind='boxen',height=5,aspect=3)

In [None]:
# As Source is Nominal Categorical data we will perform OneHotEncoding


Source=pd.get_dummies(categorical['Source'], drop_first=True)
Source.head()

In [None]:
categorical['Destination'].value_counts()

In [None]:
# As Destination is Nominal Categorical data we will perform OneHotEncoding

Destination=pd.get_dummies(categorical['Destination'], drop_first=True)
Destination.head()

In [None]:
categorical['Route']

In [None]:
# Split each route once on the arrow separator, then store the stop at
# position k (0-based) in column Route_<k+1>. Routes with fewer stops get NaN
# in the positions they do not have.
route_stops = categorical['Route'].str.split('→')
for position in range(5):
    categorical['Route_{}'.format(position + 1)] = route_stops.str[position]

In [None]:
categorical.head()

In [None]:
import warnings 
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
categorical.head()

In [None]:
# Report how many distinct (non-missing) categories each categorical feature has
for feature in categorical.columns:
    n_categories = categorical[feature].nunique()
    print('{} has total {} categories \n'.format(feature, n_categories))

In [None]:
### As we can see, the Route columns have a lot of categories, so one-hot encoding is not a good option here; let's apply label encoding instead

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [None]:
categorical.columns

In [None]:
# Routes with fewer than five stops leave NaN in the later Route_* columns.
# Encode those as an explicit 'None' category: mixing NaN with strings makes
# LabelEncoder raise on older scikit-learn releases, and otherwise hides the
# "no stop at this position" case behind an implicit NaN class.
for i in ['Route_1', 'Route_2', 'Route_3', 'Route_4','Route_5']:
    categorical[i]=encoder.fit_transform(categorical[i].fillna('None'))

In [None]:
categorical.head()

In [None]:
# Additional_Info contains almost 80% no_info,so we can drop this column
# we can drop Route as well as we have pre-process that column
    
drop_column(categorical,'Route')
drop_column(categorical,'Additional_Info')

In [None]:
categorical.head()

In [None]:
categorical['Total_Stops'].value_counts()

In [None]:
categorical['Total_Stops'].unique()

In [None]:
# Total_Stops is ordinal, so each label is mapped by hand to its number of stops
# (a manual ordinal encoding, not sklearn's LabelEncoder).
# NOTE(review): the name `dict` shadows the built-in type for the rest of the
# notebook (any later `dict(...)` call would fail); it is kept because a later
# cell passes this object to .map() under this name.

dict={'non-stop':0, '1 stop':1, '2 stops':2, '3 stops':3, '4 stops':4}

In [None]:
categorical['Total_Stops']=categorical['Total_Stops'].map(dict)

In [None]:
categorical.head()

In [None]:
train_data[cont_col]

In [None]:
# Concatenate dataframe --> categorical + Airline + Source + Destination

data_train=pd.concat([categorical,Airline,Source,Destination,train_data[cont_col]],axis=1)
data_train.head()

In [None]:
drop_column(data_train,'Airline')
drop_column(data_train,'Source')
drop_column(data_train,'Destination')

In [None]:
data_train.head()

In [None]:
pd.set_option('display.max_columns',35)

In [None]:
data_train.head()

In [None]:
data_train.columns

# outlier detection

In [None]:
def plot(df,col):
    """Draw `df[col]` twice on a new two-row figure: its distribution on top, a box plot below.

    Parameters
    ----------
    df : DataFrame holding the column to inspect.
    col : name of the column in `df`.

    A new figure is created on every call via ``plt.subplots``; nothing is returned.
    """
    _, (ax_dist, ax_box) = plt.subplots(2,1)
    # NOTE(review): distplot is deprecated in recent seaborn releases; consider
    # histplot/displot once the minimum supported seaborn version is confirmed.
    sns.distplot(df[col],ax=ax_dist)
    # Pass the values by keyword: the meaning of the first positional argument of
    # boxplot differs between seaborn versions, which changes the box orientation.
    # x= keeps the box horizontal, aligned with the distribution's value axis.
    sns.boxplot(x=df[col],ax=ax_box)
    

In [None]:
# plot() opens its own figure through plt.subplots(), so a preceding
# plt.figure() call would only create an unused, empty figure.
plot(data_train,'Price')

# dealing with Outliers

In [None]:
# Treat prices of 40000 and above as outliers and replace them with the median price
price_median = data_train['Price'].median()
is_outlier = data_train['Price'] >= 40000
data_train['Price'] = np.where(is_outlier, price_median, data_train['Price'])

In [None]:
# plot() opens its own figure through plt.subplots(), so a preceding
# plt.figure() call would only create an unused, empty figure.
plot(data_train,'Price')

In [None]:
### separate your independent & dependent data

In [None]:
X=data_train.drop('Price',axis=1)
X.head()

In [None]:
y=data_train['Price']
y

In [None]:
##type(X)

In [None]:
##type(y)

##X.isnull().sum()

In [None]:
##y.isnull().sum()

In [None]:
#### as now we dont have any missing value in data, we can definitely go ahead with Feature Selection

# Feature Selection
    Finding the best features, i.e. the ones that contribute most and have a good relationship with the target variable.
    
# Why apply Feature Selection?
    To select the important features and avoid the curse of dimensionality, i.e. to get rid of redundant features.

In [None]:
###np.array(X)

In [None]:
##np.array(y)

# I want to compute mutual information scores to understand the relationship between each feature and the target.

# Feature Selection using Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
###mutual_info_classif(np.array(X),np.array(y))

In [None]:
# mutual_info_classif()

In [None]:
###mutual_info_classif(np.array(X),np.array(y))

In [None]:
X.dtypes

In [None]:
# Price is a continuous target, so the regression estimator of mutual
# information is used (the classification variant would treat every distinct
# price as its own class). The fixed seed makes the estimate repeatable.
mutual_info_regression(X,y,random_state=42)

In [None]:
# One mutual-information score per feature, indexed by feature name.
# Price is continuous, hence the regression estimator; seeded for repeatability.
imp=pd.DataFrame(mutual_info_regression(X,y,random_state=42),index=X.columns)
imp

In [None]:
imp.columns=['importance']
imp.sort_values(by='importance',ascending=False)

# Random Forest 
#### split dataset into train & test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split (and
# therefore every score reported below) reproducible across runs.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
from sklearn import metrics
##dump your model using pickle so that we will re-use
import pickle
def predict(ml_model,dump=0):
    """Fit `ml_model` on the training split, report test metrics and optionally pickle it.

    Uses the notebook-level X_train, y_train, X_test and y_test.

    Parameters
    ----------
    ml_model : estimator exposing fit / score / predict.
    dump : int, default 0
        When equal to 1, the fitted model is written to 'model.pkl' in the
        current working directory.

    Prints the training score, the test predictions, R2, MAE, MSE and RMSE,
    and draws the distribution of the test residuals. Returns None.
    """
    model=ml_model.fit(X_train,y_train)
    print('Training score : {}'.format(model.score(X_train,y_train)))
    y_prediction=model.predict(X_test)
    print('predictions are: \n {}'.format(y_prediction))
    print('\n')
    r2_score=metrics.r2_score(y_test,y_prediction)
    print('r2 score: {}'.format(r2_score))
    mse=metrics.mean_squared_error(y_test,y_prediction)  # computed once, reused for RMSE
    print('MAE:',metrics.mean_absolute_error(y_test,y_prediction))
    print('MSE:',mse)
    print('RMSE:',np.sqrt(mse))
    sns.distplot(y_test-y_prediction)  # residual distribution

    if dump==1:
        # The context manager closes the file, so the pickle is fully flushed
        # to disk and the handle is not leaked.
        with open('model.pkl','wb') as file:
            pickle.dump(model,file)

#### import randomforest class

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
#predict(RandomForestRegressor())  #execute this without dump

In [None]:
predict(RandomForestRegressor(),1)

#### play with multiple Algorithms

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
predict(DecisionTreeRegressor(),0)

In [None]:
predict(LinearRegression(),0)

In [None]:
predict(KNeighborsRegressor(),0)  #When dataset is huge don't use knn

#### Hyperparameter Tuning
    1. Choose one of the following methods for hyperparameter tuning
        a. RandomizedSearchCV --> fast way to tune the model's hyperparameters
        b. GridSearchCV --> slow way to tune the model's hyperparameters
    
    2. Assign the hyperparameters in the form of a dictionary
    3. Fit the model
    4. Check the best parameters and the best score

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start=100,stop=1200,num=6)]

# Number of features to consider at every split.
# 1.0 means "all features": it is the explicit equivalent of the former 'auto'
# setting for regressors, which recent scikit-learn releases no longer accept.
max_features=[1.0,'sqrt']

# Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(5,30,num=4)]

# Minimum number of samples required to split a node
min_samples_split=[5,10,15,100]

In [None]:
# Create the random grid: each hyperparameter name mapped to its candidate values.
# Built key by key with a literal, because the built-in `dict` name is shadowed
# earlier in this notebook.

random_grid = {}
random_grid['n_estimators'] = n_estimators
random_grid['max_features'] = max_features
random_grid['max_depth'] = max_depth
random_grid['min_samples_split'] = min_samples_split

In [None]:
random_grid

In [None]:
### initialise your estimator
reg_rf=RandomForestRegressor()

In [None]:
# Random search of parameters, using 3 fold cross validation.
# The fixed seed makes the sampled parameter combinations repeatable between runs.

rf_random=RandomizedSearchCV(estimator=reg_rf,param_distributions=random_grid,cv=3,verbose=2,n_jobs=-1,random_state=42)

In [None]:
rf_random.fit(X_train,y_train)

In [None]:
rf_random.best_params_

In [None]:
prediction=rf_random.predict(X_test)

In [None]:
sns.distplot(y_test-prediction)

In [None]:
metrics.r2_score(y_test,prediction)

In [None]:
print('MAE',metrics.mean_absolute_error(y_test,prediction))
print('MSE',metrics.mean_squared_error(y_test,prediction))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test,prediction)))

In [None]:
# !pip install pickle

In [None]:
import pickle

In [None]:
# open a file, where you want to store the data
file=open('rf_random.pkl','wb')

In [None]:
# dump information to that file, then close it so the pickle is completely
# flushed to disk before the file is opened again for reading
pickle.dump(rf_random,file)
file.close()

In [None]:
# Reload the pickled search object; the context manager closes the file
# handle as soon as loading has finished.
with open('rf_random.pkl','rb') as model:
    forest=pickle.load(model)

In [None]:
y_prediction=forest.predict(X_test)

In [None]:
y_prediction

In [None]:
metrics.r2_score(y_test,y_prediction)