In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv as csv 
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import datetime
from matplotlib.ticker import FuncFormatter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found in it
for folder, _subfolders, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing all data and combining it

In [None]:
#Defining a helper function to import, reformat the excel file and return a dataframe
def import_df(path, column, new_col_name, formatting=0, remove_unnamed=True):
    """Read one workbook and return a single metric as a one-column frame indexed by Year.

    Parameters
    ----------
    path : str
        Workbook location relative to the "Original MIT data" input folder.
    column : int
        Position of the wanted sheet row, counted after the first four sheet rows
        have been discarded (the sheet is transposed, so it becomes a column).
    new_col_name : str
        Name given to the single returned column.
    formatting : int, default 0
        Power of ten the values are multiplied by (e.g. 9 -> x 1e9, -2 -> x 0.01).
    remove_unnamed : bool, default True
        Drop the "Unnamed: 1" column before reshaping; pass False for workbooks
        that do not contain that column.

    Returns
    -------
    pandas.DataFrame
        One column called ``new_col_name`` with an integer ``Year`` index 1995..2018.

    Raises
    ------
    ValueError
        If the sheet does not yield exactly 24 values (one per year 1995..2018).
    """
    df = pd.read_excel("/kaggle/input/airline-data-project-mit-1995-2019/Original MIT data/Original MIT data/" + path)
    if remove_unnamed:
        df = df.drop(["Unnamed: 1"], axis=1)  # removing empty column

    # Discard the first four sheet rows, then flip so each remaining sheet row is a column
    df = df[4:].transpose()

    # Skip the first transposed row (it came from the first sheet column) and keep one metric
    df = pd.DataFrame(df.iloc[1:, column])
    # Name the column directly instead of renaming via the `column + 4` label left by the slice
    df.columns = [new_col_name]

    df["Year"] = np.arange(1995, 2019, 1)  # one label per remaining row
    df = df.reset_index(drop=True).set_index("Year")

    return df * (10 ** formatting)

In [None]:
#each dataframe will form a single column for the final dataframe

## Revenues

#Importing and creating a dataframe of all revenue related metric totals (Domestic + International + Others)
# The last argument of import_df is the power of ten the sheet values are multiplied by.
#total system revenue
total_revenue = import_df(path = 'Total revenue/System Total Operating Revenue.xls',column = 25, new_col_name = 'Total Operating Revenue ($)',formatting = 9)
#revenue per asm
rev_per_asm = import_df('Total revenue/System Total Revenue (Ex- Transport Related) per ASM.xls', 23, 'System Revenue Per ASM (exc-transport) ($ per ASM)', -2)
#System Passenger revenue
sys_pass_rev = import_df('Total revenue/System Passenger Revenue.xls', 25, "System Passenger Revenue ($)", 9 )
#PRASM
# Scaled by 10**-2 like the other per-ASM / per-RPM metrics in this notebook; the previous
# 10**9 matched the revenue totals instead.
# NOTE(review): assumes the sheet reports PRASM in cents - confirm against the workbook.
PRASM = import_df('Total revenue/System Passenger Revenue per Available Seat Mile (PRASM).xls', 23, 'System Passenger Revenue per ASM ($)', -2)
#System Passenger yield, revenue per mile
pass_yield = import_df('Total revenue/System Passenger Yield.xls', 23, 'System Passenger Yield ($ per RPM)', -2)
#System Total transport related revenue
trans_rev = import_df('Total revenue/System Total Transport Related Revenue.xls', 25, 'System Total Transport Related Revenue ($)', 9)
#Ancillary fees revenue
ancillary = import_df('Total revenue/Ancillary Fee Revenue.xls', 23, 'Ancillary Fees ($)', 3)
#all Revenues table: one column per metric, cast from object to float
revenue=pd.concat([total_revenue,sys_pass_rev,trans_rev,PRASM,rev_per_asm,pass_yield,ancillary], axis=1)
revenue=revenue.astype('float64')


# Employee compensation

# Head counts (employee equivalents) and average pay per employee group.
# No scaling exponent is passed, so import_df multiplies the sheet values by 10**0.

#full time employee equivalents
FT_equiv = import_df(path='Total employee compensation/Total Full-time Employee Equivalents.xls', column=23, new_col_name='Full Time Employees Equivalents')
#full time non cockpit employee equivalents
FT_NC_equiv = import_df(path='Total employee compensation/Total Full-time Employee Non Cockpit Equivalents.xls', column=23, new_col_name='Full Time Non Cockpit Employees Equivalents')
#average wage, all employees
all_wages_avg = import_df(path='Total employee compensation/Average Annual Wages and Salaries - All Employees.xls', column=24, new_col_name='All Employees Avg Wage ($)')
#average wage, non cockpit employees
NC_wages_avg = import_df(path='Total employee compensation/Average Annual Wages and Salaries - All Non-Cockpit Employees.xls', column=24, new_col_name='Non Cockpit Employees Avg Wage ($)')
#average benefits, all employees
all_benefits = import_df(path='Total employee compensation/Average Pension and Benefit Package - ALL EMPLOYEES.xls', column=24, new_col_name='All Employees average benefits and pensions ($)')
#average benefits, non cockpit employees
NC_benefits = import_df(path='Total employee compensation/Average Pension and Benefit Package - ALL NON-COCKPIT EMPLOYEES.xls', column=24, new_col_name='Non-Cockpit Employees average benefits and pensions ($)')
#pilot and co-pilot equivalents
pilot_equiv = import_df(path='Total employee compensation/Total Pilot and Copilot Employee Equivalents.xls', column=23, new_col_name='Pilot and Co-Pilot equivalents')
#pilot and co-pilot wages
pilot_wage = import_df(path='Total employee compensation/Average Annual Wages and Salaries - PILOT AND CO-PILOT PERSONNEL.xls', column=24, new_col_name='Average Pilot and Co-Pilot wages ($)')
#flight attendant equivalents
FA_equiv = import_df(path='Total employee compensation/Total Flight Attendant Employee Equivalents.xls', column=23, new_col_name='Flight Attendant Employee Equivalents')
#flight attendant wages
FA_wage = import_df(path='Total employee compensation/Average Annual Wages and Salaries - FLIGHT ATTENDANT PERSONNEL.xls', column=24, new_col_name='Average Flight Attendant Employee Wage ($)')
#ground staff equivalents
ground_equiv = import_df(path='Total employee compensation/Total In-House Passenger, Cargo and Aircraft Handling Employee Equivalents.xls', column=23, new_col_name='Ground Staff Employee Equivalents')
#ground staff wages
avg_ground_wage = import_df(path='Total employee compensation/Average Annual Wages and Salaries - INHOUSE PASSENGER, CARGO AND AIRCRAFT HANDLING PERSONNEL.xls', column=24, new_col_name='Average Ground Staff Employee Wage ($)')
#maintenance staff equivalents
maint_equiv = import_df(path='Total employee compensation/Total In-House Maintenance Employee Equivalents.xls', column=23, new_col_name='Maintenance Staff Equivalents')
#maintenance staff wages
maint_wage = import_df(path='Total employee compensation/Average Annual Wages and Salaries - INHOUSE MAINTENANCE PERSONNEL.xls', column=24, new_col_name='Average Maintenance Staff Wage ($)')
#management and other equivalents
mgmnt_equiv = import_df(path='Total employee compensation/Total Management and Other Employee Equivalents.xls', column=23, new_col_name='Management and Others Equivalents')
#management and other wages
mgmnt_wage = import_df(path='Total employee compensation/Average Annual Wages and Salaries - GENERAL MANAGEMENT AND OTHER PERSONNEL.xls', column=24, new_col_name='Average Management and Others Wage ($)')

#all compensations table: one column per metric, cast to float
compensation = pd.concat(
    [
        all_wages_avg, NC_wages_avg,
        FT_equiv, FT_NC_equiv,
        all_benefits, NC_benefits,
        pilot_equiv, pilot_wage,
        FA_equiv, FA_wage,
        ground_equiv, avg_ground_wage,
        maint_equiv, maint_wage,
        mgmnt_equiv, mgmnt_wage,
    ],
    axis=1,
).astype('float64')

# Productivity

# Output-per-input ratios; none of them is rescaled (import_df default exponent 0).

#ASM per $ employee compensation
asm_per_comp = import_df(path='Total employees and productivity/Total ASMs Produced per Dollar of Employee Compensation.xls', column=23, new_col_name='ASM Prodcued per Dollar Employee Compensation')
#ASM per employee equivalent
asm_per_equiv = import_df(path='Total employees and productivity/Total ASMs Produced per Employee Equivalent.xls', column=23, new_col_name='ASM Prodcued per Employee Equivalent')
#ASM per dollar of pilot compensation
asm_per_pilot_comp = import_df(path='Total employees and productivity/Total ASMs Produced per Dollar of Total Pilot Compensation.xls', column=24, new_col_name='ASM Prodcued per Dollar Pilot Compensation')
#flight attendant average block hours per month
FA_avg_hours = import_df(path='Total employees and productivity/Total Flight Attendant Average Block Hours per Month.xls', column=24, new_col_name='Total Flight Attendant Block Hours per Month')
#share of maintenance expense that is outsourced
maint_out = import_df(path='Total employees and productivity/Percent of Maintenance Expenses Outsourced.xls', column=24, new_col_name='Percentage of maintenance expense outsourced')
#handling employees per aircraft
handling_per_ac = import_df(path='Total employees and productivity/Passenger, Cargo and Aircraft Handling Employees per Aircraft.xls', column=24, new_col_name='Handling Employees per Aircraft')

#all productivity table: one column per metric, cast to float
productivity = pd.concat(
    [asm_per_equiv, asm_per_comp, asm_per_pilot_comp, FA_avg_hours, maint_out, handling_per_ac],
    axis=1,
).astype('float64')

# Expenses

# The last argument of import_df is the power of ten the sheet values are multiplied by.

#total operating expenses
total_op_expense = import_df('Total Expenses related/System Total Operating Expenses.xls', 25, 'Total Operating Expenses ($)', 9)
#labour expenses
labour_exp = import_df('Total Expenses related/System Total Labor and Related Expenses.xls', 23, 'Total Labour Operating Expenses ($)', 9)
#labour expenses per ASM (LCASM)
# Scaled by 10**-2 like the CASM metrics below; the previous 10**9 matched the labour
# expense total on the line above.
# NOTE(review): assumes the sheet reports LCASM in cents - confirm against the workbook.
LCASM = import_df('Total Expenses related/System Total Labor and Related Expense per Available Seat Mile (LCASM).xls', 23, ' Total Labour Operating Expenses per ASM($)', -2)
#Total Fuel expense
fuel_exp = import_df('Total Expenses related/Total Fuel Expense.xls', 23, 'Total Fuel Expenses ($)', 6)
#Total gallons of fuel
fuel_gallons = import_df('Total Expenses related/Total Gallons of Fuel.xls', 23, 'Total Fuel in Gallons', 6)
#Price per gallon fuel
fuel_price = import_df('Total Expenses related/Total Price per Gallon of Fuel.xls', 23, 'Fuel price per Gallon ($)')
#fuel expenses per ASM
# NOTE(review): left unscaled, unlike the CASM metrics - confirm the intended unit
fuel_asm = import_df('Total Expenses related/Fuel Expense per ASM.xls', 23, 'Total Fuel Expense per ASM')
#fuel expense per passenger
fuel_per_pass = import_df('Total Expenses related/Fuel Expense per Enplaned Passenger.xls', 23, 'Fuel Expense per Enplaned Passenger ($)')
#CASM ex transport
casm_ex_trans = import_df('Total Expenses related/System Total Expense per Available Seat Mile (CASM ex Transport Related).xls', 23, 'CASM excluding Transport ($ per ASM)', -2)
#CASM ex transport, fuel
casm_ex_trans_fuel = import_df('Total Expenses related/System Total Expense per Available Seat Mile (CASM ex fuel and Transport Related).xls', 23, 'CASM excluding Transport and Fuel ($ per ASM)', -2)
#CASM ex transport, fuel, labour
casm_ex_trans_fuel_lab = import_df('Total Expenses related/System Non-Labor Expense per Available Seat Mile (CASM ex fuel, Transport Related and Labor).xls', 23, 'CASM excluding Transport Fuel and labour ($ per ASM)', -2)
#transport related
transport_exp = import_df('Total Expenses related/Transport Related Expenses.xls', 23, 'Transport Related Expenses ($)', 6)
#other outsourcing
outsource_exp = import_df('Total Expenses related/Total Other Outsourcing Expense.xls', 23, 'Other Outsourcing Expenses ($)', 6)
#management and other
mgmnt_exp = import_df('Total Expenses related/System Total Management and Other.xls', 23, 'Management and Other Expenses ($)', 9)
#mgmnt and other per ASM
# NOTE(review): left unscaled, unlike the CASM metrics - confirm the intended unit
mgmnt_asm = import_df('Total Expenses related/System Management and Other Expense per Available Seat Mile.xls', 23, 'Management and Other Expenses per ASM')
#flight equipment maintenance
equip_maint_exp = import_df('Total Expenses related/Total Flight Equipment Maintenance Expense.xls', 23, 'Flight Equipment Maintenance Expense ($)', 6)
#outsource flight equipment maintenance
outs_equip_maint = import_df('Total Expenses related/Total Outsourced Flight Equipment Maintenance Expense.xls', 23, 'Outsourced Flight Equipment Maintenance Expense ($)', 6)
#all expenses table: one column per metric, cast from object to float
expenses=pd.concat([total_op_expense,labour_exp,LCASM,fuel_exp,fuel_asm,fuel_per_pass,fuel_gallons,fuel_price,transport_exp,outsource_exp,mgmnt_exp,mgmnt_asm,equip_maint_exp,outs_equip_maint, casm_ex_trans,casm_ex_trans_fuel,casm_ex_trans_fuel_lab], axis=1)
expenses=expenses.astype('float64')

# Fleet

# Fleet size and utilisation metrics; none of them is rescaled (import_df default exponent 0).

#total operating fleet
op_fleet = import_df(path='Total fleet/Total Operating Fleet.xls', column=23, new_col_name='Total operating fleet numbers')
#block hours, all aircraft
ac_block_hours = import_df(path='Total fleet/Total Aircraft Block Hours - ALL AIRCRAFT.xls', column=24, new_col_name='Total Aircraft Block Hours')
#airborne hours, all aircraft
ac_airborne_hours = import_df(path='Total fleet/Total Aircraft Airborne Hours - ALL AIRCRAFT.xls', column=24, new_col_name='Total Aircraft Airborne Hours')
#gallons of fuel per block hour
gallons_per_block_hour = import_df(path='Total fleet/Gallons of Fuel per Block Hour - ALL AIRCRAFT.xls', column=24, new_col_name='Gallons Fuel per Block Hour')
#average stage length flown
avg_stage_length = import_df(path='Total fleet/Average Stage Length Flown of Total Operating Fleet.xls', column=23, new_col_name='Average Stage Length Flown')
#average seat capacity
avg_seat_capacity = import_df(path='Total fleet/Average Seat Capacity of Total Operating Fleet.xls', column=23, new_col_name='Average Seat Capacity per Aircraft')
#departures per aircraft day (this workbook is read without dropping "Unnamed: 1")
dep_per_ac = import_df(path='Total fleet/Departure per Aircraft Day - ALL AIRCRAFT.xls', column=21, new_col_name='Departure per Aircraft Day', remove_unnamed=False)

#all total fleet data: one column per metric, cast to float
fleet = pd.concat(
    [op_fleet, ac_block_hours, ac_airborne_hours, gallons_per_block_hour, avg_stage_length, avg_seat_capacity, dep_per_ac],
    axis=1,
).astype('float64')

# Traffic and Capacity

# System-wide traffic metrics; `formatting` is the power of ten applied to the sheet values.

#total asm
total_asm = import_df(path='Total traffic and capacity by op region/Total System Available Seat Miles.xls', column=25, new_col_name='Total Available Seat Miles (ASM)', formatting=6)
#total rpm
total_rpm = import_df(path='Total traffic and capacity by op region/Total System Revenue Passenger Miles.xls', column=25, new_col_name='Total Revenue Passenger Mile (RPM)', formatting=6)
#total enplaned passengers
total_pass = import_df(path='Total traffic and capacity by op region/System Total Enplaned Passengers.xls', column=23, new_col_name='Total Enplaned Passengers', formatting=3)
#total load factor
total_load = import_df(path='Total traffic and capacity by op region/Total System Load Factor.xls', column=23, new_col_name='Total Load Factor (%)', formatting=2)
#total passenger yield
total_yield = import_df(path='Total traffic and capacity by op region/Total System Passenger Yield.xls', column=23, new_col_name='Total Passenger Yield ($ per RPM)', formatting=-2)
#total passenger revenue per asm
total_rev_per_asm = import_df(path='Total traffic and capacity by op region/Total System Passenger Revenue per ASM.xls', column=23, new_col_name='Total Passenger RPM per ASM ($ per ASM)', formatting=-2)
#departed seats (this workbook is read without dropping "Unnamed: 1")
dep_seats = import_df(path='Total traffic and capacity by op region/Departed Seats.xls', column=21, new_col_name='Total Departed Seats', formatting=3, remove_unnamed=False)

#all system traffic data: one column per metric, cast to float
total_traffic = pd.concat(
    [total_asm, total_rpm, total_pass, total_yield, total_rev_per_asm, total_load, dep_seats],
    axis=1,
).astype('float64')


# Domestic
# The last argument of import_df is the power of ten the sheet values are multiplied by.
#asm
# Scaled by 10**6 to match 'International ASM' and 'Total Available Seat Miles (ASM)';
# the exponent was missing here, leaving domestic ASM on a different scale from the others.
dom_asm = import_df('Total traffic and capacity by op region/Domestic Available Seat Miles .xls', 25, 'Domestic ASM', 6)
#asm % total asm
dom_asm_perc = import_df('Total traffic and capacity by op region/Domestic Available Seat Miles as  a Percent of Total System Available Seat Miles.xls', 23, 'Domestic ASM as % of total ASM', 2)
#revenue % total revenue
dom_rev_perc = import_df('Total traffic and capacity by op region/Domestic Passenger Revenue as a Percent of Total System Revenue.xls', 23, 'Domestic Revenue as % of total Revenue', 2)
#revenue per asm
dom_rev_per_asm = import_df('Total traffic and capacity by op region/Domestic Passenger Revenue per ASM.xls', 23, 'Domestic Revenue ($) per ASM', -2)
#passenger yield
dom_pass_yield = import_df('Total traffic and capacity by op region/Domestic Passenger Yield.xls', 23, 'Domestic Passenger Yield ($ per RPM)', -2)
#RPM
dom_rpm = import_df('Total traffic and capacity by op region/Domestic Revenue Passenger Miles.xls', 25, 'Domestic RPM', 6)
#rpm % total rpm
dom_rpm_perc = import_df('Total traffic and capacity by op region/Domestic Revenue Passenger Miles as a Percent of Total System Revenue Passenger Miles.xls', 23, 'Domestic RPM as % of total RPM', 2)
#revenue
dom_rev = import_df('Total traffic and capacity by op region/Passenger Revenue -- Domestic Operations.xls', 25, 'Domestic Passenger Revenue ($)', 6)
#load factor
dom_load_factor = import_df('Total traffic and capacity by op region/Domestic Load Factor.xls', 23, 'Domestic Load Factor (%)', 2)
#all domestic data: one column per metric, cast from object to float
dom_traffic=pd.concat([dom_asm,dom_asm_perc,dom_rev_perc,dom_rev_per_asm,dom_pass_yield, dom_rpm,dom_rpm_perc,dom_rev,dom_load_factor], axis=1)
dom_traffic=dom_traffic.astype('float64')

#International

# International traffic metrics; `formatting` is the power of ten applied to the sheet values.

#asm
inter_asm = import_df(path='Total traffic and capacity by op region/International Available Seat Miles .xls', column=25, new_col_name='International ASM', formatting=6)
#asm as a share of total asm
inter_asm_perc = import_df(path='Total traffic and capacity by op region/International Available Seat Miles as  a Percent of Total System Available Seat Miles.xls', column=23, new_col_name='Internation ASM as % of total ASM', formatting=2)
#revenue as a share of total revenue
inter_rev_perc = import_df(path='Total traffic and capacity by op region/International Passenger Revenue as a Percent of Total System Revenue.xls', column=23, new_col_name='International Revenue as % of total Revenue', formatting=2)
#revenue per asm
inter_rev_per_asm = import_df(path='Total traffic and capacity by op region/International Passenger Revenue per ASM.xls', column=23, new_col_name='International Revenue ($) per ASM', formatting=-2)
#passenger yield
inter_pass_yield = import_df(path='Total traffic and capacity by op region/International Passenger Yield.xls', column=23, new_col_name='International Passenger Yield ($ per RPM)', formatting=-2)
#RPM
inter_rpm = import_df(path='Total traffic and capacity by op region/International Revenue Passenger Miles.xls', column=25, new_col_name='International RPM', formatting=6)
#rpm as a share of total rpm
inter_rpm_perc = import_df(path='Total traffic and capacity by op region/International Revenue Passenger Miles as a Percent of Total System Revenue Passenger Miles.xls', column=23, new_col_name='International RPM as % of total RPM', formatting=2)
#revenue
inter_rev = import_df(path='Total traffic and capacity by op region/Passenger Revenue -- International Operations.xls', column=25, new_col_name='International Passenger Revenue ($)', formatting=6)
#load factor
inter_load_factor = import_df(path='Total traffic and capacity by op region/International Load Factor .xls', column=23, new_col_name='International Load Factor', formatting=2)

#all international data: one column per metric, cast to float
inter_traffic = pd.concat(
    [inter_asm, inter_asm_perc, inter_rev_perc, inter_rev_per_asm, inter_pass_yield, inter_rpm, inter_rpm_perc, inter_rev, inter_load_factor],
    axis=1,
).astype('float64')

In [None]:
#Revenue - recreate table with individual airlines and their types
airline_rev=pd.read_excel("/kaggle/input/airline-data-project-mit-1995-2019/Original MIT data/Original MIT data/Total revenue/System Total Operating Revenue.xls")
airline_rev=airline_rev.drop(["Unnamed: 1"],axis=1) #removing empty column
airline_rev=airline_rev.transpose() #transposing data frame
header=airline_rev.iloc[0] #first transposed row supplies the new column names
airline_rev=airline_rev[1:] #drop the row that was just turned into the header
airline_rev=airline_rev.rename(columns = header) #replacing the column headers with the header row
Year=np.arange(1995,2019,1)
airline_rev["Year"]=Year #one year label per remaining row
airline_rev=airline_rev.loc[:,["Year","American","Delta","United"," --sub Network","Southwest","Frontier","Alaska",
                               " -- sub LCC","Hawaiian","Spirit"," -- sub Other","Total Industry"]] #take relevant columns only

airline_rev.reset_index(drop=True, inplace=True) #reset index
airline_rev.set_index("Year", inplace=True) #make year the new index
airline_rev=airline_rev*10**9
# astype returns a new frame; the result must be assigned or the columns stay object dtype
airline_rev=airline_rev.astype('float64')

# Revenue Individual airlines (summed over all years in the table)
American_rev=airline_rev["American"].sum()
Delta_rev=airline_rev["Delta"].sum()
United_rev=airline_rev["United"].sum()
Southwest_rev=airline_rev["Southwest"].sum()
Frontier_rev=airline_rev["Frontier"].sum()
Alaska_rev=airline_rev["Alaska"].sum()
Hawaiian_rev=airline_rev["Hawaiian"].sum()
Spirit_rev=airline_rev["Spirit"].sum()

airlines_rev=np.array([American_rev,Delta_rev,United_rev,Southwest_rev,Frontier_rev,Alaska_rev,Hawaiian_rev,Spirit_rev])
# The leading empty label lines up with the tick at x=0, which has no bar
Labels1=["","American","Delta","United","Southwest","Frontier","Alaska","Hawaiian","Spirit"]

#airline types
network_rev=airline_rev[" --sub Network"].sum()
LCC_rev=airline_rev[" -- sub LCC"].sum()
Other_rev=airline_rev[" -- sub Other"].sum()

# Built once as an array (it used to be a list that was later rebuilt as an array)
# so it can be subtracted element-wise from the expense totals below
airline_types_rev=np.array([network_rev, LCC_rev, Other_rev])
Labels2=["","Network","LCC","Other"]

#Expenses - recreate table with individual airlines and their types
airline_exp=pd.read_excel("/kaggle/input/airline-data-project-mit-1995-2019/Original MIT data/Original MIT data/Total Expenses related/System Total Operating Expenses.xls")
airline_exp=airline_exp.transpose() #transposing data frame
header=airline_exp.iloc[0] #first transposed row supplies the new column names
airline_exp=airline_exp.rename(columns = header) #replacing the column headers with the header row
airline_exp=airline_exp[2:] #skip the header row and the "Unnamed: 1" row (not dropped beforehand here)
Year=np.arange(1995,2019,1)
airline_exp["Year"]=Year #one year label per remaining row
airline_exp=airline_exp.loc[:,["Year","American","Delta","United"," --sub Network","Southwest","Frontier","Alaska",
                               " -- sub LCC","Hawaiian","Spirit"," -- sub Other","Total Industry"]] #take relevant columns only
airline_exp.reset_index(drop=True, inplace=True) #reset index
airline_exp.set_index("Year", inplace=True) #make year the new index
airline_exp=airline_exp*10**9
# astype returns a new frame; the result must be assigned or the columns stay object dtype
airline_exp=airline_exp.astype('float64')

#Expenses - Individual airlines (summed over all years in the table)
American_exp=airline_exp["American"].sum()
Delta_exp=airline_exp["Delta"].sum()
United_exp=airline_exp["United"].sum()
Southwest_exp=airline_exp["Southwest"].sum()
Frontier_exp=airline_exp["Frontier"].sum()
Alaska_exp=airline_exp["Alaska"].sum()
Hawaiian_exp=airline_exp["Hawaiian"].sum()
Spirit_exp=airline_exp["Spirit"].sum()

airlines_exp=np.array([American_exp,Delta_exp,United_exp,Southwest_exp,Frontier_exp,Alaska_exp,Hawaiian_exp,Spirit_exp])
Labels3=["","American","Delta","United","Southwest","Frontier","Alaska","Hawaiian","Spirit"]

network_exp=airline_exp[" --sub Network"].sum()
LCC_exp=airline_exp[" -- sub LCC"].sum()
Other_exp=airline_exp[" -- sub Other"].sum()

airline_types_exp=np.array([network_exp,LCC_exp ,Other_exp])

Labels4=[" ","Network","LCC","Other"]

# profits: element-wise revenue minus expenses, same ordering as the label lists
airlines_prof=airlines_rev-airlines_exp

airline_types_prof=airline_types_rev-airline_types_exp


In [None]:
#passengers enplaned - recreate table with individual airlines and their types
airline_pass=pd.read_excel("/kaggle/input/airline-data-project-mit-1995-2019/Original MIT data/Original MIT data/Total traffic and capacity by op region/System Total Enplaned Passengers.xls")
airline_pass=airline_pass.drop(["Unnamed: 1"],axis=1) #removing empty column
airline_pass=airline_pass.transpose() #transposing data frame
header=airline_pass.iloc[0] #first transposed row supplies the new column names
airline_pass=airline_pass[1:] #drop the row that was just turned into the header
airline_pass=airline_pass.rename(columns = header) #replacing the column headers with the header row

Year=np.arange(1995,2019,1)
airline_pass["Year"]=Year #one year label per remaining row

airline_pass=airline_pass.loc[:,["Year","American","Delta","United"," --sub Network","Southwest","Frontier","Alaska",
                               " --sub LCC","Hawaiian","Spirit"," -- sub Other","Total All Sectors"]] #take relevant columns only

airline_pass.reset_index(drop=True, inplace=True) #reset index
airline_pass.set_index("Year", inplace=True) #make year the new index
airline_pass=airline_pass*10**3
# astype returns a new frame; the result must be assigned or the columns stay object dtype
airline_pass=airline_pass.astype('float64')

# Passengers - individual airlines (summed over all years in the table).
# These totals were previously read from airline_rev, i.e. they held revenue, not passengers.
American_pass=airline_pass["American"].sum()
Delta_pass=airline_pass["Delta"].sum()
United_pass=airline_pass["United"].sum()
Southwest_pass=airline_pass["Southwest"].sum()
Frontier_pass=airline_pass["Frontier"].sum()
Alaska_pass=airline_pass["Alaska"].sum()
Hawaiian_pass=airline_pass["Hawaiian"].sum()
Spirit_pass=airline_pass["Spirit"].sum()

airlines_pass=np.array([American_pass,Delta_pass,United_pass,Southwest_pass,Frontier_pass,Alaska_pass,Hawaiian_pass,Spirit_pass])
pass_labels=["American","Delta","United","Southwest","Frontier","Alaska","Hawaiian","Spirit"]

#airline types
network_pass=airline_pass[" --sub Network"].sum()
LCC_pass=airline_pass[" --sub LCC"].sum()
Other_pass=airline_pass[" -- sub Other"].sum()

airline_types_pass=[network_pass, LCC_pass, Other_pass]
pass_type_labels=["Network","LCC","Other"]

# Collecting variables

In [None]:
# Place the thematic tables side by side to get one wide frame (one column per metric)
df = pd.concat(
    [
        revenue,
        inter_traffic,
        dom_traffic,
        total_traffic,
        fleet,
        expenses,
        productivity,
        compensation,
    ],
    axis=1,
)
df.shape

# Feature Engineering
Total profits and the number of flights are important metrics that are relevant to our research questions and tell us a lot about the industry, but they are not explicitly provided in the data set.

In [None]:
#profits: operating revenue minus operating expenses
df["Total Profits"] = df["Total Operating Revenue ($)"] - df["Total Operating Expenses ($)"]

#Number of flights per year and day, derived as departed seats divided by average seats per aircraft
_flights_total = df["Total Departed Seats"] / df["Average Seat Capacity per Aircraft"]
# NOTE(review): the yearly figure is divided by 1000 while the daily one is not - confirm the intended unit
df["Flights per Year"] = _flights_total / 1000
df["Flights per day"] = _flights_total / (365)

# Dimension Reduction and Feature Selection

### PCA

There is a large number of features in the data set. Reducing the number of dimensions and looking at the principal components can help us understand which features are important and whether there is any underlying structure in the data set.

In [None]:
from sklearn import preprocessing
from sklearn.decomposition import PCA

# Rescale every column to the [0, 1] range so no feature dominates because of its units.
# The scaled array is wrapped in a frame without names, so its columns are labelled 0..n-1.
Scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
df_scaled = Scaler.fit_transform(df)
df_normal = pd.DataFrame(df_scaled)

# fit() returns the estimator itself, so construction and fitting can be chained
pca = PCA(n_components=2).fit(df_normal)
print("First principle component accounts for ",pca.explained_variance_ratio_[0], "of all variance")
print("Second principle component accounts for ",pca.explained_variance_ratio_[1], "of all variance")
df_normal.head()

In [None]:
# Project each row of the scaled data onto the two fitted components
df_projected=pca.transform(df_normal)
print(df_projected.shape)

fig, ax = plt.subplots(figsize=(8,8))
fig.suptitle('First two components')
ax.set_xlabel('PC_1')
ax.set_ylabel('PC_2')
pc_1_scores = df_projected[:,0]
pc_2_scores = df_projected[:,1]
ax.scatter(pc_1_scores, pc_2_scores, c = "#D06B36", s = 50, linewidth=0)
#need to normalise range normalise, scale with min and max or wrt to std with z scores for more robust

There seems to be no clear relationship between the principal components.

In [None]:
column_names=np.asarray(df.columns.values)
#show top 10 loadings of PC 1
# Positions of the ten largest absolute loadings, biggest first (computed once, reused twice)
pc_1_order = np.argsort(np.abs(pca.components_[0]))[::-1][0:10]
pc_1_loadings = np.asarray(pca.components_[0])[pc_1_order]
pc_1_names = np.asarray(column_names)[pc_1_order]

for name, loading in zip(pc_1_names, pc_1_loadings):
    print ( "Column \"" , name , "\" has a loading of: ", loading)

In [None]:
#show top 10 loadings of PC 2
# Positions of the ten largest absolute loadings, biggest first (computed once, reused twice)
pc_2_order = np.argsort(np.abs(pca.components_[1]))[::-1][0:10]
pc_2_loadings = np.asarray(pca.components_[1])[pc_2_order]
pc_2_names = np.asarray(column_names)[pc_2_order]

for name, loading in zip(pc_2_names, pc_2_loadings):
    print ( "Column \"" , name , "\" has a loading of: ", loading)

### Further PCA

We can attempt a more local PCA to see whether the components can be interpreted more easily:

#### Local Analysis:

In [None]:
#select the ten columns with the largest absolute loadings on the first component
columns_selected = np.argsort(np.abs(pca.components_[0]))[::-1][0:10]
column_names_selected = column_names[columns_selected]

# df_normal's columns are labelled by position, so the integer positions work as labels here
df_selected = pd.DataFrame(df_normal[columns_selected])
df_selected.columns = column_names_selected

In [None]:
# Two-component PCA restricted to the selected columns.
# fit() returns the estimator, so it is built and fitted in one step and then displayed.
pca_selected = PCA(n_components=2).fit(df_selected)
pca_selected

In [None]:
# Project the selected columns onto their own two components and plot the scores
projected_df_selected = pca_selected.transform(df_selected)

fig, ax = plt.subplots(figsize=(8,8))
fig.suptitle('PCs for Ailine Capacity and Income Variable')
ax.set_xlabel('PC_1')
ax.set_ylabel('PC_2')
ax.scatter(projected_df_selected[:,0], projected_df_selected[:,1])

In [None]:
print ("--- Firstly, the first component: ")
# Rank the selected columns by absolute loading on the first local component
pc_1_order = np.argsort(np.abs(pca_selected.components_[0]))[::-1][0:10]
pc_1_loadings = np.asarray(pca_selected.components_[0])[pc_1_order]
pc_1_names = np.asarray(column_names_selected)[pc_1_order]

for name, loading in zip(pc_1_names, pc_1_loadings):
    print ( "Column \"" , name , "\" has a loading of: ", loading)

In [None]:
print ("--- Secondly, the second component: ")
# Rank the selected columns by absolute loading on the second local component
pc_2_order = np.argsort(np.abs(pca_selected.components_[1]))[::-1][0:10]
pc_2_loadings = np.asarray(pca_selected.components_[1])[pc_2_order]
pc_2_names = np.asarray(column_names_selected)[pc_2_order]

for name, loading in zip(pc_2_names, pc_2_loadings):
    print ( "Column \"" , name , "\" has a loading of: ", loading)

Both components from a locally selected dataset return similar results. We have not been able to derive any further detail or structure from this further analysis. 

# Correlations across all features with Revenue and Profits

In [None]:
import seaborn as sns

# Pairwise correlation between every pair of columns in the combined frame
corr = df.corr()

plt.figure(figsize = (15,15))

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias no longer exists in current NumPy.
mask=np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)]=True

ax = sns.heatmap(
    corr,
    mask=mask,
    vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200))

In [None]:
import seaborn as sns

corr = df.corr()

# One-column frames holding every feature's correlation with the target, strongest positive first
revenue_corr = corr[["Total Operating Revenue ($)"]].sort_values(by =["Total Operating Revenue ($)"],ascending=False)
profit_corr = corr[["Total Profits"]].sort_values(by =["Total Profits"],ascending=False)

# Two panels side by side: revenue on the left, profits on the right
fig, (ax, ax1) = plt.subplots(1, 2, figsize = (20,20))

ax = sns.heatmap(
    revenue_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax)
ax.set_title("Revenue Correlation Bar", fontsize=15, color='dimgrey')

ax1 = sns.heatmap(
    profit_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax1)
ax1.set_title("Profits Correlation Bar", fontsize=15, color='dimgrey')

In [None]:
import seaborn as sns

# Correlations among the columns picked from the first principal component
corr = df[column_names_selected].corr()

plt.figure(figsize = (5,5))

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias no longer exists in current NumPy.
mask=np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)]=True

ax = sns.heatmap(
    corr,
    mask=mask,
    vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200)
)

In [None]:
#Summary statistics of top PCA loading features
# column_names_selected holds the ten columns with the largest absolute loadings on the first component
df[column_names_selected].describe()

### Pair Plot of selected features

In [None]:
# Grid of pairwise plots for the columns selected from the first principal component
sns.pairplot(df[column_names_selected])

The selected features are dominated by strong positive or negative correlations with one another, as shown by the preceding correlation heat map.

# Feature Importance

# EDA

# Revenue, Expenses and Profit
Looking at trends in Revenue, Expenses and Profits across the industry and in its subsets.

In [None]:
# Red-to-blue seaborn palette; the plot below takes color[0] for expenses and color[-1] for revenue
color = sns.color_palette("RdBu")
#sns.palplot(color)

#tick formatter
def billions(x, pos):
    """Format a tick value given in dollars as billions with one decimal, e.g. 2.5e9 -> '$2.5B'.

    `pos` is the tick position passed by the formatter machinery; it is not used.
    """
    in_billions = x*1e-9
    return f'${in_billions:1.1f}B'
formatter = FuncFormatter(billions)

plt.figure(figsize=(15, 7.5))
ax1 = plt.subplot(111)

Year=np.arange(1995,2019,1)

#turn off axis spines
for side in ("top", "bottom", "right", "left"):
    ax1.spines[side].set_visible(False)

#share x axis: profit bars on ax1 (left scale), revenue/expense lines on ax2 (right scale)
ax2 = ax1.twinx()

#turn off axis spines
for side in ("top", "bottom", "right", "left"):
    ax2.spines[side].set_visible(False)

#x axis tick parameters
ax1.xaxis.set_tick_params(labelsize=10, rotation=45)
# One tick per plotted year. The ticks used to come from np.arange(1995, 2018, 1),
# which stops at 2017 and left the last bar (2018) without a tick.
ax1.xaxis.set_ticks(Year)

ax1.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True, left=False, right=False, labelleft=True)
ax2.tick_params(right=False)

ax1.set_ylabel("Profit", fontsize=12, rotation=0)

ax2.set_ylabel("Revenues \nand \nExpenses",fontsize=12,rotation=0)
ax2.yaxis.set_label_coords(1.1,0.5)

#plot data on axis: bars are green for a profit and red for a loss
ax1.bar(Year, df["Total Profits"], label= "Profit/Loss", alpha=0.5, color=(df["Total Profits"] > 0).map({True: 'g', False: 'r'}))

ax2.plot(df["Total Operating Revenue ($)"], label ="Revenue",color =color[-1],ls="--", lw=2.5)
ax2.plot((df["Total Operating Expenses ($)"]), label ="Expenses",color =color[0],ls="--", lw=2.5)

#format y ticks to billions
ax1.yaxis.set_major_formatter(formatter)
ax2.yaxis.set_major_formatter(formatter)
ax2.set_ylim(0,230e9 )

#add legend
ax1.legend(bbox_to_anchor=(0.13, 0.9))
ax2.legend(bbox_to_anchor=(0.125, 1))
ax1.grid(False)
ax2.grid(False)
ax1.xaxis.grid(which="major")

#font parameters
title_font = {'family': 'sans-serif','color':  'dimgrey','weight': 'normal','size': 15,}#title font
plt.title("Total Industry Profits - 1995 to 2018 ", loc='center',fontdict=title_font, fontsize=15)

In [None]:
color = sns.color_palette("tab20c")
sns.palplot(color)

barwidth1 = 0.2
barwidth2 = 0.2

plt.figure(figsize=(17.5,15))

ax1=plt.subplot(211)

# Palette indices: 0/4/8 select three base colours, +1 and +2 select the neighbouring palette entries.
# The per-airline pattern repeats each base colour for 3, 2 and 3 bars respectively.
airline_hues = (0, 0, 0, 4, 4, 8, 8, 8)
type_hues = (0, 4, 8)

clrs1 = [color[h] for h in airline_hues]
clrs2 = [color[h + 1] for h in type_hues]

clrs3 = [color[h + 2] for h in airline_hues]
clrs4 = [color[h] for h in type_hues]

clrs5 = [color[h + 1] for h in airline_hues]
clrs6 = [color[h + 1] for h in type_hues]

# Grouped-bar x positions: revenue left of the slot, expenses on it, profit right of it.
airline_slots = np.arange(1,9)
type_slots = np.arange(1,4)

rev_x1, exp_x1, prof_x1 = airline_slots-0.25, airline_slots, airline_slots+0.25
rev_x2, exp_x2, prof_x2 = type_slots-0.25, type_slots, type_slots+0.25

# --- upper panel: one group of three bars per airline
for xs, heights, shades in ((rev_x1, airlines_rev, clrs1),
                            (exp_x1, airlines_exp, clrs3),
                            (prof_x1, airlines_prof, clrs5)):
    ax1.bar(xs, heights, color=shades, width=barwidth1)
ax1.yaxis.set_major_formatter(formatter)
ax1.set_ylabel("Dollars ($)")
ax1.set_xlabel("Airline")
# NOTE(review): tick positions are not set explicitly here, so Labels1 is applied to the
# automatically chosen ticks - confirm the labels line up with the bar groups.
ax1.set_xticklabels(Labels1)
ax1.set_title("Cumulative Revenue-Expense-Profits of Airlines 1995-2018")
ax1.grid(axis='y')

# --- lower panel: one group of three bars per airline type
ax2=plt.subplot(212)

for xs, heights, shades in ((rev_x2, airline_types_rev, clrs2),
                            (exp_x2, airline_types_exp, clrs4),
                            (prof_x2, airline_types_prof, clrs6)):
    ax2.bar(xs, heights, color=shades, alpha=0.8, width=barwidth2)
ax2.yaxis.set_major_formatter(formatter)
ax2.set_ylabel("Dollars ($)")
ax2.set_xlabel("Airline Type")
ax2.set_xticks((np.arange(4)))
ax2.set_xticklabels(Labels4)
ax2.set_xlim(0.4,3.6)
ax2.set_title("Cumulative Revenue-Expense-Profits of Airline types 1995-2018")
ax2.grid(axis='y')

# Profit as a share of revenue for each airline type (positions 0, 1, 2).
profit_share = airline_types_prof/airline_types_rev
print("Network carrier produce",profit_share[0]*100, "% profit")
print("Low cost carrier produce",profit_share[1]*100, "% profit")
print("Other airlines produce",profit_share[2]*100, "% profit")


print(airline_types_prof)
print("ratio of Network profit vs LCC ",(airline_types_prof[0]/airline_types_prof[1]))

In [None]:
# Side-by-side pie charts: share of revenue and of expenses by airline type.
Labels=["Network","LCC","Other"]
pie_options = dict(labels=Labels, autopct='%1.1f%%')

plt.figure(figsize=(15,15))
ax1, ax2 = plt.subplot(121), plt.subplot(122)

ax1.pie(airline_types_rev, **pie_options)
ax2.pie(airline_types_exp, **pie_options)

ax1.set_title("Airline Revenue")
ax2.set_title("Airline Expense")

## Domestic vs International 

### differences and importance

In [None]:
# Display the International ASM column as the cell output.
df['International ASM']

In [None]:
#tick formatter
def billions(x, pos):
    """Format a tick value in billions with one decimal and no currency sign, e.g. 2.5e9 -> '2.5B'.

    NOTE: this reuses the name of the earlier '$'-prefixed `billions`, so from this cell
    onwards the name refers to the version without the dollar sign.

    x:   tick value in raw units.
    pos: second argument of the FuncFormatter callback signature; not used.
    """
    in_billions = x*1e-9
    return '%1.1fB' % in_billions
formatter = FuncFormatter(billions)

plt.figure(figsize=(12.5,10))

# Each panel draws the domestic bars first and the international bars second
# at the same x positions, so the international bars sit on top.

# top-left: capacity
ax1=plt.subplot(221)
p1=ax1.bar(Year, df["Domestic ASM"], label="Domestic")
p2=ax1.bar(Year, df["International ASM"], label="International")
ax1.yaxis.set_major_formatter(formatter)
ax1.set_title("Available Seat Miles (ASM)")
ax1.legend()

# top-right: traffic. RPM stands for Revenue Passenger Miles (the title previously read "Revenue Per Mile").
ax2=plt.subplot(222)
p3=ax2.bar(Year, df["Domestic RPM"],label="Domestic")
p4=ax2.bar(Year, df["International RPM"],label="International")
ax2.yaxis.set_major_formatter(formatter)
ax2.set_title("Revenue Passenger Miles (RPM)")
ax2.legend()

# bottom-left: load factor; the international bars are semi-transparent so both series stay visible
ax3=plt.subplot(223)
p5=ax3.bar(Year, df["Domestic Load Factor (%)"],label="Domestic")
p6=ax3.bar(Year, df["International Load Factor"],label="International", alpha=0.5)
ax3.set_title("Load Factor")
ax3.legend()

# bottom-right: passenger revenue
ax4=plt.subplot(224)
p7=ax4.bar(Year, df["Domestic Passenger Revenue ($)"],label="Domestic")
p8=ax4.bar(Year, df["International Passenger Revenue ($)"],label="International")
ax4.yaxis.set_major_formatter(formatter)
ax4.set_title("Passenger Revenue")
ax4.legend()
plt.show()

# Mean over all years of each domestic / international pair, printed as
# (domestic heading, domestic column, international heading, international column).
# Column names are used exactly as spelled in df (including "Internation ASM ...").
average_rows = (
    ("Avg Domestic ASM % of total: ", "Domestic ASM as % of total ASM",
     "Avg International ASM % of total: ", "Internation ASM as % of total ASM"),
    ("Avg Domestic RPM % of total: ", "Domestic RPM as % of total RPM",
     "Avg International RPM % of total: ", "International RPM as % of total RPM"),
    ("Avg Domestic Load factor", "Domestic Load Factor (%)",
     "Avg International Load Factor", "International Load Factor"),
    ("Avg Domestic Revenue % of total: ", "Domestic Revenue as % of total Revenue",
     "Avg International Revenue % of total: ", "International Revenue as % of total Revenue"),
)
for dom_heading, dom_col, intl_heading, intl_col in average_rows:
    print(dom_heading, np.mean(df[dom_col]))
    print(intl_heading, np.mean(df[intl_col]))
    print("")

In [None]:
#airline types: pull the three sub-total revenue series out of airline_rev
network_rev, LCC_rev, Other_rev = (
    airline_rev[key] for key in (" --sub Network", " -- sub LCC", " -- sub Other")
)

# NOTE(review): this rebinds airline_types_rev, a name an earlier cell already uses
# in arithmetic - confirm the earlier cell still works if it is re-run after this one.
airline_types_rev=[network_rev, LCC_rev, Other_rev]
Labels2=["Network Airlines","LCC Airlines","Other Airlines"]

plt.figure(figsize=(10,5))
ax1=plt.subplot(111)
# one line per business model, labelled in the same order as Labels2
for series, legend_text in zip(airline_types_rev, Labels2):
    ax1.plot(series, label=legend_text)
ax1.set_xlabel("Year")
ax1.set_ylabel("Revenue")
ax1.grid(axis='x')
ax1.set_title("Revenue of Different Airline Business models")
ax1.yaxis.set_major_formatter(formatter)
plt.legend()

In [None]:
# Passenger revenue over time, domestic against international.
plt.figure(figsize=(10,5))
ax1=plt.subplot(111)
for column, legend_text in (("Domestic Passenger Revenue ($)", "Domestic Flights"),
                            ("International Passenger Revenue ($)", "International Flights")):
    ax1.plot(df[column], label=legend_text)
ax1.set_title("Revenue of Different flight types")
ax1.set_xlabel("Year")
ax1.set_ylabel("Revenue")
ax1.yaxis.set_major_formatter(formatter)
ax1.grid(axis='x')
plt.legend()

In [None]:
# Domestic and international revenue shares plotted as two lines on the same axes (no labels or legend are set).
plt.plot(df["Domestic Revenue as % of total Revenue"])
plt.plot(df["International Revenue as % of total Revenue"])

# Flight Traffic

### Flights and Passengers

ASM describes the capacity to generate revenue, so it makes sense to look into traffic metrics, as these are what give rise to the available capacity.
We will look at the number of flights and the total number of passengers against revenue, as well as ASM and RPM:

In [None]:
# `import scipy` alone does not guarantee that the stats submodule is loaded on every
# SciPy version, so import the submodule explicitly before using sp.stats below.
import scipy as sp
import scipy.stats

color = sns.color_palette("RdBu")


def _bare_twin_axes(position):
    """Create subplot `position` plus a twinx partner, with spines and tick marks hidden.

    Returns (left_axis, right_axis). The shared x axis gets a tick every second year
    from 1995 to 2017; tick labels stay visible on the left axis.
    """
    left = plt.subplot(position)
    right = left.twinx()
    for axis in (left, right):
        for side in ("top", "bottom", "right", "left"):
            axis.spines[side].set_visible(False)
    left.xaxis.set_tick_params(labelsize=10)
    left.xaxis.set_ticks(np.arange(1995, 2018, 2))
    left.tick_params(axis="both", which="both", bottom=False, top=False, labelbottom=True, left=False, right=False, labelleft=True)
    right.tick_params(right=False)
    return left, right


def _finish_twin_axes(left, right):
    """Add the two legends, keep only vertical grid lines and format the right axis in billions."""
    left.legend(bbox_to_anchor=(0.1, 0.08), loc=2,)
    right.legend(bbox_to_anchor=(0.1, 0.13), loc=2,)
    left.grid(False)
    right.grid(False)
    left.xaxis.grid(which="major")
    right.yaxis.set_major_formatter(formatter)


plt.figure(figsize=(12.5, 12.5))
title_font = {'family': 'sans-serif','color':  'dimgrey','weight': 'normal','size': 12,}#title font

# --- top panel: flights per day (left axis) against total revenue (right axis)
ax1, ax2 = _bare_twin_axes(211)
ax1.set_ylabel("Flights Per Day (Thousand)",color="dimgrey")
ax2.set_ylabel("Revenue ($)",color="dimgrey")

ax1.plot(df["Flights per day"],ls=":",lw=2.5,color=color[-1],label="Industry Flights Per Day")
ax2.plot(total_revenue,color =color[0],ls=":", lw=2.5, label="Industry Total Revenue")

_finish_twin_axes(ax1, ax2)
plt.title("Flights per Day vs Revenue - 1995 to 2018 ", loc='center',fontdict=title_font, fontsize=15)

# --- bottom panel: enplaned passengers per day (left axis) against total revenue (right axis)
ax3, ax4 = _bare_twin_axes(212)
ax3.set_ylabel("Enplaned Passengers per Day",color="dimgrey")
ax4.set_ylabel("Revenue ($)",color="dimgrey")

# the yearly total is divided by 365 to give a per-day figure
ax3.plot((total_traffic["Total Enplaned Passengers"]/365),ls=":",lw=2.5,color=color[-1],label="Enplaned Passengers per Day")
ax4.plot(total_revenue,color =color[0],ls=":", lw=2.5, label="Industry Total Revenue")

_finish_twin_axes(ax3, ax4)
plt.title("Passengers per day vs Revenue - 1995 to 2018 ", loc='center',fontdict=title_font, fontsize=15)

# pearsonr returns (coefficient, p-value); only the coefficient is printed
print("Correlation of number of passengers with revenue", sp.stats.pearsonr(df["Total Enplaned Passengers"],df["Total Operating Revenue ($)"] )[0])

There is a very strong correlation between the number of enplaned passengers and revenue. Divergence or convergence between the two series can indicate a rise or drop in the revenue made per passenger. External events have a big influence on how people travel, and this is shown by the way the dips coincide with such events. It is through passenger numbers that external events can shock operating revenues.

In [None]:
#tick formatter
def billions(x, pos):
    """Format a tick value in billions with one decimal and no currency sign, e.g. 2.5e9 -> '2.5B'.

    x:   tick value in raw units.
    pos: second argument of the FuncFormatter callback signature; not used.
    """
    in_billions = x*1e-9
    return '%1.1fB' % in_billions
formatter = FuncFormatter(billions)

plt.figure(figsize=(17.5,7.5))

# left: enplaned passengers per airline
ax1=plt.subplot(121)
ax1.bar(pass_labels,airlines_pass, color=clrs1)
ax1.set(title="Total Number of Enplaned Passengers per Airline",
        ylabel="Enplaned Passengers",
        xlabel="Airline")
ax1.yaxis.set_major_formatter(formatter)
ax1.grid(axis='y')

# right: enplaned passengers per airline type
ax2=plt.subplot(122)
ax2.bar(pass_type_labels,airline_types_pass,color=clrs4)
ax2.set(title="Total Number of Enplaned Passengers per Airline Type",
        ylabel="Enplaned Passengers",
        xlabel="Airline Type")
ax2.yaxis.set_major_formatter(formatter)
ax2.grid(axis='y')

# separate figure: passenger share by airline type
plt.figure()
ax3=plt.subplot(111)
ax3.set_title("Market share of Passengers")
ax3.pie(airline_types_pass,labels=Labels, autopct='%1.1f%%')

In [None]:
# Switch the shared `color` palette to "Paired" and show its swatches.
color = sns.color_palette("Paired")
sns.palplot(color)

In [None]:
#Relative Change in ASM and RPM over the years is very small
#Calculate the percentage change and plot that instead of actual values
Year=np.arange(1995,2019,1)

#Year-on-year change of ASM and RPM, stored on df.
#NOTE: pct_change() returns fractions (0.05 means 5%), not percentages.
df["% ASM Change"]=total_asm.pct_change()
df["% RPM Change"]=total_rpm.pct_change()

#the first year has no predecessor: replace its NaN with 0
df["% RPM Change"]=df["% RPM Change"].fillna(0)
df["% ASM Change"]=df["% ASM Change"].fillna(0)

#define bar height variables (already NaN-free after the fill above)
ASM=df["% ASM Change"]
RPM=df["% RPM Change"]

#set bar width
barwidth=0.35

#create figure
plt.figure(figsize=(10,12.5))
ax1=plt.subplot(311)
plt.subplots_adjust(hspace=0.4)

#x position of bars (asm at x and rpm at x + barwidth)
asm_x=Year
rpm_x=[x+barwidth for x in asm_x]

#plot bars
ax1.bar(asm_x,ASM,width=barwidth,color =color[1],alpha=0.8, label="Available Seat Miles" )
#RPM stands for Revenue Passenger Miles (the legend previously read "Revenue Per Mile")
ax1.bar(rpm_x,RPM,width=barwidth, color=color[5],alpha=0.8, label="Revenue Passenger Miles")


#configure axis, ticks and legend
ax1.set_xlabel("Year")
ax1.set_ylabel("% Change")
ax1.set_title("Year-on-Year Percentage change of ASM and RPM")
ax1.set_xlim(1997,2019)
plt.xticks(rotation=45)
ax1.set_xticks(Year)
ax1.grid(axis='x')
ax1.legend()

ax2=plt.subplot(312)

flights=df["Flights per Year"]


flights_x=Year

#the series is flights per YEAR, so the legend entry says so (it previously read "Flights per Day")
plt.bar(flights_x, flights, width=barwidth,color=color[3], alpha=0.8, label="Flights per Year")

#configure axis, ticks and legend
plt.xlabel("Year")
plt.ylabel("Number of Flights")
plt.title("Number of Flights per Year")
plt.xticks(rotation=45)
plt.xlim(1995,2019)
ax2.xaxis.set_ticks(Year)
plt.grid(axis='x')
plt.legend()

ax3=plt.subplot(313)

passengers=(df["Total Enplaned Passengers"])

pass_x=Year

plt.bar(pass_x, passengers, width=barwidth,color=color[7], alpha=0.8, label="Passengers per Year")

#configure axis, ticks and legend
plt.xlabel("Year")
plt.ylabel("Passengers")
plt.title("Number of Passengers per Year")
plt.xticks(rotation=45)
plt.xlim(1995,2019)
ax3.xaxis.set_ticks(Year)
plt.grid(axis='x')
plt.legend()

#difference between the 2018 and 2009 year-on-year changes (values are looked up by year label)
print("ASM recovered from 2008 by ", (ASM[2018]-ASM[2009]))
print("RPM recovered from 2008 by ", (RPM[2018]-RPM[2009]))
print("")
#relative growth from 2009 to 2018; the first line reports flights (it was previously mislabelled as passengers)
print("No. of Flights recovered from 2008 by",((flights[2018]-flights[2009])/flights[2009])*100, "%")
print("No. of Passengers recovered from 2008 by",((passengers[2018]-passengers[2009])/passengers[2009])*100, "%")

- RPM lags behind ASM, however the % change in RPM is amplified in comparison to ASM.
- a larger ASM indicates larger capacity to generate revenue from more seats.
- In 2001 and 2002 ASM growth was reversed, possibly due to the events of 9/11. The growth in RPM dropped but was still positive in 2001 but shrunk drastically in the following year. This again shows the influence of external events on the US Commercial airline Industry.
- between 2003 to 2007 and 2010 to 2018 there is a positive trend in ASM and RPM however 2010 to 2018 has a lesser rate, possibly due to the longer lasting effects of the financial crisis. 
- Flight and passenger numbers generally follow the same pattern as ASM and RPM, but in the later years the number of passengers increased more sharply than the number of flights. This would indicate that flights are carrying more passengers on average, yet RPM and ASM did not increase as you might expect. This could mean that an increase in a cost counteracted the expected effect.

Looking into the cost per available seat mile (CASM), as suggested by our PCA, will give us a clearer picture of what's going on.

### Cost per Available Seat Mile (CASM)

In [None]:
plt.figure(figsize=(15,7.5))

# Three CASM variants, each excluding one more cost component than the previous one.
casm1=df["CASM excluding Transport ($ per ASM)"]
casm2=df["CASM excluding Transport and Fuel ($ per ASM)"]
casm3=df["CASM excluding Transport Fuel and labour ($ per ASM)"]

# The bars are drawn at the same x positions in this order, so later series overlay earlier ones.
p1, p2, p3 = (
    plt.bar(Year, series, label=legend_text)
    for series, legend_text in ((casm1, "CASM excluding transport"),
                                (casm2, "CASM excluding transport and fuel"),
                                (casm3, "CASM exlcuding transport fuel and labour"))
)

plt.title("Cost per Available Seat Mile")
plt.xlabel("Year")
plt.ylabel("$ \nper \nASM", rotation=0)
plt.xticks(Year, rotation=45)
plt.grid(axis='x')
plt.legend(loc='upper left')

plt.show()
# change between the first (1995) and last (2018) year for two of the variants
print("overall increase of CASM with labour,excluding transport and fuel is ",casm2[2018] - casm2[1995])
print("overall increase of CASM with labour and fuel excluding transport is ",casm1[2018] - casm1[1995])

# Expenses Breakdown

In [None]:
#stack plot of expenses table.
labels=["Labor","Fuel", "Transport", "Outsourcing","Management and Other", "Maintenance", "Outsourced Maintenance"]

#Expense columns in the same order as `labels`, one per stacked band.
#The labour column was previously missing: seven labels were applied to six series,
#so every band in the legend carried the label of a different expense.
expense_columns=["Total Labour Operating Expenses ($)",
                 "Total Fuel Expenses ($)",
                 "Transport Related Expenses ($)",
                 "Other Outsourcing Expenses ($)",
                 "Management and Other Expenses ($)",
                 "Flight Equipment Maintenance Expense ($)",
                 "Outsourced Flight Equipment Maintenance Expense ($)"]
assert len(expense_columns) == len(labels), "each stacked series needs exactly one label"

plt.figure(figsize=(10,7.5))
ax=plt.subplot(111)

ax.stackplot(Year, *(expenses[column] for column in expense_columns), labels=labels)


ax.yaxis.set_major_formatter(formatter)
ax.set_ylabel("Cumulative Expense $")
ax.set_xlabel("Year")

plt.legend(loc="upper left")
#2018 values of the fuel and transport expense series
print(expenses["Total Fuel Expenses ($)"][2018])
print(expenses["Transport Related Expenses ($)"][2018])

All expenses follow the same shape, with dips at 2001 and 2008, as the total revenue and expenses shown earlier do. In general, the proportion of each expense remains relatively consistent across the years.

In [None]:
# take average percentage of total expenses and plot against each feature selected
plt.figure(figsize=(20,7.5))

ax1=plt.subplot(121)

total=expenses["Total Operating Expenses ($)"]

# Expense columns, in the same order as the `labels` list defined in the stack-plot cell.
percentage_columns = ("Total Labour Operating Expenses ($)",
                      "Total Fuel Expenses ($)",
                      "Transport Related Expenses ($)",
                      "Other Outsourcing Expenses ($)",
                      "Management and Other Expenses ($)",
                      "Flight Equipment Maintenance Expense ($)",
                      "Outsourced Flight Equipment Maintenance Expense ($)")

# For each expense: its yearly share of total operating expenses (in %), averaged over all years.
expense_percentages=[np.mean((expenses[column]/total)*100) for column in percentage_columns]
(labor_perc, fuel_perc, trans_perc, outsource_perc,
 mgmnt_perc, maint_perc, outs_maint_perc) = expense_percentages

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax1=sns.barplot(x=labels, y=expense_percentages, ax=ax1)

plt.grid(axis='y')
plt.title("Operating Expenses as Average Percentage of Total")
plt.ylabel("%")
plt.xticks(rotation=45)

ax2=plt.subplot(122)

explode = (0.1, 0, 0, 0,0,0,0)  # explode 1st slice

plt.pie(expense_percentages,explode=explode, labels=labels, startangle=52.5)

plt.axis('equal')

### Breakdown of Labour expenses through Employee Compensation

In [None]:
#sankey diagram 1st layer connections
# For each column: (sum over all years) / (sum of total operating revenue over all years).
print("All expenses fraction of revenue:")

for column in ("Total Labour Operating Expenses ($)",
               "Total Fuel Expenses ($)",
               "Transport Related Expenses ($)",
               "Other Outsourcing Expenses ($)",
               "Management and Other Expenses ($)",
               "Flight Equipment Maintenance Expense ($)",
               "Outsourced Flight Equipment Maintenance Expense ($)",
               "Total Profits"):
    print((df[column]).sum()/(df["Total Operating Revenue ($)"].sum()))


# mean over the years of labour expense per full-time-equivalent employee
avg_emp_exp=np.mean(df["Total Labour Operating Expenses ($)"]/df["Full Time Employees Equivalents"])

# sankey 2nd layer - labour expenses to compensation
# Unlike the first group, these are sums of the yearly ratios (column / revenue), not ratios of sums.

print("Compensation fraction of revenue:")
for column in ("Non Cockpit Employees Avg Wage ($)",
               "All Employees average benefits and pensions ($)",
               "Average Pilot and Co-Pilot wages ($)",
               "Average Flight Attendant Employee Wage ($)",
               "Average Ground Staff Employee Wage ($)",
               "Average Maintenance Staff Wage ($)",
               "Average Management and Others Wage ($)"):
    print((df[column]/df["Total Operating Revenue ($)"]).sum())


In [None]:
#Sankey diagram of all cash flow for total industry

import plotly.graph_objects as go

# Node labels; a node's position in this list is the index used by the links below.
node_labels = ["Total System Revenue",                      # 0
               "Labor Expenses", "Fuel Expenses", "Transport Expenses", "OutSourcing Expenses",   # 1-4
               "Management Expenses", "Flight Equipment Maintenance",                              # 5-6
               "Flight Equipment OutSourcing Expenses", "Profits",                                 # 7-8
               "Non Cockpit Employee Wage", "All Employee Benefits", "Pilot Wage",                 # 9-11 (typo "Cockput" fixed)
               "Flight Attendant Wage", "Ground Staff Wage", "Maintenance Wage",                   # 12-14
               "Management/Other Wage"]                                                            # 15
node_colors = ["dodgerblue", "red","blueviolet","gold","orchid","teal", "gray","orange","limegreen",
               "red","red","red","red","red","red","red",]

# First layer: revenue (node 0) flows into nodes 1-8.
# Second layer: labour expenses (node 1) flow into the compensation nodes 9-15.
link_sources = [0]*8 + [1]*7
link_targets = list(range(1, 16))
# NOTE(review): the link values are hard-coded rather than computed from df - confirm they
# still match the fractions printed in the previous cell.
link_values = [210000, 150000, 80000, 60000, 30000, 60000, 30000, 40000, 6000, 3000, 30000, 7000,7000,10000,12000]

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 10,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = node_labels,
      color = node_colors
    ),
    link = dict(
      source = link_sources,
      target = link_targets,
      value =  link_values
  ))])

fig.update_layout(title_text="Sankey Diagram of Cumulative Cash Flow in US Commercial Airline Industry", font_size=12.5)
fig.show()

In [None]:
# `import scipy` alone does not guarantee that the stats submodule is loaded on every
# SciPy version, so import the submodule explicitly before using sp.stats below.
import scipy as sp
import scipy.stats

# Year-on-year change in full-time-equivalent employees.
# pct_change() returns fractions (0.05 means 5%); the first year is NaN.
employee_change=compensation["Full Time Employees Equivalents"].pct_change()
Year=np.arange(1995,2019,1)

fig, ax = plt.subplots(figsize=(15,7))


ax.bar(Year,employee_change, alpha=0.75, color = 'g')
plt.title("Recruitment Drives/Cuts")
ax.set_ylabel("% change of Full time employee equivalents")
plt.xticks(rotation=45)


# second y axis: total operating expenses as a dashed red line
ax2 = ax.twinx()

ax2.plot(df["Total Operating Expenses ($)"], lw=1, ls='--', color='r')
ax2.yaxis.set_major_formatter(formatter)
ax2.set_ylabel("Expenses ($)")
ax.grid(axis='x')

# Pearson correlation of head count with each financial series.
# pearsonr returns (coefficient, p-value); only the coefficient is printed.
# NOTE(review): expenses are negated before correlating, which flips the sign of the
# reported coefficient - confirm this is intended.
employees = compensation["Full Time Employees Equivalents"]
for heading, series in (
        ("Pearsons Correlation between No. employees and Total Profits is: ",
         df["Total Profits"]),
        ("Pearsons Correlation between No. employees and Total system Revenue is: ",
         df["Total Operating Revenue ($)"]),
        ("Pearsons Correlation between No. employees and Total system Expenses is: ",
         -1*df["Total Operating Expenses ($)"])):
    print(heading, sp.stats.pearsonr(employees, series)[0])




# Forecasting and Modelling
ARIMA Forecasting on major features

ARIMA on Revenue, Expenses, Profits, Flights per Day or passengers

- Model can be created using statsmodels library as follows:

1) Define model by calling ARIMA() and passing p, d, q parameters and using the Augmented Dickey-Fuller test to determine these values.

2) model is prepared on the training data by calling the fit() function

3) predictions can be made by calling the forecast() function and specifying the index of time period to be predicted

This process will be repeated for series being modelled.

It must be noted that we do not expect spectacular results since the data is presented on a year-by-year basis and there are only 24 data points to work with, rather this is a demonstration of how to implement and assess the components of ARIMA forecasting. 

In [None]:
# Stationarity has to be checked before fitting ARIMA. Two common ways are rolling
# statistics and the augmented Dickey-Fuller (ADF) test; the ADF test is the one relied on here.
# The ADF null hypothesis is that the series has a unit root (is non-stationary): a small
# p-value, i.e. an ADF statistic below the 1% / 5% / 10% critical values, supports stationarity.

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
# NOTE(review): the arima_model module is the legacy ARIMA API - confirm it exists in the
# installed statsmodels version.
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from pandas.plotting import register_matplotlib_converters
# Call the function: the bare name on its own line only evaluated the function object
# and never registered the converters.
register_matplotlib_converters()

In [None]:
### Checking series for stationarity and determining a suitable d value (for the I parameter).

In [None]:
# Helper that runs the two stationarity checks (rolling statistics plot + ADF test) on a series.

def get_stationarity(timeseries):
    """Plot rolling statistics for `timeseries` and print its ADF test results.

    timeseries: object supporting `.rolling(window=...)` (e.g. a pandas Series).

    Draws the series together with its 5-period rolling mean and rolling standard
    deviation, then prints the ADF statistic, the p-value and the critical values.
    Returns None.
    """
    window = 5
    smoothed = timeseries.rolling(window=window).mean()
    spread = timeseries.rolling(window=window).std()

    # rolling statistics plot: series, rolling mean, rolling std (in that draw order)
    for data, shade, legend_text in ((timeseries, 'blue', 'Original'),
                                     (smoothed, 'red', 'Rolling Mean'),
                                     (spread, 'black', 'Rolling Std')):
        plt.plot(data, color=shade, label=legend_text)
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test: element 0 is the statistic, 1 the p-value, 4 the critical values
    adf_result = adfuller(timeseries)
    statistic, p_value, critical_values = adf_result[0], adf_result[1], adf_result[4]
    print('ADF Statistic: {}'.format(statistic))
    print('p-value: {}'.format(p_value))
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print('\t{}: {}'.format(level, threshold))

In [None]:
# De-trend revenue by subtracting its 5-year rolling mean, drop the leading NaNs
# produced by the rolling window, then run the stationarity checks on the remainder.
rev = df['Total Operating Revenue ($)']
rolling_mean = rev.rolling(window=5).mean()
rev_minus_mean = (rev - rolling_mean).dropna()

get_stationarity(rev_minus_mean)

Subtracting the rolling mean from the revenue made the p-value small and brought the value of the Augmented Dickey-Fuller statistic close to the required thresholds, so we can conclude that the resulting series is stationary. When making our model, we now know that a d value of 1 should make the series stationary.

### Finding a starting point for the p value (AR parameter)

In [None]:
# Autocorrelation of the revenue series by lag; used to pick a starting AR order (p).
pd.plotting.autocorrelation_plot(rev)

There is significant positive correlation for the first 4 or 5 lags, so 4 may be a good place to start the AR parameter (p value) from.

To make a starting guess, a difference order of 1 for the I parameter (d value) will be used to achieve stationarity, and the MA parameter (q value) of the moving average model will be set to 0.

(4,1,0) will be used as a baseline.

In [None]:
# Fit ARIMA with order (p=1, d=1, q=0) on the full revenue series and print the fit summary.
# NOTE(review): the markdown above proposes (4,1,0) as the baseline order, but (1,1,0)
# is what is fitted here - confirm which one is intended.
model=ARIMA(rev, order=(1,1,0))
model_fit=model.fit(disp=0)
print(model_fit.summary())


In [None]:
#plot residual errors of the fitted revenue model as a line plot

residuals=pd.DataFrame(model_fit.resid)
residuals.plot()

In [None]:
# Kernel density estimate of the residuals, to inspect their distribution and centre.
residuals.plot(kind='kde')

The error density plot shows that the errors are approximately Gaussian and that there is a bias in prediction (non-zero mean).


### Rolling forecast ARIMA model:

In [None]:
# Walk-forward validation on revenue: refit ARIMA on everything seen so far, forecast
# one step ahead with forecast(), then append the true value before the next step.
# The train/test split is roughly 2/3 to 1/3.

from sklearn.metrics import mean_squared_error, r2_score


X = rev.values
size = int(len(X) * 0.66)
train, test = X[:size], X[size:]
train_years = 1995 + np.arange(0, size)
test_years = 1995 + np.arange(size, len(X))

history = list(train)
predictions = []
for obs in test:
    model = ARIMA(history, order=(1,1,0))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]   # first element of the forecast() result: the point forecast
    predictions.append(yhat)
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))

error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)

r_sq = r2_score(test, predictions)
print('R squared: ', r_sq)

# plot held-out values, one-step predictions and the training segment
plt.plot(test_years, test, label="Test Set")
plt.plot(test_years, predictions, color='red', label="Prediction")
plt.plot(train_years, train, label="Train Set")
plt.grid(axis='x')
plt.legend()
plt.show()

In [None]:
# Ten-step recursive forecast of revenue: each forecast is appended to the history
# and the model is refitted on observed + forecast values for the next step.
# NOTE(review): order (2,1,0) is used here while the validation cell uses (1,1,0) - confirm.
X = rev.values
size = int(len(X))
history = list(X)
predictions = []
for step in range(10):
    model = ARIMA(history, order=(2,1,0))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    history.append(yhat)
    print('predicted=%f' % (yhat))

plt.plot(history)
# lengths of the extended series and of the observed series
print(len(history))
print(len(X))

In [None]:
plt.figure(figsize=(9,3))
ax1=plt.subplot(111)

# x values for observed + forecast points (1995..2028)
px = np.arange(1995,2029)

ax1.plot(px,history, label="Total Industry Revenue")
# dashed red marker at 2018, the last observed year
ax1.vlines(x=2018, ymin=0, ymax=(300*10**11), ls='dashed', color='r')
ax1.set_ylim(min(history), max(history))

# straight-line fit through the observed years only, evaluated over the whole px range
c = np.polyfit(Year, rev, 1)
print(c)
p = np.poly1d(c)
py = p(px)
ax1.plot(px, py, lw=0.5, ls='--', label="Linear Fit of Initial Trend")

ax1.set_xlim(1994,2028)
ax1.yaxis.set_major_formatter(formatter)
ax1.grid(axis='x')
plt.legend()
plt.show()

### ARIMA for No. of Passengers Enplaned

In [None]:
# De-trend passenger counts by subtracting their 5-year rolling mean, drop the leading
# NaNs produced by the rolling window, then run the stationarity checks.
passengers = df["Total Enplaned Passengers"]
rolling_mean = passengers.rolling(window=5).mean()
passengers_minus_mean = (passengers - rolling_mean).dropna()

get_stationarity(passengers_minus_mean)

In [None]:
# Autocorrelation of the passenger series by lag.
pd.plotting.autocorrelation_plot(passengers)

In [None]:
# Fit ARIMA with order (1,1,0) on the full passenger series and print the fit summary.
model1=ARIMA(passengers, order=(1,1,0))
model_fit1=model1.fit(disp=0)
print(model_fit1.summary())

In [None]:
#plot residual errors of the fitted passenger model as a line plot

residuals1=pd.DataFrame(model_fit1.resid)
residuals1.plot()

In [None]:
# Kernel density estimate of the passenger-model residuals.
residuals1.plot(kind='kde')

Evidence of a slight positive bias

In [None]:
# Walk-forward validation on passengers: refit ARIMA on everything seen so far, forecast
# one step ahead with forecast(), then append the true value before the next step.
# The train/test split is roughly 2/3 to 1/3.

from sklearn.metrics import mean_squared_error, r2_score

X1 = passengers.values
size1 = int(len(X1) * 0.66)
train1, test1 = X1[:size1], X1[size1:]
train_years1 = 1995 + np.arange(0, size1)
test_years1 = 1995 + np.arange(size1, len(X1))

history1 = list(train1)
predictions1 = []
for obs1 in test1:
    model1 = ARIMA(history1, order=(1,1,0))
    model_fit1 = model1.fit(disp=0)
    yhat1 = model_fit1.forecast()[0]   # first element of the forecast() result: the point forecast
    predictions1.append(yhat1)
    history1.append(obs1)
    print('predicted=%f, expected=%f' % (yhat1, obs1))

error1 = mean_squared_error(test1, predictions1)
print('Test MSE: %.3f' % error1)


r_sq1 = r2_score(test1, predictions1)
print('R squared: ', r_sq1)

# plot held-out values, one-step predictions and the training segment
plt.plot(test_years1, test1, label="Test Set")
plt.plot(test_years1, predictions1, color='red', label="Prediction")
plt.plot(train_years1, train1, label="Train Set")
plt.grid(axis='x')
plt.legend()
plt.show()

In [None]:
# Ten-step recursive forecast of passengers: each forecast is appended to the history
# and the model is refitted on observed + forecast values for the next step.
# NOTE(review): order (2,1,0) is used here while the validation cell uses (1,1,0) - confirm.
X1 = passengers.values
size1 = int(len(X1))
history1 = list(X1)
predictions1 = []
for step in range(10):
    model1 = ARIMA(history1, order=(2,1,0))
    model_fit1 = model1.fit(disp=0)
    yhat1 = model_fit1.forecast()[0]
    history1.append(yhat1)
    print('predicted=%f' % (yhat1))

plt.plot(history1)
# lengths of the extended series and of the observed series
print(len(history1))
print(len(X1))

In [None]:
plt.figure(figsize=(8,3))
ax1=plt.subplot(111)

# x values for observed + forecast points (1995..2028)
px = np.arange(1995,2029)

ax1.plot(px,history1, label="Total Industry Passengers")
# dashed red marker at 2018, the last observed year
ax1.vlines(x=2018, ymin=0, ymax=(300*10**11), ls='dashed', color='r')
ax1.set_ylim(min(history1), max(history1))

# straight-line fit through the observed years only, evaluated over the whole px range
c = np.polyfit(Year, passengers, 1)
print(c)
p = np.poly1d(c)
py = p(px)
ax1.plot(px, py, lw=0.5, ls='--', label="Linear Fit of Initial Trend")

ax1.set_xlim(1994,2028)
ax1.yaxis.set_major_formatter(formatter)
ax1.grid(axis='x')
plt.legend()
plt.show()

### ARIMA for expenses

In [None]:
# De-trend operating expenses by subtracting their 5-year rolling mean, drop the leading
# NaNs produced by the rolling window, then run the stationarity checks.
exp = df["Total Operating Expenses ($)"]
rolling_mean = exp.rolling(window=5).mean()
exp_minus_mean = (exp - rolling_mean).dropna()

get_stationarity(exp_minus_mean)

In [None]:
# Autocorrelation of the expenses series by lag.
pd.plotting.autocorrelation_plot(exp)

Starting p value of 2 seems reasonable

In [None]:
# Fit ARIMA with order (2,1,0) on the full expenses series and print the fit summary.
model2=ARIMA(exp, order=(2,1,0))
model_fit2=model2.fit(disp=0)
print(model_fit2.summary())

In [None]:
#plot residual errors of the fitted expenses model as a line plot

residuals2=pd.DataFrame(model_fit2.resid)
residuals2.plot()

In [None]:
# Kernel density estimate of the expenses-model residuals.
residuals2.plot(kind='kde')

In [None]:
# Walk-forward validation on expenses: refit ARIMA on everything seen so far, forecast
# one step ahead with forecast(), then append the true value before the next step.
# The train/test split is roughly 2/3 to 1/3.

from sklearn.metrics import mean_squared_error, r2_score

X2 = exp.values
size2 = int(len(X2) * 0.66)
train2, test2 = X2[:size2], X2[size2:]
train_years2 = 1995 + np.arange(0, size2)
test_years2 = 1995 + np.arange(size2, len(X2))

history2 = list(train2)
predictions2 = []
for obs2 in test2:
    model2 = ARIMA(history2, order=(2,1,0))
    model_fit2 = model2.fit(disp=0)
    yhat2 = model_fit2.forecast()[0]   # first element of the forecast() result: the point forecast
    predictions2.append(yhat2)
    history2.append(obs2)
    print('predicted=%f, expected=%f' % (yhat2, obs2))

error2 = mean_squared_error(test2, predictions2)
print('Test MSE: %.3f' % error2)


r_sq2 = r2_score(test2, predictions2)
print('R squared: ', r_sq2)

# plot held-out values, one-step predictions and the training segment
plt.plot(test_years2, test2, label="Test Set")
plt.plot(test_years2, predictions2, color='red', label="Prediction")
plt.plot(train_years2, train2, label="Train Set")
plt.grid(axis='x')
plt.legend()
plt.show()

High, positive R squared shows that ARIMA model is a very good fit and that 94% of the variance is explained

In [None]:
# Ten-step recursive forecast of expenses: each forecast is appended to the history
# and the model is refitted on observed + forecast values for the next step.
X2 = exp.values
size2 = int(len(X2))
history2 = list(X2)
predictions2 = []
for step in range(10):
    model2 = ARIMA(history2, order=(2,1,0))
    model_fit2 = model2.fit(disp=0)
    yhat2 = model_fit2.forecast()[0]
    history2.append(yhat2)
    print('predicted=%f' % (yhat2))

plt.plot(history2)
# lengths of the extended series and of the observed series
print(len(history2))
print(len(X2))

In [None]:
# Years 1995..2028 inclusive; the length is shown as the cell output.
prediction_years=np.arange(1995, 2029)
len(prediction_years)

In [None]:
plt.figure(figsize=(8,3))

ax1=plt.subplot(111)

# observed + forecast expenses over prediction_years (1995..2028)
ax1.plot(prediction_years,history2, label="Total Industry Expenses")
# dashed red marker at 2018, the last observed year
ax1.vlines(x=2018, ymin=0, ymax=(300*10**11), ls='dashed', color='r')
ax1.set_ylim(min(history2), max(history2))

# straight-line fit through the observed years only
c = np.polyfit(Year, exp, 1)
print(c)
p = np.poly1d(c)
# Evaluate the fit over the full forecast horizon. The previous np.arange(1995,2025)
# ended the trend line at 2024 although the forecast curve runs to 2028.
px = np.arange(1995,2029)
py = p(px)
ax1.plot(px, py, lw=0.5, ls='--', label="Linear Fit of Initial Trend")

# same x limits as the revenue and passenger forecast plots
ax1.set_xlim(1994,2028)
ax1.grid(axis='x')
ax1.yaxis.set_major_formatter(formatter)
plt.legend()
plt.show()

### ARIMA for Profits

In [None]:
# Detrend profits by subtracting a 5-period rolling mean, then run the
# stationarity check on what is left.
prof = df["Total Profits"]
rolling_mean = prof.rolling(window=5).mean()
# Positions where the difference is NaN are dropped before testing.
prof_minus_mean = (prof - rolling_mean).dropna()

get_stationarity(prof_minus_mean)

In [None]:
# Autocorrelation of the profits series across lags.
pd.plotting.autocorrelation_plot(series=prof)

In [None]:
# Fit ARIMA(1,1,0) on the profits series and print the fit report.
# This section models profits, so the model is fitted on `prof`; the previous
# code passed `exp` here, which made the summary and the residual plots below
# describe the expenses series instead.
# NOTE(review): this cell uses order (1,1,0) while the walk-forward and
# forecast cells for profits fit order (2,1,0) - confirm which is intended.
model3 = ARIMA(prof, order=(1,1,0))
model_fit3 = model3.fit(disp=0)
print(model_fit3.summary())

In [None]:
# Residuals of model_fit3 as a one-column frame, drawn as a line plot.
residuals3 = pd.DataFrame(data=model_fit3.resid)
residuals3.plot.line()

In [None]:
# Kernel density estimate of the residuals' distribution.
residuals3.plot.kde()

In [None]:
# Walk-forward validation of ARIMA(2,1,0) on `prof`.
# Roughly the first 2/3 of the observations are the training set; every
# remaining observation is predicted one step ahead from a model refitted on
# everything seen so far, after which the true value joins the history.

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

X3 = prof.values
size3 = int(len(X3) * 0.66)
train3, test3 = X3[:size3], X3[size3:]
# Calendar years matching each split; the first observation is labelled 1995.
train_years3 = 1995 + np.arange(0, size3)
test_years3 = 1995 + np.arange(size3, len(X3))

history3 = list(train3)
predictions3 = []
for t in range(len(test3)):
    model3 = ARIMA(history3, order=(2,1,0))
    model_fit3 = model3.fit(disp=0)
    output3 = model_fit3.forecast()
    # The first item of the forecast result holds the one-step prediction; in
    # the legacy statsmodels API it is a length-1 array. Unwrap it to a plain
    # float so predictions3 is a flat list of numbers and '%f' formatting does
    # not rely on implicit array-to-scalar conversion.
    yhat3 = float(np.ravel(output3[0])[0])
    predictions3.append(yhat3)
    obs3 = test3[t]
    history3.append(obs3)
    print('predicted=%f, expected=%f' % (yhat3, obs3))

error3 = mean_squared_error(test3, predictions3)
print('Test MSE: %.3f' % error3)

r_sq3 = r2_score(test3, predictions3)
print('R squared: ', r_sq3)

# Train set, held-out test set and the one-step-ahead predictions on one axis.
plt.plot(test_years3, test3, label="Test Set")
plt.plot(test_years3, predictions3, color='red', label="Prediction")
plt.plot(train_years3, train3, label="Train Set")
plt.grid(axis='x')
plt.legend()
plt.show()

In [None]:
# Forecast 10 steps past the observed `prof` series. Each one-step prediction
# is appended to history3 and treated as an observation for the next refit,
# so history3 ends up holding the observed values followed by 10 forecasts.
X3 = prof.values
size3 = int(len(X3))
history3 = list(X3)
predictions3 = list()  # reset here; the forecasts go into history3, not this list
for t in range(10):
    model3 = ARIMA(history3, order=(2,1,0))
    model_fit3 = model3.fit(disp=0)
    output3 = model_fit3.forecast()
    # Unwrap the one-step forecast (a length-1 array in the legacy statsmodels
    # API) to a plain float. Appending the array itself would leave history3 a
    # mix of scalars and arrays, which is then fed back to ARIMA and plotted.
    yhat3 = float(np.ravel(output3[0])[0])
    history3.append(yhat3)
    print('predicted=%f' % (yhat3))

plt.plot(history3)
print(len(history3))
print(len(X3))

In [None]:
# Observed profits plus the appended forecast (history3) against calendar
# years, with a straight-line fit of the observed series for comparison.
plt.figure(figsize=(10,5))

ax1=plt.subplot(111)
px = np.arange(1995,2029)
ax1.plot(px,history3, label="10 Year Forecast")
# Dashed red marker at 2018. axvline spans the full axes height, so no
# hard-coded y-extent is needed.
ax1.axvline(x=2018, ls='dashed', color='r')
# Zero-profit reference line; its x-range is wider than the x-limits set below.
ax1.hlines(y=0, xmin=1990, xmax=2030, ls='dashed', lw=0.5)
ax1.set_ylim(min(history3), max(history3))
ax1.set_xlim(min(prediction_years), max(prediction_years))

# Degree-1 least-squares fit of prof against Year, evaluated over px.
c = np.polyfit(Year, prof, 1)
print(c)
p = np.poly1d(c)

py = p(px)
ax1.plot(px, py, lw=0.5, ls='--', label="Linear Fit of Initial Trend")

ax1.grid(axis='x')
ax1.yaxis.set_major_formatter(formatter)
plt.legend()
plt.show()

In [None]:
# Alternative profit estimate: element-wise difference between `history` and
# `history2` (the expenses series with its forecast appended).
# NOTE(review): `history` comes from an earlier cell - presumably the revenue
# series built the same way as history2; confirm both have the same length.
# Entries may be plain scalars or length-1 arrays (forecast outputs), so each
# one is unwrapped explicitly before building a flat float array.
prof_alternate = (
    np.array([np.ravel(value)[0] for value in history], dtype=float)
    - np.array([np.ravel(value)[0] for value in history2], dtype=float)
)
plt.plot(prof_alternate)

In [None]:
# Compare the directly forecast profits (history3) with the profits implied
# by the separate forecasts (prof_alternate), plus a linear fit of the
# observed profits.
plt.figure(figsize=(8,3))

ax1=plt.subplot(111)

ax1.plot(prediction_years,history3, label="Total Industry Profits")
# Index 23 of prediction_years is 2018, so only the segment from the 2018
# marker onward is drawn for the alternative estimate.
ax1.plot(prediction_years[23:], prof_alternate[23:], label='Profits calculated from revenue and expenses')
# Dashed red marker at 2018. axvline spans the full axes height, so no
# hard-coded y-extent is needed.
ax1.axvline(x=2018, ls='dashed', color='r')
# Zero-profit reference line; its x-range is wider than the x-limits set below.
ax1.hlines(y=0, xmin=1990, xmax=2030, ls='dashed', lw=0.5)
ax1.set_ylim(min(history3), max(history3))
ax1.set_xlim(min(prediction_years), max(prediction_years))

# Degree-1 least-squares fit of prof against Year.
c = np.polyfit(Year, prof, 1)
print(c)
p = np.poly1d(c)
px = np.arange(1995,2030)
py = p(px)
ax1.plot(px, py, lw=0.5, ls='--', label="Linear Fit of Initial Trend")

ax1.grid(axis='x')
ax1.yaxis.set_major_formatter(formatter)
plt.legend()
plt.show()