In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import pandas_profiling

In [None]:


df = pd.read_table('Airfares.txt', delim_whitespace=True,header=None,names=('City1','City2','Average Fare','Distance','Average weekly passengers','market leading airline','market share','Average_fare','Low price airline','market share','price'))

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

We totally have 1000 rows and 11 columns

# Descriptive Statistics

In [None]:
df.describe().T

# Checking for Null Values

In [None]:
df.isnull().sum()


## Exploratory Data Analysis

In [None]:
##Top five cities

df.City1.value_counts()[0:5]

In [None]:
##Top five cities

df.City2.value_counts()[0:5]

In [None]:
sns.distplot( df['Average Fare']);

In [None]:
box = plt.boxplot(df['Average Fare']);

In [None]:
[item.get_ydata()[0] for item in box['caps']]

In [None]:
[item.get_ydata()[0] for item in box['whiskers']]

In [None]:
[item.get_ydata()[0] for item in box['medians']]

In [None]:
## Cities with average Fare above Maximum (Q3+1.5IQR)

df.loc[df['Average Fare'] > 299.31,['City1','City2']]

In [None]:
sns.distplot( df['Distance']);

  > Most of the distances lie between 500-1000

In [None]:
sns.distplot( df['Average weekly passengers']);

In [None]:
df['market leading airline'].value_counts()

In [None]:
sns.countplot(x="market leading airline",data=df,order = df['market leading airline'].value_counts().index)

## Inference :
    The top three Airlines are WN,DL and AA

In [None]:
sns.distplot( df['Average fare']);

In [None]:
df.info()

In [None]:
df['Low price airline'].value_counts()

In [None]:
sns.countplot(x="Low price airline",data=df,order = df['Low price airline'].value_counts().index)

In [None]:
box = plt.boxplot(df['price']);

In [None]:
sns.regplot( x = 'Average Fare',
y = 'Distance',
data =df );

In [None]:
numeric = df.select_dtypes(include=['float64', 'int64'])

In [None]:
numeric[0:10]

In [None]:
categorical = df.select_dtypes(include=['object'])

In [None]:
categorical[0:5]

In [None]:
# correlation matrix
cor = numeric.corr()
cor

In [None]:
# plotting correlations on a heatmap

# figure size
plt.figure(figsize=(16,8))

# heatmap
sns.heatmap(cor, cmap="YlGnBu", annot=True)
plt.show()

 >> From the above we can infer that Average fare has a good correlation with Distance

In [None]:
sns.pairplot(numeric, size=2)

# Checking the assumptions

In [None]:
df.info()

# Handling the object Variable

In [None]:
df_categorical=df[['City1','City2','market leading airline','Low price airline']]

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(le.fit_transform)
df_categorical.head()

In [None]:
# concat df_categorical with original df
df = df.drop(df_categorical.columns, axis=1)
df = pd.concat([df, df_categorical], axis=1)
df.head()

In [None]:
df.info()

# Define X and y variables

In [None]:
X =df.drop('Average Fare',axis=1)
y=df['Average Fare']

In [None]:
#Creating train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.7,
                                                    test_size = 0.3, random_state=100)

In [90]:
import statsmodels.api as sm 
lm = sm.OLS(y, X).fit() 

In [91]:
print(lm.summary())

                                 OLS Regression Results                                
Dep. Variable:           Average Fare   R-squared (uncentered):                   0.997
Model:                            OLS   Adj. R-squared (uncentered):              0.997
Method:                 Least Squares   F-statistic:                          2.584e+04
Date:                Mon, 14 Oct 2019   Prob (F-statistic):                        0.00
Time:                        12:50:45   Log-Likelihood:                         -2526.9
No. Observations:                 700   AIC:                                      5074.
Df Residuals:                     690   BIC:                                      5119.
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                                coef    std err          t      P>|t|      [0.025      0.975]
--------------------------

## Inference

   >Auto corrrelation - Is present as Durbin-Watson is 1.75  no serial correlation
   Explanantion : When the test statistic equals 2 indicates no serial correlation. This statistic will always be between 0 and 4. The closer to 0 the statistic, the more evidence for positive serial correlation. The closer to 4, the more evidence for negative serial correlation
    >>The null hypothesis for the JB test is that the data is normally distributed; the alternate hypothesis is that the data does not come from a normal distribution.We reject Null hypothesis
     >>>No warnings related to Multicollinearity

# Check for  linearity- Rainbow Test
The Null hypothesis is that the regression is correctly modelled as linear. The alternative for which the power might be large are convex

In [92]:
sm.stats.diagnostic.linear_rainbow(lm, frac=0.5)

(1.038877274156965, 0.3619102487837604)

Rainbow test for linearity

The null hypothesis is the fit of the model using full sample is the same as using a central subset. The alternative is that the fits are difference. The rainbow test has power against many different forms of nonlinearity.

We fail to  reject Null Hypothesis


# Test for Multicollinearity 

In [93]:
# VIF - Variance Inflation Factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["VIF Values"]=[variance_inflation_factor(X.values, col) 
                   for col in range(0, X.shape[1])]

In [94]:
ce = ['City1', 'City2', 'market leading airline', 'Low price airline',
       'Distance', 'Average weekly passengers', 'market share', 'Average fare',
       'market share.1', 'price']
vif["Feature Names"] = ce

In [95]:
vif

Unnamed: 0,VIF Values,Feature Names
0,7.409693,City1
1,1.713743,City2
2,10.506674,market leading airline
3,38.405571,Low price airline
4,4.393042,Distance
5,40.379261,Average weekly passengers
6,3.645095,market share
7,6.487487,Average fare
8,3.65096,market share.1
9,4.253537,price


>Multicollinearity is present 
As the P- value of Average weekly passengers is greater than 0.05 and the VIF value is 40.37 we can drop the column

# Test of Heteroscedasticity - Goldfend Quandt test

1. Null Hypothesis - The Data is HOMOSCEDASTIC (Uniform Variance)
2. Alternate Hypothesis - Data is HETEROSCEDASTIC( Unequal Variance)

In [96]:
from statsmodels.stats.api import linear_rainbow, het_goldfeldquandt
het_goldfeldquandt(y, X)

(1.0705592256666911, 0.2253738232667984, 'increasing')

* We fail to reject null hypothesis
* The Data is HOMOSCEDASTIC

# Standardization  with z score

In [97]:
numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Average Fare                 1000 non-null float64
Distance                     1000 non-null int64
Average weekly passengers    1000 non-null float64
market share                 1000 non-null float64
Average fare                 1000 non-null float64
market share.1               1000 non-null float64
price                        1000 non-null float64
dtypes: float64(6), int64(1)
memory usage: 54.8 KB


In [98]:
from scipy.stats import zscore

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
Average Fare                 1000 non-null float64
Distance                     1000 non-null int64
Average weekly passengers    1000 non-null float64
market share                 1000 non-null float64
Average fare                 1000 non-null float64
market share.1               1000 non-null float64
price                        1000 non-null float64
City1                        1000 non-null int32
City2                        1000 non-null int32
market leading airline       1000 non-null int32
Low price airline            1000 non-null int32
dtypes: float64(6), int32(4), int64(1)
memory usage: 70.4 KB


In [100]:
numeric.drop(['Average Fare','Average weekly passengers'],axis=1,inplace=True)

In [101]:
numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
Distance          1000 non-null int64
market share      1000 non-null float64
Average fare      1000 non-null float64
market share.1    1000 non-null float64
price             1000 non-null float64
dtypes: float64(4), int64(1)
memory usage: 39.1 KB


In [102]:
numeric=numeric.apply(zscore)

In [103]:
numeric.head()

Unnamed: 0,Distance,market share,Average fare,market share.1,price
0,-0.822816,0.519752,-0.888051,1.315878,-0.681341
1,-0.306391,0.773393,-0.695488,-0.63956,-0.513776
2,-0.318835,0.969177,0.915432,-1.173465,0.506866
3,-1.196135,1.903153,-1.561383,2.304674,-1.574668
4,-0.519494,-1.050651,-0.084427,-0.709344,0.047175


In [104]:
df = df.drop(['Distance','Average weekly passengers','market share','Average fare','market share.1','price'], axis=1)

In [105]:
df.head()

Unnamed: 0,Average Fare,City1,City2,market leading airline,Low price airline
0,114.47,16,0,6,8
1,122.47,16,40,6,6
2,214.42,2,0,4,5
3,69.4,2,7,14,17
4,158.13,2,52,12,17


In [106]:
#concat numeric with original df
df = pd.concat([df,numeric], axis=1)

In [107]:
df.head()

Unnamed: 0,Average Fare,City1,City2,market leading airline,Low price airline,Distance,market share,Average fare,market share.1,price
0,114.47,16,0,6,8,-0.822816,0.519752,-0.888051,1.315878,-0.681341
1,122.47,16,40,6,6,-0.306391,0.773393,-0.695488,-0.63956,-0.513776
2,214.42,2,0,4,5,-0.318835,0.969177,0.915432,-1.173465,0.506866
3,69.4,2,7,14,17,-1.196135,1.903153,-1.561383,2.304674,-1.574668
4,158.13,2,52,12,17,-0.519494,-1.050651,-0.084427,-0.709344,0.047175


In [108]:
df.shape

(1000, 10)

# Building the Linear regression model

In [109]:
X =df.drop('Average Fare',axis=1)
y=df['Average Fare']

In [110]:
#Creating train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.7,
                                                    test_size = 0.3, random_state=100)

In [111]:
lm = LinearRegression() # created the machine learning algo

In [112]:
# use the lm to train my dataset
lm.fit(X_train, y_train) # Model is fitted on the training data

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [113]:
# predict 
y_pred = lm.predict(X_test)

# metrics
from sklearn.metrics import r2_score

print(r2_score(y_true=y_test, y_pred=y_pred))



0.9757773730636001


In [114]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
print(mean_absolute_error(y_test, y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))

6.242952929484397
8.733923283788839


# Building an OLS

In [115]:
# Adding a Constant - Purpose | Helps in Calculation of Intercept
X = sm.add_constant(X)
# Building the Model
ins_model = sm.OLS(y,X).fit() # Fitting the Model on the Data
ins_pred = ins_model.predict(X) # Making Prediction on X values 

  return ptp(axis=axis, out=out, **kwargs)


In [116]:
# R Square Values
print(ins_model.rsquared)
print(ins_model.rsquared_adj)

0.9749116946944095
0.9746836191916314


In [117]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
print(mean_absolute_error(y, ins_pred))
print(np.sqrt(mean_squared_error(y, ins_pred)))

6.228433125900148
8.765110595448302


# Objective - Enhance the prediction power of the model

# Polynomial 

In [118]:
from sklearn.preprocessing import PolynomialFeatures

In [119]:
pf = PolynomialFeatures()

In [120]:
# Transformed the Input Variabes for a better modelling
X = pf.fit_transform(X)

In [121]:
# Building the Model
ins_model = sm.OLS(y,X).fit() # Fitting the Model on the Data
ins_pred = ins_model.predict(X) # Making Prediction on X values 
st_residual = ins_model.get_influence().resid_studentized_internal

In [122]:
# Find the Rsquare
print(ins_model.rsquared)
print(ins_model.rsquared_adj)

0.9887412538159424
0.988097896891139


In [123]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
print(mean_absolute_error(y, ins_pred))
print(np.sqrt(mean_squared_error(y, ins_pred)))

3.8537959248222857
5.871739116841181


#  Random Forest and  Bagging - Ensemble

In [124]:

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

In [125]:

rf = RandomForestRegressor()
bag = BaggingRegressor()

# Random Forest

In [128]:
# Random Forest
rf.fit(X,y)
pred_rf = rf.predict(X)
print(mean_absolute_error(y, pred_rf))
print(np.sqrt(mean_squared_error(y, pred_rf)))

2.1304750000000006
3.4128579084104866


# Bagging

In [129]:
# Bagging
bag.fit(X,y)
pred_bag = bag.predict(X)
print(mean_absolute_error(y, pred_bag))
print(np.sqrt(mean_squared_error(y, pred_bag)))

2.094168000000001
3.407420518515437


# Inference



The best model in terms of Mean Absolute error is as follows: Bagging-->Random forest-->Polynomial--->Linear

The most important feature for predicting Average Fare is Distance, Price which can be infered from the Pair Plot