In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('project').getOrCreate()

In [3]:
airline_df = spark.read.csv('/FileStore/tables/2008_csv-db05f.bz2', inferSchema=True, header=True)

In [4]:
airline_df.show()

In [5]:
airline_df_c = airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode')

In [6]:
airline_df_c=airline_df_c.groupBy('CancellationCode', 'Origin', 'Dest').count().sort('count')

In [7]:
airline_df_c.show()

In [8]:
#Run correlation of cancellation column with other columns

In [9]:
airline_df_c1=airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode').groupBy('CancellationCode').count().sort('count')

In [10]:
airline_df_c1.show()

In [11]:
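#As seen above, the majority of rows do not have a reason code attached. Note that the counts above cover all flights, not only cancelled ones, so filtering on Cancelled == 1 is needed to confirm how many cancellations actually lack a code. Where the code is missing, it is difficult to know the reason for the cancellation, so we have to infer it from various other features in the dataset.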

In [12]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode').groupBy('Origin').count().sort('count', ascending=False).show()

In [13]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode').groupBy('Dest').count().sort('count', ascending=False).show()

In [14]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode').groupBy('Origin', 'Cancelled').sum('Cancelled').sort('sum(cancelled)', ascending=False).show()

In [15]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode').groupBy('UniqueCarrier', 'Cancelled').sum('Cancelled').sort('sum(cancelled)', ascending=False).show()

In [16]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode').groupBy('Dest', 'Cancelled').sum('Cancelled').sort('sum(cancelled)', ascending=False).show()

In [17]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode', 'DayofWeek', 'DepTime').groupBy('DayofWeek', 'Cancelled').sum('Cancelled').sort('sum(cancelled)', ascending=False).show()

In [18]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode', 'DayofWeek', 'DepTime').groupBy('DepTime', 'Cancelled').sum('Cancelled').sort('sum(cancelled)', ascending=False).show()  

In [19]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode', 'DayofWeek', 'DepTime','Month', 'DayofMonth').groupBy('Month', 'Cancelled').sum('Cancelled').sort('sum(cancelled)', ascending=False).show()  

In [20]:
airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode', 'DayofWeek', 'DepTime','Month', 'DayofMonth').groupBy('DayofMonth', 'Cancelled').sum('Cancelled').sort('sum(cancelled)', ascending=False).show()  

In [21]:
#Hypotheses:
#1. Cancellations happened due to factors attributed to an airport
#2. Cancellations happened due to factors attributed to an airline
#3. Cancellations happened due to factors attributed to (TODO: complete this hypothesis)

In [22]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml import pipeline
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import TrainValidationSplit

In [23]:
airline_df_train, airline_df_test = airline_df.select('UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode', 'DayofWeek', 'DepTime','Month', 'DayofMonth').randomSplit([0.7,0.3], seed=100)

In [24]:
assembler = VectorAssembler(inputCols=['UniqueCarrier', 'Origin', 'Dest', 'Cancelled', 'CancellationCode', 'DayofWeek', 'DepTime','Month', 'DayofMonth'], outputCol='features')

In [25]:
airline_df_train_v=assembler.transform(airline_df_train)

In [26]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [27]:
lrModel = lr.fit(airline_df_train)

In [28]:
#https://spark.apache.org/docs/2.1.1/ml-classification-regression.html#binomial-logistic-regression
#https://docs.pymc.io/notebooks/GLM-negative-binomial-regression.html
#https://data.library.virginia.edu/getting-started-with-negative-binomial-regression-modeling/
#https://docs.pymc.io/history.html
#https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4
#https://stackoverflow.com/questions/46710934/pyspark-sql-utils-illegalargumentexception-ufield-features-does-not-exist/46729342