In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Preparação dos dados

In [None]:
# Locations of the competition's training and test CSV files.
train_path = '/kaggle/input/mini-flight-delay-prediction/flight_delays_train.csv'
test_path = '/kaggle/input/mini-flight-delay-prediction/flight_delays_test.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# NOTE(review): true division keeps a fractional part (e.g. 1934 -> 19.34).
# Presumably DepTime is an HHMM value; if a whole hour was intended this
# should be `// 100` -- confirm before changing, since it alters the feature.
df_train['DepHour'] = df_train['DepTime']/100
df_test['DepHour'] = df_test['DepTime']/100

labelenconder = LabelEncoder()

# Source column -> column that receives the integer codes.
# Only UniqueCarrier keeps its raw values and gets a separate encoded column.
encoded_columns = {
    'Month': 'Month',
    'DayofMonth': 'DayofMonth',
    'DayOfWeek': 'DayOfWeek',
    'UniqueCarrier': 'UniqueCarrier_ENC',
    'Dest': 'Dest',
    'Origin': 'Origin',
}

# Fit each encoder on the union of the train and test values so that a given
# category receives the SAME integer code in both frames. Fitting separately
# on each frame (as was done before) yields inconsistent codes whenever the
# two frames do not contain exactly the same set of categories, and it is the
# train-time codes that the models learn from.
for source_col, encoded_col in encoded_columns.items():
    labelenconder.fit(pd.concat([df_train[source_col], df_test[source_col]], ignore_index=True))
    df_train[encoded_col] = labelenconder.transform(df_train[source_col])
    df_test[encoded_col] = labelenconder.transform(df_test[source_col])

# Feature columns fed to every classifier.
colunas = ['Month', 'DayofMonth', 'DayOfWeek', 'DepHour', 'DepTime', 'Origin', 'Dest', 'UniqueCarrier_ENC']

X = df_train[colunas]
# Binary target: 'Y' -> 1, 'N' -> 0.
y = df_train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

# Hold out 20% of the training data for validation; seed fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 88)

# Display labels for the confusion-matrix plots.
labels = [0, 1]

#### Função para facilitar a visualização dos resultados

In [None]:
def plotConfusionMatrix(classifier,X_test,y_test,y_pred,class_names):
    """Print and plot the non-normalized confusion matrix for a classifier.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``; only used when
        ``y_pred`` is None.
    X_test : features of the evaluation set; only used when ``y_pred`` is None.
    y_test : true labels of the evaluation set.
    y_pred : predicted labels for the evaluation set. If None, predictions are
        computed with ``classifier.predict(X_test)``.
    class_names : labels shown on the axes of the plot.

    Returns
    -------
    None. The matrix is printed to stdout and the figure is shown.
    """
    # Imported locally so this function does not rely on
    # ``sklearn.metrics.plot_confusion_matrix``, which was deprecated in
    # scikit-learn 1.0 and removed in 1.2.
    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

    # Affects how numpy arrays are printed for the rest of the session.
    np.set_printoptions(precision=2)

    # Reuse the caller's predictions instead of predicting a second time.
    if y_pred is None:
        y_pred = classifier.predict(X_test)

    title = "Confusion matrix, without normalization"
    disp = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, y_pred),
        display_labels=class_names,
    )
    disp.plot(cmap=plt.cm.Blues)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

    plt.show()
    return

##### Nos classificadores a seguir, todos os hiperparâmetros foram escolhidos por random search. No caso da Decision Tree, os parâmetros padrão já produziam resultados dentro da margem de erro dos melhores parâmetros encontrados.

#### Decision Tree

In [None]:
# Seed the tree so that Restart & Run All reproduces the same model and metrics
# (same seed as the train/validation split).
decisiontree = DecisionTreeClassifier(random_state = 88)
decisiontree.fit(X_train, y_train)

# Predict on the hold-out split once and reuse the result for every report below.
dt_pred = decisiontree.predict(X_test)

y_true, y_pred = y_test, dt_pred
print(classification_report(y_true, y_pred))
plotConfusionMatrix(decisiontree,X_test,y_test,y_pred,labels)

#### Random Forest

In [None]:
# Seed the forest so that Restart & Run All reproduces the same model and metrics
# (same seed as the train/validation split).
randomforest = RandomForestClassifier(n_estimators = 100, max_features = 'log2', random_state = 88)
randomforest.fit(X_train, y_train)

# Predict on the hold-out split once and reuse the result for every report below.
rf_pred = randomforest.predict(X_test)

y_true, y_pred = y_test, rf_pred
print(classification_report(y_true, y_pred))
plotConfusionMatrix(randomforest,X_test,y_test,y_pred,labels)

#### Gradient Boosting

In [None]:
# Seed the booster so that Restart & Run All reproduces the same model and metrics
# (same seed as the train/validation split).
gbk = GradientBoostingClassifier(n_estimators = 300, max_depth = 7, learning_rate = 0.2, random_state = 88)
gbk.fit(X_train, y_train)

# Predict on the hold-out split once and reuse the result for every report below.
gb_pred = gbk.predict(X_test)

y_true, y_pred = y_test, gb_pred
print(classification_report(y_true, y_pred))
plotConfusionMatrix(gbk,X_test,y_test,y_pred,labels)

#### Aplicando todos os classificadores no conjunto de testes

In [None]:
# Score the competition test set with each fitted model.
# Note: this rebinds the *_pred names that previously held hold-out predictions.
test_features = df_test[colunas]
dt_pred, rf_pred, gb_pred = (
    model.predict(test_features)
    for model in (decisiontree, randomforest, gbk)
)

#### Combinando os resultados

In [None]:
# Majority vote: a row is labelled 'Y' when more than one of the three
# classifiers predicted the positive class (1), otherwise 'N'.
votes = dt_pred + rf_pred + gb_pred
resultados = np.where(votes > 1, 'Y', 'N').tolist()

# Re-read the raw test file and attach the voted label as a new column.
output = pd.read_csv(test_path)
output['dep_delayed_15min'] = resultados
output

In [None]:
# Write the frame with the voted labels to the working directory, without the index column.
submission_file = 'submission.csv'
output.to_csv(submission_file, index=False)