Field descriptions for the source dataset: https://www.transtats.bts.gov/Fields.asp?Table_ID=236

In [1]:
from __future__ import division #, print_function # Imports from __future__ since we're running Python 2

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
# `%reload_ext` is used instead of `%load_ext` (kept below, commented out) so the
# cell can be re-run when the extension is already loaded.
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [3]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
random_state = 0
%matplotlib inline
plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from datetime import datetime
from sklearn.ensemble import IsolationForest
from helpers.feature_engineering import dateStrToDayYear, getMappingOfSimilarCategoricalColumns, \
    compareSimilarCategoricalColumns 
from helpers.my_one_hot_encoder import MyOneHotEncoder
from helpers.py_helpers import is_number
from scipy.stats import skew, kurtosis
from helpers.outliers import MyOutliers
from sklearn.preprocessing import StandardScaler
from helpers.plot_helper import scatter_2d_label
from sklearn.decomposition import PCA # Import the PCA module
from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from helpers.performance_issues import subsample_keeping_class_proportions
from flights_delay.feature_processing import FlightDelayFeatureProcessing

In [4]:
# Seeded NumPy random generator, so anything drawing from `rng` is reproducible.
rng = np.random.RandomState(random_state)

# Flying to New York City - Use all training data

In [5]:
# Name of the label column; it is split off from the feature matrix further down.
target_col = "IS_DELAYED"

In [6]:
# Resolve the training CSV inside `../Data` (relative to the current working
# directory) and fail early if the file is not there.
path_data = os.path.realpath(
    os.path.join(os.getcwd(), '../Data', 'train_data.csv')
)
assert os.path.isfile(path_data)
path_data

'/home/student/pligor.george@gmail.com/msc_Artificial_Intelligence/dme_Data_Mining/dmedatarats/Data/train_data.csv'

In [7]:
# Load the raw training set; row 0 of the comma-separated file holds the column names.
df = pd.read_csv(path_data, sep=',', header=0)
df.shape

(433495, 39)

In [8]:
# Apply FlightDelayFeatureProcessing.process_all to the raw frame; the shape
# displayed below shows how many columns the processed frame ends up with.
feature_processor = FlightDelayFeatureProcessing()
df_proc = feature_processor.process_all(df)
df_proc.shape

(433495, 539)

In [9]:
# Feature matrix: every processed column except the target.
XX = df_proc.drop([target_col], axis=1)
XX.shape

(433495, 538)

In [10]:
# Target vector, taken from the same processed frame the features come from.
yy = df_proc.loc[:, target_col]

#### As a first approach we do NOT remove the outliers; instead we let the isolation forest discover them

In [11]:
# Columns for which outlier boundaries are computed.
outlier_cols = np.array(('DEP_DELAY', 'DISTANCE'))

In [12]:
# Compute the outlier boundaries of every column in `outlier_cols` and collect
# them in a Series indexed by column name. Each entry is whatever
# MyOutliers.getLooseBoundaries returns for k=3.0 -- judging by the displayed
# output, a (lower, upper) pair.
# The Series is built in one go rather than by overwriting the entries of a
# `dtypes` Series, which relied on element-wise assignment of tuples into a
# Series that was created to hold dtypes.
bounds = pd.Series(
    [MyOutliers().getLooseBoundaries(XX[col], k=3.0) for col in outlier_cols],
    index=outlier_cols,
    dtype=object,  # entries are tuples, so keep them as plain Python objects
)
# `types` previously ended up with the same content as `bounds`; keep the name
# defined in case another cell still reads it.
types = bounds.copy()

bounds

DEP_DELAY    (-114.914854646, 140.381149451)
DISTANCE     (-2701.60422982, 4813.45740887)
dtype: object

In [13]:
# Per-column outlier counts for XX, as computed by
# MyOutliers.countOutliersDataPoints from the boundaries in `bounds`
# (presumably the points falling outside each column's boundaries --
# confirm against helpers.outliers).
outlierCounter = MyOutliers.countOutliersDataPoints(XX, bounds)
outlierCounter

DEP_DELAY    9993
DISTANCE      659
dtype: object

Only a relatively small number of instances are flagged as outliers.

In [14]:
# Report the share of rows flagged as outliers, in percent. The per-column
# counts are summed, so a row lying outside the boundaries of both columns is
# presumably counted twice -- confirm against countOutliersDataPoints.
# The argument is parenthesised so the line is valid both as a Python 2 print
# statement (printing the same text as before) and as a Python 3 print() call.
print("in fact only the {} % is being held as outliers".format(
    100 * np.sum(outlierCounter) / len(XX)))

in fact only the 2.45723710769 % is being held as outliers


#### Normalize the columns that are not one-hot encoded

In [15]:
# Collect, in column order, the names whose final character is rejected by
# `is_number`; names with a numeric suffix are presumably the one-hot encoded
# columns and are skipped.
not_hot_cols = []
for col in XX.columns:
    if is_number(col[-1]):
        continue
    not_hot_cols.append(col)
not_hot_cols

['QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'FL_NUM',
 'DEP_TIME',
 'DEP_DELAY',
 'DEP_DELAY_GROUP',
 'DISTANCE',
 'DISTANCE_GROUP',
 'YDAY']

In [16]:
# Work on a copy so the unscaled feature matrix XX stays untouched.
Xss = XX.copy(deep=True)

In [17]:
# Standardise only the columns in `not_hot_cols`; the scaler is fitted on, and
# applied to, the unscaled values taken from XX.
scaler = StandardScaler()
Xss[not_hot_cols] = scaler.fit_transform(XX[not_hot_cols])

#### save

In [18]:
# Put the target back next to the scaled features, as the last column.
full_data = pd.concat([Xss, yy], axis=1)
full_data.shape

(433495, 539)

In [19]:
# Write the scaled features plus target to `../Data` (relative to the current
# working directory) as CSV, leaving the index out of the file.
full_data.to_csv(
    os.path.realpath(
        os.path.join(os.getcwd(), '../Data', 'train_data_numerical_normalized.csv')
    ),
    index=False,
)