<a href="https://colab.research.google.com/github/shicong621/Colab/blob/main/Copy_of_Expedia_MLS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import datetime, warnings, scipy 

In [None]:
pip install xgboost
pip install scikit-learn

In [None]:
# Load the train and test sets; paths are relative to the working directory.
train = pd.read_csv("train.csv")
# pandas exposes `read_csv`; there is no `pd.test_csv` (it raised AttributeError).
test = pd.read_csv("test.csv")

In [None]:
# Print column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Derive calendar features from the arrival date.
# The `.dt` accessor only exists on datetime-typed Series and the read_csv call
# above does not parse dates, so convert explicitly first (this is a no-op when
# the column is already datetime-typed).
# NOTE(review): assumes 'arr_date' holds values pd.to_datetime can parse —
# confirm the column's format.
arr_dates = pd.to_datetime(train['arr_date'])
train['day_of_weeks_arr'] = arr_dates.dt.day_name()
train['Month'] = arr_dates.dt.month

In [None]:

# Function that converts an 'HHMM' value (int, float or string) to datetime.time
def format_heure(chaine):
    """Convert a clock value written as HHMM into a ``datetime.time``.

    Parameters
    ----------
    chaine : int, float, str or missing
        Time of day encoded as hours*100 + minutes (e.g. ``530`` or ``"0530"``
        for 05:30). ``2400`` is mapped to midnight (00:00).

    Returns
    -------
    datetime.time or float
        The parsed time, or ``np.nan`` when ``chaine`` is missing.

    Raises
    ------
    ValueError
        If the digits do not form a valid time (e.g. ``2560``) or the value
        cannot be converted with ``int()``.
    """
    if pd.isnull(chaine):
        return np.nan
    # Normalise to int *before* the 2400 check so the string "2400" is treated
    # like the number 2400 instead of reaching datetime.time(24, 0).
    hhmm = int(chaine)
    if hhmm == 2400:
        hhmm = 0
    texte = "{0:04d}".format(hhmm)
    return datetime.time(int(texte[0:2]), int(texte[2:4]))
#_____________________________________________________________________
# Function that combines a date and time to produce a datetime.datetime
def combine_date_heure(x):
    """Combine a (date, time) pair into a single ``datetime.datetime``.

    Parameters
    ----------
    x : sequence or pandas.Series
        Two-element container: first a date-like object accepted by
        ``datetime.datetime.combine``, then a ``datetime.time``.

    Returns
    -------
    datetime.datetime or float
        The combined timestamp, or ``np.nan`` if either element is missing.
    """
    # A Series whose labels are not 0 and 1 (e.g. a row produced by iterrows)
    # has to be read by position through .iloc: plain x[0] on such a Series is
    # a label lookup whose positional fallback is deprecated in pandas.
    if isinstance(x, pd.Series):
        jour, heure = x.iloc[0], x.iloc[1]
    else:
        jour, heure = x[0], x[1]
    if pd.isnull(jour) or pd.isnull(heure):
        return np.nan
    return datetime.datetime.combine(jour, heure)
#_______________________________________________________________________________
# Function that combines two columns of the dataframe to create a datetime format
def create_flight_time(df, col):
    """Build a Series of datetimes from ``df['DATE']`` and an HHMM column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'DATE' column and the column named by ``col``.
    col : str
        Name of the column holding the time of day encoded as HHMM.

    Returns
    -------
    pandas.Series
        One entry per row of ``df``, in row order, with a fresh 0..n-1 index
        (not ``df.index``). An entry is ``np.nan`` when the time value is
        missing; missing dates are handled by ``combine_date_heure``.
    """
    liste = []
    # Walk the two columns in parallel rather than using iterrows(): no per-row
    # Series is built and no positional cols[0]/cols[1] access on a
    # label-indexed Series is needed.
    for jour, hhmm in zip(df['DATE'], df[col]):
        if pd.isnull(hhmm):
            liste.append(np.nan)
        elif float(hhmm) == 2400:
            # 2400 is stored as 00:00 of the following day.
            lendemain = jour + datetime.timedelta(days=1)
            liste.append(combine_date_heure((lendemain, datetime.time(0, 0))))
        else:
            liste.append(combine_date_heure((jour, format_heure(hhmm))))
    return pd.Series(liste)


In [None]:
# NOTE(review): `df` is not assigned in any cell visible above this one (only
# `train` and `test` are), so on a fresh kernel this cell raises NameError —
# confirm which frame is meant.
# NOTE(review): create_flight_time reads df['DATE'], which no visible cell
# creates — confirm where that column comes from.
# Replace 'dep_date' with the datetimes built from df['DATE'] and 'dep_date'.
df['dep_date'] = create_flight_time(df, 'dep_date')
# Convert the remaining HHMM-style columns to datetime.time via format_heure.
df['DEPARTURE_TIME'] = df['DEPARTURE_TIME'].apply(format_heure)
df['arr_date'] = df['arr_date'].apply(format_heure)
df['ARRIVAL_TIME'] = df['ARRIVAL_TIME'].apply(format_heure)
#__________________________________________________________________________
# Preview the rows labelled up to and including 5 (.loc slicing is inclusive)
# for the time and delay columns.
df.loc[:5, ['dep_date', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME',
             'arr_date', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']]

In [None]:
# Columns to discard from df.
variables_to_remove = ['TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 
                       'MONTH','DAY','DAY_OF_WEEK','DATE', 'AIR_SYSTEM_DELAY',
                       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
                       'WEATHER_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
                       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'AIR_TIME']
# Drops in place; raises KeyError if any listed column is absent from df.
df.drop(variables_to_remove, axis = 1, inplace = True)
# Keep only these columns, in this order (df is rebound to a new frame).
# NOTE(review): this selection leaves out 'weather', 'dep_country' and
# 'arr_country', which the label-encoding cell further down reads from df —
# confirm the intended column set.
df = df[['ac_code', 'dep_stn', 'arr_stn',
        'dep_date', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
        'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
        'SCHEDULED_TIME', 'ELAPSED_TIME']]
# Show the first five rows.
df[:5]

In [None]:
# Missing-value report: one row per column of df, giving its null count and the
# percentage of rows that are populated (the "filling factor").
total_rows = len(df)
missing_df = (
    df.isnull()
      .sum(axis=0)
      .reset_index()
      .set_axis(['variable', 'missing values'], axis=1)
)
populated_rows = total_rows - missing_df['missing values']
missing_df['filling factor (%)'] = populated_rows / total_rows * 100
# Least-complete columns first, renumbered from 0.
missing_df.sort_values(by='filling factor (%)', ignore_index=True)

In [None]:
# Drop every row that has at least one missing value (modifies df in place).
# The index is not reset, so the remaining row labels can have gaps.
df.dropna(inplace = True)
# Alternative kept for reference: impute with column means instead of dropping.
# flights_data=flights_data.fillna(flights_data.mean())

In [None]:
from sklearn import preprocessing

# --- Label encoding ---
# Replace each listed categorical column with integer codes.
label_columns = ['weather', 'dep_country', 'arr_country','dep_stn', 'arr_stn']
label_encoder = preprocessing.LabelEncoder()
for col in label_columns:
  # fit_transform re-fits the shared encoder on every column, so after the loop
  # `label_encoder` only holds the mapping of the last column.
  df[col]= label_encoder.fit_transform(df[col])


# --- One-hot encoding ---
# List the columns to expand here; left empty, no one-hot columns are added.
one_hot_columns = []
oh_encoder = preprocessing.OneHotEncoder()
if one_hot_columns:
    # `df[one_hot_columns]` (a single list level) selects the columns as a
    # 2-D frame, which is what the encoder expects.
    enc_data = pd.DataFrame(
        oh_encoder.fit_transform(df[one_hot_columns]).toarray(),
        # join() aligns on the index, and df's index is not necessarily
        # 0..n-1 (e.g. after rows were dropped), so reuse it explicitly.
        index=df.index,
    )
else:
    # Nothing selected: skip fitting, since the encoder needs at least one column.
    enc_data = pd.DataFrame(index=df.index)
New_df=df.join(enc_data)



In [None]:
### New features
# airline_code -> first two characters of ac_code
df['airline_code'] = df['ac_code'].astype(str).str[:2]

# Parse the departure date once; a no-op when the column is already datetime.
dep_dt = pd.to_datetime(df['dep_date'])

### month
df['month'] = dep_dt.dt.month

### year
df['year'] = dep_dt.dt.year

### day name
df['weekday'] = dep_dt.dt.day_name()

### holiday?
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# Use midnight-normalised timestamps rather than `.dt.date` objects:
# cal.holidays() returns a DatetimeIndex, and plain python `date` objects do not
# match its Timestamps in isin(), which left every row flagged False.
dr = dep_dt.dt.normalize()
df['date'] = dr

cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

# True when the departure day is a US federal holiday.
df['holiday'] = df['date'].isin(holidays)

### segment into day and night? 


## Relation between the weather and delays

In [None]:
# Number of distinct values in train['weather'] (unique() keeps NaN as a value
# when present, so a missing category is counted too).
print("Weather type: {}".format(len(train['weather'].unique())))

In [None]:
def plot_bar(group, title):
    """Draw a bar chart with ``group.index`` on the x-axis and ``group.values`` as heights.

    Parameters
    ----------
    group : object exposing ``.index`` and ``.values`` (e.g. a pandas Series)
        Bar labels and bar heights.
    title : str
        Title placed above the axes.
    """
    _, axes = plt.subplots(figsize=(14, 6))
    sns.barplot(x=group.index, y=group.values, ax=axes)
    axes.set_title(title)
    # Tilt the category labels so long names do not overlap.
    plt.setp(axes.get_xticklabels(), rotation=45)
    plt.show()

In [None]:
# Row count per weather category; the title now names the variable actually
# plotted (it previously read "Month Wise").
plot_bar(train.value_counts('weather'), 'Travel Frequency Weather Wise')

In [None]:
# Strip plot of DEPARTURE_DELAY (x) for each 'weather' value (y).
# Palette of 14 named colours handed to seaborn.
colors = ['royalblue', 'grey', 'wheat', 'c', 'firebrick', 'seagreen', 'lightskyblue',
          'lightcoral', 'yellowgreen', 'gold', 'tomato', 'violet', 'aquamarine', 'chartreuse']
ax3 = sns.stripplot(y="weather", x="DEPARTURE_DELAY", size = 4, palette = colors,
                    data = train, linewidth = 0.5,  jitter=True)
plt.setp(ax3.get_xticklabels(), fontsize=14)
plt.setp(ax3.get_yticklabels(), fontsize=14)
# Relabel the x ticks as "<quotient>h<remainder>m" using divmod(tick, 60).
# NOTE(review): assumes DEPARTURE_DELAY is in minutes — TODO confirm.
# divmod floors, so a negative tick such as -15 is rendered as "-1h45m".
ax3.set_xticklabels(['{:2.0f}h{:2.0f}m'.format(*[int(y) for y in divmod(x,60)])
                         for x in ax3.get_xticks()])
plt.xlabel('Departure delay', fontsize=18, bbox={'facecolor':'midnightblue', 'pad':5},
           color='w', labelpad=20)
# Hide the y-axis label; the tick labels already name the categories.
ax3.yaxis.label.set_visible(False)
#________________________
plt.tight_layout(w_pad=3) 

## Relation between the date and delays

In [None]:
## Original author's note (translated): no binning is actually needed here.
# Bucket ARRIVAL_DELAY into ordinal classes:
#   3: > 60,  2: > 30,  1: > 15,  0: everything else
arrival_delay = train['ARRIVAL_DELAY']
# np.select takes the choice of the first condition that holds, mirroring the
# if/elif order; comparisons against NaN are False, so missing values get 0.
delay = np.select(
    [arrival_delay > 60, arrival_delay > 30, arrival_delay > 15],
    [3, 2, 1],
    default=0,
).tolist()
train['delay'] = delay

In [None]:
# Uses the plot_bar helper defined in the earlier plotting cell; the identical
# second definition is dropped so the function exists in one place only.
# NOTE(review): the only calendar columns created on `train` in this notebook
# are 'Month' and 'day_of_weeks_arr' — confirm that 'DAY', 'MONTH' and
# 'DAY_OF_WEEK' exist in train.csv.
plot_bar(train.value_counts('DAY'), 'Travel Frequency Day Wise')
plot_bar(train.value_counts('MONTH'), 'Travel Frequency Month Wise')
plot_bar(train.value_counts('DAY_OF_WEEK'), 'Travel Frequency Day-of-Week Wise')

## Correlation Plot

In [None]:
# Joint distribution of the SCHEDULED_ARRIVAL and ARRIVAL_TIME columns of train.
sns.jointplot(data=train, x="SCHEDULED_ARRIVAL", y="ARRIVAL_TIME")

In [None]:
# Joint plot of the 'delay' class (x) against 'AIRLINE' (y).
# NOTE(review): no visible cell creates an 'AIRLINE' column (the airline feature
# built earlier is df['airline_code']) — confirm it exists in train.csv.
sns.jointplot(data=train, y="AIRLINE", x="delay")

## Model

In [None]:
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [None]:
# Separate the target ('delay') from the features.
# NOTE(review): x_train keeps every other column of train, including
# 'ARRIVAL_DELAY', from which 'delay' was derived — this looks like target
# leakage; confirm the intended feature set.
y_train = train['delay']
x_train = train.drop(['delay'], axis=1)
# NOTE(review): 'delay' is only added to `train` in the visible cells; unless
# test.csv already contains that column, the next two lines raise KeyError.
x_test = test.drop(['delay'], axis=1)
y_test = test['delay']

In [None]:
regressor=xgb.XGBRegressor(eval_metric='mape')

from sklearn.model_selection import GridSearchCV
# Search grid: 3 depths x 7 tree counts (50..350) x 2 learning rates = 42 candidates.
param_grid = {"max_depth":    [4, 5, 6],
              "n_estimators": range(50, 400, 50),
              "learning_rate": [0.01, 0.015]}

# Exhaustive 5-fold search over every combination above.
# `scoring` is set explicitly: without it GridSearchCV ranks candidates by the
# estimator's default score (R^2 for regressors), and the `eval_metric` given
# to XGBRegressor plays no part in the ranking. Negated MAPE matches the metric
# this notebook reports for the final model.
search = GridSearchCV(regressor, param_grid, cv=5,
                      scoring="neg_mean_absolute_percentage_error").fit(x_train, y_train)

print("The best hyperparameters are ",search.best_params_)

In [None]:
# Re-train a fresh regressor on the full training set with the hyper-parameters
# the grid search selected, then predict on the test features.
best_params = search.best_params_
regressor = xgb.XGBRegressor(
    eval_metric='mape',
    max_depth=best_params["max_depth"],
    n_estimators=best_params["n_estimators"],
    learning_rate=best_params["learning_rate"],
)
regressor.fit(x_train, y_train)

predictions = regressor.predict(x_test)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
# Mean absolute percentage error of the predictions against y_test.
# NOTE(review): MAPE divides by |y_true|, and the 'delay' labels built earlier
# include 0, so those rows can dominate the score — confirm this metric suits
# the binned target.
mape = mean_absolute_percentage_error(y_test, predictions)
print("The score is %.5f" % mape)

## Feature importance

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
# Global style and font-size changes: they apply to every figure drawn after
# this cell, not only to the one below.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 16})

# Plot the importance of at most 8 features of the fitted regressor.
fig, ax = plt.subplots(figsize=(12,6))
plot_importance(regressor, max_num_features=8, ax=ax)
plt.show();

In [None]:
# Write one row per prediction, keyed by the test frame's index, to a CSV file
# in the working directory (an existing file with this name is overwritten).
# NOTE(review): `predictions` are regression outputs for the binned 'delay'
# class while the column is named "DelayTime" — confirm this is what the
# submission format expects.
output = pd.DataFrame({"Id":test.index, "DelayTime":predictions})
output.to_csv('sample_submission.csv', index=False)