In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import networkx as nx
# import clubear as cb
import chardet
import warnings
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import glob

warnings.filterwarnings('ignore') # 在整个Python脚本执行期间忽略所有的警告。

'''
Step 1: Load and Combine Data
Adjust the selected_years list to include the years the user wishes to analyse.
对代码增加功能改进：
（1）一次性流式读取文件夹中的所有年份，将数据规模扩展至更成熟的大规模数据处理
（2）多种编码格式的支持（utf8，iso等等）
（3）对于无法识别的编码格式，使用chardet库来自动检测编码格式
(4)适当改变object数据类型的格式尝试解决df数据集规模过大导致的内存不足问题（解决完成）
'''
def load_flight_data(years, path_folder, encodings):
    """Load the per-year flight CSV files and concatenate them.

    For every year, ``<path_folder>/<year>.csv`` is read with the first
    encoding in ``encodings`` that pandas accepts. If none of them works, the
    encoding is guessed with chardet and the read is attempted once more with
    the guess. Years whose file is missing or cannot be read are skipped with
    a printed message instead of aborting the whole load.

    Args:
        years: Iterable of years; each one is formatted into the file name.
        path_folder: Directory that contains the ``<year>.csv`` files.
        encodings: Encoding names to try, in order.

    Returns:
        A DataFrame with all loaded files concatenated under a fresh index,
        or an empty DataFrame when no file could be loaded.
    """
    # chardet only needs a sample to guess; feeding it a complete multi-year
    # data file would be slow and would hold the whole file in memory.
    detection_sample_bytes = 1024 * 1024

    data_frames = []
    for year in years:
        file_path_data = os.path.join(path_folder, f"{year}.csv")
        if not os.path.exists(file_path_data):
            print(f"Warning: File for year {year} not found.")
            continue

        frame = None
        for encoding in encodings:
            try:
                frame = pd.read_csv(file_path_data, encoding=encoding)
                break
            except Exception as e:
                # Deliberately broad: any failure (decoding, unknown codec,
                # parsing) just means "try the next encoding".
                print(f"Warning: Failed to load file for year {year} with encoding {encoding}. Error: {e}")

        if frame is None:
            # None of the supplied encodings worked: detect the encoding and
            # actually use it, instead of only reporting it.
            with open(file_path_data, 'rb') as f:
                detected_encoding = chardet.detect(f.read(detection_sample_bytes))['encoding']
            print(f"Error: File for year {year} could not be loaded with provided encodings. Detected encoding: {detected_encoding}")
            if detected_encoding:
                try:
                    frame = pd.read_csv(file_path_data, encoding=detected_encoding)
                except Exception as e:
                    print(f"Warning: Failed to load file for year {year} with encoding {detected_encoding}. Error: {e}")

        if frame is not None:
            data_frames.append(frame)
            print("Year {} loaded successfully.".format(year))

    if not data_frames:
        # Return an empty DataFrame if no data is loaded.
        print("No data loaded.")
        return pd.DataFrame()
    return pd.concat(data_frames, ignore_index=True)

'''
修改部分1：将文件修改部署到文件夹的全局文件
'''
# file_paths = glob.glob('/path/to/csv/files/*.csv')  # 使用glob来获取所有CSV文件的路径
# datacombined = [pd.read_csv(file) for file in file_paths]
#读取和合并所有年份文件

# 合并所有CSV文件的数据
# data = pd.concat(data_frames, ignore_index=True)

# Choose the years you want to load.
# The yearly CSV files are looked up in ./MLDataset, relative to the current
# working directory.
current_folder=os.getcwd()
source_foder_init='MLDataset'
source_foder_path=os.path.join(current_folder,source_foder_init)

selected_years = [2004,2005]  # It can be replaced with the desired years.

# Optional improvement: derive the years from every CSV file in the folder
# instead of listing them by hand.
# for filename in os.listdir(source_foder_path):
#     if filename.endswith('.csv'):
#         year = filename.split('.')[0]
#         selected_years.append(int(year))
#         print("Year {} added to the list.".format(year))

print("Selected years: ", selected_years)

# Load the data for the selected years.
# Encodings are tried in order and the first one that decodes wins. 'utf-8'
# must come before 'ISO-8859-1': ISO-8859-1 maps every possible byte to a
# character, so it never fails, and with it in first place UTF-8 files were
# silently mis-decoded and no other entry was ever tried. The entries after
# 'ISO-8859-1' are kept from the original list but stay unreachable for the
# same reason.
encodings=['utf-8','ISO-8859-1','GBK','cp1252','ISO-8859-2','utf-16','utf-32',]
combined_flight_data = load_flight_data(selected_years,source_foder_path,encodings)


# Basic data exploration
print(combined_flight_data.shape)
combined_flight_data.info()


In [None]:


'''
Step 2: Clean Data 此处根据清洗过后的数据集来分析
'''
# Remove duplicate rows (in place). This may turn out to be unnecessary if
# this kind of flight data rarely contains exact duplicates.
combined_flight_data.drop_duplicates(inplace=True)

# Handle missing values with the simplest possible strategy: every missing
# value in the whole data set becomes 0.
# NOTE(review): this also turns missing times and codes into 0, which later
# steps cannot tell apart from a real 0 — confirm this is intended.
cleaned_combined_flight_data = combined_flight_data.fillna(0)

# Replace the airline codes in the 'UniqueCarrier' column with full names.
# NOTE(review): some of the full names look questionable (e.g. the spelling
# 'Indepedence', and the entries for 'TZ', 'FL', 'OH' and 'US') — verify them
# against the carrier lookup table. They are left unchanged here because they
# end up as values in the exported CSV.
airline_names = {
    'UA': 'United Airlines', 'US': 'United States Airways', 'WN': 'Southwest Airlines',
    'NW': 'Northwest Airlines', 'OH': 'PSA Airlines', 'OO': 'SkyWest Airlines',
    'XE': 'Expressjet Airlines', 'TZ': 'Air Tazania Airlines', 'DL': 'Delta Airlines',
    'EV': 'Atlantic Southeast Airlines', 'FL': 'Florida Airlines', 'HA': 'Hawaiian Airlines',
    'HP': 'America West Airlines', 'MQ': 'Envoy Airlines', 'AA': 'American Airlines',
    'AS': 'Alaska Airlines', 'B6': 'JetBlue Airways', 'CO': 'Continental Airlines',
    'DH': 'Indepedence Airlines', 'F9': 'Frontier Airlines'
}
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: the chained in-place form operates on a temporary object
# once pandas Copy-on-Write is active and then leaves the DataFrame untouched.
cleaned_combined_flight_data['UniqueCarrier'] = cleaned_combined_flight_data['UniqueCarrier'].replace(airline_names)

# Export cleaned data to CSV
cleaned_combined_flight_data.to_csv('cleaned_combined_flight_data.csv', index=False)


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import networkx as nx
import chardet
import warnings
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

'''
Step 3: Feature Engineering
'''
# Create new features
# Bucket the departure time into the hour of day it falls in and store the
# hour label ("00:00" ... "23:00") in a new '24HoursTime' column.
# NOTE(review): the bin edges assume 'DepTime' is encoded as hhmm (0-2400) —
# confirm against the data dictionary.

cleaned_combined_flight_data=pd.read_csv('cleaned_combined_flight_data.csv')


cleaned_combined_flight_data['DepTime'] = cleaned_combined_flight_data['DepTime'].astype(float)

# pd.cut needs one more edge than labels: the 25 edges delimit the 24 hourly
# buckets and each label names the bucket between two neighbouring edges.
# The last edge is 2401 so that a value of exactly 2400 still lands in the
# final bucket now that the buckets are left-closed.
bins = list(range(0, 2400, 100)) + [2401]
labels = [f"{hour:02d}:00" for hour in range(24)]

# right=False makes every bucket [start, end): a departure at exactly 01:00
# (100) is labelled "01:00". With the default right-closed buckets it was
# counted in the previous hour ("00:00"), and likewise for every full hour.
cleaned_combined_flight_data['24HoursTime'] = pd.cut(cleaned_combined_flight_data['DepTime'], bins=bins, labels=labels, right=False)

In [2]:

'''
Step 4: Train Machine Learning Models
改进：（1）修正了这里分类数据独热化，再选取独热化的行作为特征的流程，使其能正确运行
'''
# Prepare data for training: choose the target column and treat every other
# column as a candidate feature.
print(cleaned_combined_flight_data.columns.tolist())

# The earlier attempt to one-hot encode at this point was removed. It looked
# up columns named 'UniqueCarrier_*', 'Origin_*' and 'Dest_*', which only
# exist AFTER encoding, so the list was always empty and
# pd.get_dummies(..., columns=[]) merely copied the whole frame without
# changing any column. The categorical columns ('UniqueCarrier', 'Origin',
# 'Dest', ...) therefore remain raw here and still have to be one-hot encoded
# before they are passed to a model.

target = 'DepDelay'

# Features are selected by exclusion: everything except the target.
# NOTE(review): this keeps columns such as 'ArrDelay', 'ArrTime' and the
# '*Delay' cause columns, which presumably are only known after the flight —
# confirm they should be available to a departure-delay model.
features=cleaned_combined_flight_data.drop(columns=[target]).columns.tolist()

print(features)


['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', '24HoursTime']
['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', '24HoursTime']


In [3]:
'''
改进：（1）舍弃了不必要的分类数据TailNum和24H（这个只是在后面作为对比的列），使得我们能够正确选取训练集X的特征
（2）在上面的模型中，不知道为什么Unique等分类列没有在get-dummies下变成独热编码的函数。这里在训练集中重新对其进行
了独特编码，使得模型能正确进入到随机森林模型当中。


问题：（1）在解决了特征选取等一系列问题之后，随机森林模型根本无法进行训练，并不清楚是模型本身的性能问题/计算机性能的问题，还是
数据处理的问题
'''


# Build the feature matrix. 'TailNum' and '24HoursTime' are dropped: both are
# categorical, and '24HoursTime' is only needed for the later comparison
# plots.
X = cleaned_combined_flight_data[features]
X=X.drop(columns=['TailNum','24HoursTime'])
print(X.columns.tolist())

# One-hot encode the remaining categorical columns so that the model only
# sees numeric/boolean inputs.
X = pd.get_dummies(X, columns=['UniqueCarrier', 'Origin', 'Dest','CancellationCode'])
print(X.columns.tolist())
print(X.dtypes.tolist())
y = cleaned_combined_flight_data[target]


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model.
# n_jobs=-1 builds the trees on all available cores. With a fixed
# random_state this does not change the fitted model, only the wall-clock
# time; single-threaded training of 100 trees did not finish on this data.
# NOTE(review): if training is still too slow or runs out of memory, consider
# limiting max_depth / max_samples or training on a sample of the rows.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Save the model to a pkl file for server deployment
with open('flight_delay_model.pkl', 'wb') as file:
    pickle.dump(model, file)


['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'FlightNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'UniqueCarrier_Air Tazania Airlines', 'UniqueCarrier_Alaska Airlines', 'UniqueCarrier_America West Airlines', 'UniqueCarrier_American Airlines', 'UniqueCarrier_Atlantic Southeast Airlines', 'UniqueCarrier_Continental Airlines', 'UniqueCarrier_Delta Airlines', 'UniqueCarrier_Envoy Airlines', 'UniqueCarrier_Expressjet Airline

In [None]:


'''
Step 5: Analysis and Visualisation
'''
def _total_dep_delay_by(column):
    """Return the sum of 'DepDelay' for each distinct value of ``column``."""
    return cleaned_combined_flight_data.groupby(column)["DepDelay"].sum()


def _show_total_delay(totals, title, xlabel, figsize, color=None, ticks=None, tick_labels=None):
    """Draw one line chart of total delay minutes and display it.

    ``color`` is forwarded to ``plt.plot`` only when given. ``ticks`` and
    ``tick_labels`` optionally set the x-axis ticks and their labels. Returns
    the axes object used for the ticks, or None when no ticks were set.
    """
    plt.figure(figsize=figsize)
    line_style = {"marker": "o"}
    if color is not None:
        line_style["color"] = color
    plt.plot(totals, **line_style)
    plt.title(title, fontsize=18)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel("Total Delay Minutes", fontsize=15)
    # Plain (non-scientific) numbers on the y axis.
    plt.ticklabel_format(style="plain", axis="y")
    axes = None
    if ticks is not None:
        axes = plt.subplot()
        axes.set_xticks(ticks)
        if tick_labels is not None:
            axes.set_xticklabels(tick_labels, fontsize=13)
    plt.show()
    return axes


# Best times to fly to minimise delays: total delay per hour-of-day bucket.
best_time_of_day = _total_dep_delay_by("24HoursTime")
_show_total_delay(
    best_time_of_day,
    "Variation of Delay Time at Different Times of the Day",
    "Time of the Day",
    (15, 9),
)

# Best day of the week to avoid delays.
best_day_of_week = _total_dep_delay_by("DayOfWeek")
days = [1, 2, 3, 4, 5, 6, 7]
days_name = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
ax = _show_total_delay(
    best_day_of_week,
    "Variation of Delay Time for Different Days in a Week",
    "Day of the Week",
    (15, 8),
    color="green",
    ticks=days,
    tick_labels=days_name,
)

# Best day of the month to avoid delays.
best_day_of_month = _total_dep_delay_by("DayofMonth")
days_of_month = range(1, 32)
ax = _show_total_delay(
    best_day_of_month,
    "Variation of Delay Time for Different Days of the Month",
    "Day of the Month",
    (15, 9),
    color="green",
    ticks=days_of_month,
)

# Best month of the year to avoid delays.
best_month_of_year = _total_dep_delay_by("Month")
month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months = range(1, 13)
ax = _show_total_delay(
    best_month_of_year,
    "Variation of Delay Time for Different Months in a Year",
    "Months of the Year",
    (15, 8),
    color="green",
    ticks=months,
    tick_labels=month_names,
)


'''
Step 6: Correlation Analysis
'''
# Correlation between aircraft age and delays
# NOTE(review): these files are read from the working directory, whereas the
# yearly files are loaded from the 'MLDataset' folder in Step 1 — confirm the
# intended location.
airlines = pd.concat(map(pd.read_csv, ['2004.csv', '2005.csv']))
airports = pd.read_csv('airports.csv')  # loaded but not used in this step
planes = pd.read_csv('plane-data.csv')

# Clean and preprocess data: keep only flights with a tail number and planes
# with a known year, then attach each plane's year to its flights.
# NOTE(review): if 'year' contains non-numeric placeholders they will show up
# as their own group on the x axis — check the column's values.
airlines = airlines.dropna(subset=['TailNum'])
planes = planes.dropna(subset=['year'])
planes = planes.rename(columns={'year': 'ManufactureYear'})
airlines = airlines.merge(planes[['tailnum', 'ManufactureYear']], left_on='TailNum', right_on='tailnum', how='left')

# Create delay indicator: 1 when the flight arrived late, otherwise 0
# (a missing 'ArrDelay' therefore counts as 0).
airlines['ADelay'] = np.where(airlines['ArrDelay'] > 0, 1, 0)

# Group by manufacture year and delay
df_planes_grouped = airlines.groupby(['ManufactureYear', 'ADelay']).size().reset_index(name='Counts')
# Total flights per manufacture year, aligned row by row with the grouped
# frame. The grouped frame has one row per (year, ADelay) pair, so assigning
# the per-year sizes directly has the wrong length (or, if the lengths happen
# to match, pairs rows with the wrong year's total).
df_planes_grouped['TotalFlights'] = df_planes_grouped.groupby('ManufactureYear')['Counts'].transform('sum')
df_planes_grouped['DelayPercentage'] = (df_planes_grouped['Counts'] / df_planes_grouped['TotalFlights']) * 100

# Plot delay percentage by manufacture year (delayed flights only)
df_planes_grouped = df_planes_grouped[df_planes_grouped['ADelay'] == 1]
plt.figure(figsize=(15, 8))
plt.plot(df_planes_grouped['ManufactureYear'], df_planes_grouped['DelayPercentage'], color='green', marker='o')
plt.title('Percentage of Delays by Aircraft Age', fontsize=18)
plt.xlabel('Year of Manufacture', fontsize=15)
plt.ylabel('Delay Percentage', fontsize=15)
plt.show()


In [None]:


'''
Step 7: Integrate with Flask Application
'''
from flask import Flask, request, jsonify
import pandas as pd
import pickle

app = Flask(__name__)

# Load the model once at start-up; the context manager closes the file.
# NOTE: unpickling executes code contained in the file — only load model
# files that were produced by a trusted source.
with open('flight_delay_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the departure delay for one flight posted as a JSON object.

    The request body must be a JSON object mapping feature names to values.
    When the loaded model exposes the column names it was fitted with
    (``feature_names_in_``), text-valued fields are one-hot encoded as
    ``<field>_<value>`` and the row is aligned to exactly those columns;
    unknown columns are dropped and missing ones are filled with 0.

    Returns:
        JSON ``{"prediction": <float>}``, or a 400 response with an ``error``
        message when the body is not a JSON object.
    """
    data = request.get_json(force=True)
    if not isinstance(data, dict):
        return jsonify(error='Request body must be a JSON object of feature values.'), 400

    row = pd.DataFrame([data])
    expected_columns = getattr(model, 'feature_names_in_', None)
    if expected_columns is not None:
        # NOTE(review): assumes the training columns were produced by
        # pandas.get_dummies with its default "<column>_<value>" naming —
        # confirm against the training code.
        row = pd.get_dummies(row).reindex(columns=list(expected_columns), fill_value=0)

    prediction = model.predict(row)
    return jsonify(prediction=float(prediction[0]))

if __name__ == '__main__':
    # debug=True would enable the interactive debugger (it can execute
    # arbitrary code sent by a client) and the auto-reloader; keep it off.
    app.run(port=5000, debug=False)