In [133]:
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier

#### note that we are only keeping the years between 1924 to 1999 to avoid confusion between years

In [217]:
#drop unwanted columns
df = pd.read_csv("planecrashes-manu-fau.csv").drop(columns = ["Location / Operator", "Manufacturer 1", "Manufacturer 2", "Aircraft Type 2", "Occupants"])

#only keep year in the date column
df["Date"] = df["Date"].str[-2:]

#convert to numeric and dropna
df["Occupant Fatalities"] = pd.to_numeric(df["Occupant Fatalities"], errors='coerce')
df["Ground Fatalities"] = pd.to_numeric(df["Ground Fatalities"], errors='coerce')
df = df.dropna()

#create total fatalities
df["Total Fatalities"] = df["Occupant Fatalities"] + df["Ground Fatalities"]

#drop unwanted columns
df = df.drop(columns=["Occupant Fatalities", "Ground Fatalities"])

#drop unwanted years
df["Date"] = pd.to_numeric(df["Date"]) #ensure they are int values
df = df.loc[df['Date'].between(24, 99)]

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Total Fatalities"] = df["Occupant Fatalities"] + df["Ground Fatalities"]


Unnamed: 0,Date,Aircraft Type 1,Total Fatalities
56,24,de havilland dh-4,1.0
57,24,fokker f.iii,3.0
58,24,junkers f-13,5.0
59,24,de havilland dh-4,1.0
60,24,breguet 14,1.0
...,...,...,...
4225,99,boeing 747-2b5f,4.0
4226,99,airbus a300b2-101,1.0
4227,99,yakovlev yak-42d,22.0
4228,99,de havilland canada dhc-6 twin otter,10.0


In [218]:
#create empty unique lists
unique_aircraft = df["Aircraft Type 1"].unique()
unique_fatalities = df["Total Fatalities"].unique()
years = df["Date"].unique()

#add "c." to each year so it does not get confused with fatalities
unique_years = []
for year in years:
    unique_years.append("c." + str(year))
    

print(unique_years)
print("unique aircraft: " + str(len(unique_aircraft)))
print("unique fatalities: " + str(len(unique_fatalities)))

['c.24', 'c.25', 'c.26', 'c.27', 'c.28', 'c.29', 'c.30', 'c.31', 'c.32', 'c.33', 'c.34', 'c.35', 'c.36', 'c.37', 'c.38', 'c.39', 'c.40', 'c.41', 'c.42', 'c.43', 'c.44', 'c.45', 'c.46', 'c.47', 'c.48', 'c.49', 'c.50', 'c.51', 'c.52', 'c.53', 'c.54', 'c.55', 'c.56', 'c.57', 'c.58', 'c.59', 'c.60', 'c.61', 'c.62', 'c.63', 'c.64', 'c.65', 'c.66', 'c.67', 'c.68', 'c.69', 'c.70', 'c.71', 'c.72', 'c.73', 'c.74', 'c.75', 'c.76', 'c.77', 'c.78', 'c.79', 'c.80', 'c.81', 'c.82', 'c.83', 'c.84', 'c.85', 'c.86', 'c.87', 'c.88', 'c.89', 'c.90', 'c.91', 'c.92', 'c.93', 'c.94', 'c.95', 'c.96', 'c.97', 'c.98', 'c.99']
unique aircraft: 1826
unique fatalities: 184


In [219]:
#create the columns
column_list = ["aircraft"]
column_list = column_list + unique_years + unique_fatalities.tolist()

matrix = pd.DataFrame(index=df.index.tolist(), columns=column_list, data=0) #create the empty DataFrame

matrix

Unnamed: 0,aircraft,c.24,c.25,c.26,c.27,c.28,c.29,c.30,c.31,c.32,...,160.0,227.0,189.0,230.0,349.0,228.0,234.0,203.0,229.0,217.0
56,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [220]:
#iterate through the original dataframe
for index, row in df.iterrows():
    
    #find the aircraft and set it in the matrix
    matrix.loc[index, "aircraft"] = row["Aircraft Type 1"]
    
    #create and find the date, add 1
    date = "c." + str(row["Date"])
    matrix.loc[index, date] =+ 1
    
    #find fatalities, add 1
    matrix.loc[index, row["Total Fatalities"]] =+ 1
    
matrix

Unnamed: 0,aircraft,c.24,c.25,c.26,c.27,c.28,c.29,c.30,c.31,c.32,...,160.0,227.0,189.0,230.0,349.0,228.0,234.0,203.0,229.0,217.0
56,de havilland dh-4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57,fokker f.iii,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58,junkers f-13,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,de havilland dh-4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,breguet 14,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,boeing 747-2b5f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4226,airbus a300b2-101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4227,yakovlev yak-42d,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4228,de havilland canada dhc-6 twin otter,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


the follwing is modified code from professor cody buntain

In [221]:
crash_indexes = matrix.index.tolist() #create a list of crash indexes

split_index = int(0.7 * len(crash_indexes)) #get a number that represents and 80% split

random.shuffle(crash_indexes) #shuffle them!

train_crash_indexes = crash_indexes[:split_index]
test_crash_indexes = crash_indexes[split_index:]

print("train data:", len(train_crash_indexes))
print("test data:", len(test_crash_indexes))

train data: 2893
test data: 1241


In [222]:
x = matrix.drop(columns=["aircraft"]) #drop columns that don't have aircraft
x.columns = x.columns.astype(str) #ensure the columns are strings
x


Unnamed: 0,c.24,c.25,c.26,c.27,c.28,c.29,c.30,c.31,c.32,c.33,...,160.0,227.0,189.0,230.0,349.0,228.0,234.0,203.0,229.0,217.0
56,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [223]:
x_train = x.loc[train_crash_indexes] #locate training data for x
x_test = x.loc[test_crash_indexes] #locate test data for x

In [224]:
y = matrix["aircraft"] #use the aircraft column
y


56                         de havilland dh-4
57                              fokker f.iii
58                              junkers f-13
59                         de havilland dh-4
60                                breguet 14
                        ...                 
4225                         boeing 747-2b5f
4226                       airbus a300b2-101
4227                        yakovlev yak-42d
4228    de havilland canada dhc-6 twin otter
4229                              antonov 28
Name: aircraft, Length: 4134, dtype: object

In [225]:
y_train = y.loc[train_crash_indexes] #locate training data for y
y_test = y.loc[test_crash_indexes] #locate test data for y

#### Decision Tree Time!

In [226]:
#create the tree and fit the model
model = DecisionTreeClassifier(max_depth=12)
model.fit(x_train, y_train)

In [227]:
y_predict = model.predict(x_test) #create predictions

In [228]:
y_test.value_counts()

aircraft
douglas dc-3                                90
douglas c-47a                               26
douglas c-47                                22
de havilland canada dhc-6 twin otter 300    19
yakovlev yak-40                             16
                                            ..
sikorsky s-61l helicopter                    1
britten-norman bn-2a-27 islander             1
de havilland comet 4b                        1
ilyushin il-62m                              1
stearman m-2 speedmail                       1
Name: count, Length: 736, dtype: int64

In [229]:
y_predict_df = pd.DataFrame(y_predict, index=test_crash_indexes, columns=["aircraft"])
y_predict_df.value_counts()

aircraft                            
douglas dc-3                            1201
douglas c-47                              12
convair cv-240                             3
avro anson                                 3
fairchild c-119c-15-fa flying boxcar       2
douglas dc-6b                              2
de havilland canada dhc-3 otter            2
ilyushin il-12                             2
curtiss c-46-f-1-cu                        1
curtiss c-46a                              1
bristol 170 freighter 31                   1
aero commander ac 520                      1
douglas c-47b                              1
douglas c-54g (dc-4)                       1
bristol 170 freighter                      1
douglas dc-3 (c-47b-dk)                    1
douglas dc-3a                              1
douglas dc-6                               1
boeing b-29                                1
avro 688 tudor 4b                          1
handley page hp-81 hermes iv               1
?                 

In [231]:
y_predict_df

Unnamed: 0,aircraft
4225,douglas dc-3
446,douglas dc-3
2332,douglas dc-3
1514,douglas dc-3
2354,douglas dc-3
...,...
1874,douglas dc-3
2214,douglas dc-3
178,douglas dc-3
948,douglas dc-3


In [232]:
y_predict_df["correct"] = [row["aircraft"] in matrix.loc[index, "aircraft"] for index, row in y_predict_df.iterrows()]

In [233]:
y_predict_false_df = y_predict_df[y_predict_df['correct'] == False]
y_predict_false_df

Unnamed: 0,aircraft,correct
4225,douglas dc-3,False
446,douglas dc-3,False
2332,douglas dc-3,False
1514,douglas dc-3,False
2354,douglas dc-3,False
...,...,...
1075,douglas dc-3,False
66,douglas dc-3,False
1874,douglas dc-3,False
2214,douglas dc-3,False


In [238]:
y_predict_true_df = y_predict_df[y_predict_df['correct'] == True]
y_predict_true_df

Unnamed: 0,aircraft,correct
1242,douglas dc-3,True
1566,douglas dc-3,True
3941,douglas dc-3,True
1187,douglas dc-3,True
1668,douglas dc-3,True
...,...,...
1332,douglas dc-3,True
1372,douglas dc-3,True
1062,douglas dc-3,True
948,douglas dc-3,True


In [263]:
#find a sample of wrong predictions that are a dc-3
y_predict_false_df.tail(10)

Unnamed: 0,aircraft,correct
2195,douglas dc-3,False
3477,douglas dc-3,False
1901,douglas dc-3,False
3380,douglas dc-3,False
3154,douglas dc-3,False
1075,douglas dc-3,False
66,douglas dc-3,False
1874,douglas dc-3,False
2214,douglas dc-3,False
178,douglas dc-3,False


In [257]:
#find wrong predictions that are not a dc-3
for index, crash in y_predict_false_df.iterrows():
    
    if crash["aircraft"] != "douglas dc-3":
        print(index, crash["aircraft"])
    

1302 douglas dc-3a
1220 ilyushin il-12
1119 avro anson
715 douglas c-47
1619 convair cv-240
1600 de havilland canada dhc-3 otter
703 douglas c-47
1430 douglas dc-6
685 douglas c-47
1416 douglas c-47
1596 convair cv-240
704 douglas c-47
1598 convair cv-240
727 douglas c-47
1176 curtiss c-46-f-1-cu
1307 handley page hp-81 hermes iv
1275 bristol 170 freighter
983 douglas dc-3 (c-47b-dk)
731 douglas c-47
1641 douglas c-47b
1752 aero commander ac 520
1328 douglas c-54g (dc-4)
1115 avro 688 tudor 4b
682 douglas c-47
1250 ilyushin il-12
1270 curtiss c-46a
726 douglas c-47
1221 ?
1624 de havilland canada dhc-3 otter
1227 fairchild c-119c-15-fa flying boxcar
1203 fairchild c-119c-15-fa flying boxcar
996 avro anson
1448 douglas dc-6b
1639 bristol 170 freighter 31
1168 boeing b-29
1028 avro anson


In [264]:
df.loc[3380]

Date                                       86
Aircraft Type 1     short sc-7 skyvan variant
Total Fatalities                         13.0
Name: 3380, dtype: object