In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns
import time

from math import sqrt
from numpy import loadtxt
from itertools import product
from tqdm import tqdm
from numpy import loadtxt

import gc
from sklearn import preprocessing 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
def reduce_mem_usage(df):
    """
    Downcast the columns of ``df`` in place to reduce its memory usage.

    Handling per column dtype:
      * object                -> converted to ``category``
      * signed integers       -> smallest of int8/int16/int32/int64 that holds
                                 the column's min and max (bounds inclusive)
      * unsigned integers     -> smallest of uint8/uint16/uint32/uint64
      * floats                -> float16 if the range fits, else float32,
                                 else float64 (float16/float32 lose precision)
      * anything else (bool, datetime, timedelta, category and other pandas
        extension dtypes)     -> left untouched

    The memory usage before and after, and the relative saving, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}'
                     'MB').format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # have no safe numeric min/max downcast, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN/inf extremes fail both comparisons and fall through to float64.
            if c_min > np.finfo(np.float16).min and c_max <\
               np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max <\
               np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # bool ('b'), datetime ('M') and timedelta ('m') columns are left as-is.

    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}'
                              'MB').format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df
def df_info(df):
    """
    Print a quick exploratory summary of ``df`` to stdout.

    Sections, in order: first five rows, ``DataFrame.info()`` report, dtypes,
    per-column null counts (printed twice, via ``isnull`` and ``isna``),
    shape, ``describe()`` statistics, per-column unique counts and the column
    index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; it is not modified.

    Returns
    -------
    None
    """
    print("----------Top-5- Record----------")
    print(df.head(5))
    print("-----------Information-----------")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print("-----------Data Types-----------")
    print(df.dtypes)
    print("----------Missing value-----------")
    print(df.isnull().sum())
    print("----------Null value-----------")
    print(df.isna().sum())
    print("----------Shape of Data----------")
    print(df.shape)
    print("----------description of Data----------")
    print(df.describe())
    print("----------Uniques of Data----------")
    print(df.nunique())
    print("------------Columns in data---------")
    print(df.columns)

def downcast_dtypes(df):
    """
    Shrink float64 and int64/int32 columns of ``df`` in place.

    * float64 columns become float32.
    * int64/int32 columns become int16 when every value fits in int16.
      Columns with larger values are narrowed only to int32 (when they fit)
      or left unchanged, instead of being wrapped around by a blind int16
      cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for c in float_cols:
        df[c] = df[c].astype(np.float32)

    int16_limits = np.iinfo(np.int16)
    int32_limits = np.iinfo(np.int32)
    for c in int_cols:
        column = df[c]
        if column.empty:
            # Nothing to overflow; keep the historical int16 result.
            df[c] = column.astype(np.int16)
            continue
        lowest, highest = column.min(), column.max()
        if int16_limits.min <= lowest and highest <= int16_limits.max:
            df[c] = column.astype(np.int16)
        elif int32_limits.min <= lowest and highest <= int32_limits.max:
            df[c] = column.astype(np.int32)
        # Otherwise the values need 64 bits: leave the column untouched.
    return df
def model_performance_sc_plot( predictions, labels, title ):
    """
    Draw a regression joint plot of predictions against labels.

    A dashed magenta diagonal spanning the overall value range is drawn on the
    joint axes as the "perfect prediction" reference, the title is set on the
    same axes, and the figure is shown.

    Parameters
    ----------
    predictions : array-like
        Model outputs; flattened to 1-D, so ``(n, 1)`` arrays are accepted.
    labels : array-like
        Ground-truth values; flattened to 1-D, same length as ``predictions``.
    title : str
        Title for the plot.
    """
    # Flatten so column-vector predictions become a plain 1-D column.
    predictions = np.ravel(predictions)
    labels = np.ravel(labels)

    # Overall range covered by both series (end points of the diagonal).
    min_val = min(predictions.min(), labels.min())
    max_val = max(predictions.max(), labels.max())

    performance_df = pd.DataFrame({"Labels": labels, "Predictions": predictions})
    grid = sns.jointplot(y = "Labels", x = "Predictions", data = performance_df,kind = "reg")
    # Address the joint axes explicitly instead of relying on pyplot's
    # implicit "current axes" after the grid has been created.
    grid.ax_joint.plot([min_val,max_val],[min_val, max_val],"m--")
    grid.ax_joint.set_title(title)
    plt.show()
    
def calculate_counts(column_name, ref_sets, extension_set):
    """
    Count distinct values of one column across reference sets and an extension set.

    Parameters
    ----------
    column_name : hashable
        Key of the column to compare.
    ref_sets : sequence
        Reference tables; each must support ``ref[column_name]`` and yield an
        iterable of hashable values (e.g. pandas DataFrames). May be empty.
    extension_set : table
        Table compared against the union of the reference sets.

    Returns
    -------
    pandas.DataFrame
        A single ``Count`` column indexed by ``Ref 1`` .. ``Ref N``,
        ``Refs Union``, ``Extension`` and ``Union x Extension`` (the number of
        extension values that also occur in the reference union).
    """
    ref_sets_ids = [set(ref[column_name]) for ref in ref_sets]
    ext_ids = set(extension_set[column_name])

    # set().union(*...) needs no import (the original used an unimported
    # `reduce`) and also yields an empty set when there are no reference sets.
    refs_union = set().union(*ref_sets_ids)

    ref_counts = [len(ref) for ref in ref_sets_ids]
    ext_count = len(ext_ids)
    union_count = len(refs_union)
    intersection_count = len(ext_ids & refs_union)

    all_counts = ref_counts + [union_count, ext_count, intersection_count]
    res_index = ["Ref {}".format(i) for i in range(1, len(ref_sets) + 1)] +\
        ['Refs Union', 'Extension', 'Union x Extension']

    return pd.DataFrame({'Count': all_counts},
                        index=res_index)

In [None]:
# Locations of the competition files inside the Kaggle input directory.
dataset_dir = "/kaggle/input/airplane-accidents-severity-dataset"
train_path = dataset_dir + "/train.csv"
test_path = dataset_dir + "/test.csv"
submission_path = dataset_dir + "/sample_submission.csv"


In [None]:
# Read the labelled training data from the CSV at `train_path`.
train = pd.read_csv(train_path)

In [None]:
# Print the exploratory summary of the training frame
# (head, info, dtypes, null counts, shape, describe, uniques, columns).
df_info(train)

In [None]:
object_cols = ["Severity"]

lb = LabelEncoder()
lb.fit(train[object_cols])
train[object_cols] = lb.transform(train[object_cols])
train["Control_Metric"] = train["Control_Metric"].clip(25,100)

In [None]:
train.head()

Per-accident-type-code means of Control Metric, Severity, Turbulence, Max Elevation, and Adverse Weather

In [None]:
means_accident_code = train.groupby(["Accident_Type_Code"]).agg({"Severity": "mean", "Safety_Score": "mean", "Days_Since_Inspection": "mean", "Total_Safety_Complaints" :"mean", "Control_Metric": "mean", "Turbulence_In_gforces" :"mean", "Cabin_Temperature": "mean", "Max_Elevation" :"mean","Violations": "mean", "Adverse_Weather_Metric" : "mean"})

In [None]:
means_accident_code = means_accident_code.reset_index()

In [None]:
cols = list(means_accident_code.columns)
for i in range(1,len(cols)):
    cols[i] = cols[i] + "_mean"
print(cols)
means_accident_code.columns = cols

In [None]:
train = train.merge(means_accident_code, on = ["Accident_Type_Code"], how = "left")

In [None]:
train.drop(["Accident_Type_Code"], axis = 1, inplace = True)

In [None]:
train.head()

In [None]:
# plt.rcParams["figure.figsize"] = (15,6)
# sns.swarmplot(y = "Violations", x = "Total_Safety_Complaints", hue = "Severity", data = train)
# plt.show()

In [None]:
# plt.rcParams["figure.figsize"] = (15,6)
# sns.scatterplot(y = "Accident_Type_Code", x = "Control_Metric", hue = "Severity", data = train)
# plt.show()

In [None]:
# Features: everything except the target and the row identifier.
X = train.drop(columns = ["Severity", "Accident_ID"])
# Target: encoded Severity as a NumPy array.
Y = train["Severity"].values

In [None]:
# Hold out 25% of the rows, then split that hold-out again (75% / 25%) into
# val1 (early stopping / meta-model training) and val2 (meta-model validation).
# random_state is fixed so the splits, and every score reported below,
# are reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(X,Y, test_size = 0.25, random_state = 0)
x_val1, x_val2, y_val1, y_val2 = train_test_split(x_val, y_val, test_size = 0.25, random_state = 0)

In [None]:
test = pd.read_csv(test_path)

In [None]:
test.head().T

In [None]:
test = test.merge(means_accident_code, on = ["Accident_Type_Code"], how = "left")
test.drop(["Accident_Type_Code"], axis = 1, inplace = True)

In [None]:
x_test = test.drop(["Accident_ID"], axis =1)

In [None]:
# CatBoost hyper-parameters, collected in one place for readability.
cb_params = {
    "iterations": 1000,
    "max_ctr_complexity": 6,
    "random_seed": 0,
    "od_type": "Iter",
    "od_wait": 25,
    "verbose": 20,
    "depth": 6,
}
cb = CatBoostClassifier(**cb_params)
# cb.fit(x_train, y_train, eval_set = (x_test, y_test), cat_features = cat_features, verbose = 20)

In [None]:
cb.fit(x_train , y_train, eval_set = (x_val1, y_val1))

In [None]:
cb_train_pred = cb.predict(x_train)
cb_val1_pred = cb.predict(x_val1)
cb_val2_pred = cb.predict(x_val2)
cb_test_pred = cb.predict(x_test)

In [None]:
cb_train_pred

In [None]:
print("F1_Score train : ", f1_score(y_train, cb_train_pred,average='weighted'))
print("F1_Score val1 : ", f1_score(y_val1, cb_val1_pred,average='weighted'))

In [None]:
model_performance_sc_plot(cb_val1_pred, y_val1, "Validation")

In [None]:
from xgboost import XGBClassifier
xgbRegressor = XGBClassifier(max_depth = 8, eta = 0.2,n_estimators = 500, colsample_bytree = 0.7,subsample = 0.7, seed = 0)

xgbRegressor.fit(x_train, y_train, eval_set = [(x_train,y_train),(x_val1, y_val1)], verbose = 20, early_stopping_rounds = 120)


In [None]:

xgb_train_pred = xgbRegressor.predict(x_train)
xgb_val1_pred = xgbRegressor.predict(x_val1)
xgb_val2_pred = xgbRegressor.predict(x_val2)
xgb_test_pred = xgbRegressor.predict(x_test)
print("F1_Score train : ", f1_score(y_train, xgb_train_pred,average='weighted'))
print("F1_Score val1 : ", f1_score(y_val1, xgb_val1_pred,average='weighted'))

In [None]:
model_performance_sc_plot(xgb_val1_pred, y_val1, "Validation")

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 300, max_depth = 20, random_state = 0, n_jobs = 1)
rf.fit(x_train, y_train)

In [None]:
rf_train_pred = rf.predict(x_train)
rf_val1_pred = rf.predict(x_val1)
rf_val2_pred = rf.predict(x_val2)
rf_test_pred = rf.predict(x_test)

In [None]:
print("Train F1-score : " ,f1_score(y_train, rf_train_pred,average='weighted'))
print("Val1 F1-score : " ,f1_score(y_val1, rf_val1_pred,average='weighted'))

In [None]:
model_performance_sc_plot(rf_val1_pred, y_val1, "Validation")

In [None]:
# from sklearn.linear_model  import LogisticRegression
# from sklearn.preprocessing import StandardScaler, MinMaxScaler

# lr_Scaler  = StandardScaler()
# lr_Scaler.fit(x_train)
# lr_train = lr_Scaler.transform(x_train)
# lr_val1 = lr_Scaler.transform(x_val1)
# lr_val2 = lr_Scaler.transform(x_val2)
# lr_test = lr_Scaler.transform(x_test)

# svm = LogisticRegression()
# svm.fit(lr_train, y_train)


In [None]:
# svm_train_pred = svm.predict(lr_train)
# svm_val1_pred = svm.predict(lr_val1)
# svm_val2_pred = svm.predict(lr_val2)
# svm_test_pred = svm.predict(lr_test)

In [None]:
# print("Train F1-score : ", f1_score(y_train, svm_train_pred,average='weighted'))
# print("Val1 F1-score : " ,f1_score(y_val1, svm_val1_pred,average='weighted'))

In [None]:
# model_performance_sc_plot(svm_val1_pred, y_val1, "Validation")

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors = 4, leaf_size = 8,)
# knn.fit(lr_train, y_train)

In [None]:
# knn_train_pred = knn.predict(lr_train)
# knn_val1_pred = knn.predict(lr_val1)
# knn_val2_pred = knn.predict(lr_val2)
# knn_test_pred = knn.predict(lr_test)

In [None]:
# print("Train F1-score : ", f1_score(y_train, knn_train_pred,average='weighted'))
# print("Val1 F1-score : " ,f1_score(y_val1, knn_val1_pred,average='weighted'))

In [None]:
# model_performance_sc_plot(knn_val1_pred, y_val1, "Validation")

In [None]:
first_level = pd.DataFrame(cb_val1_pred, columns = ["catboost"])
# first_level["RF"] = rf_val1_pred
first_level["XGB"] = xgb_val1_pred

In [None]:
first_level_val = pd.DataFrame(cb_val2_pred, columns = ["catboost"])
# first_level_val["RF"] = rf_val2_pred
first_level_val["XGB"] = xgb_val2_pred

In [None]:
first_level_test = pd.DataFrame(cb_test_pred, columns = ["catboost"])
# first_level_test["RF"] = rf_test_pred
first_level_test["XGB"] = xgb_test_pred

In [None]:
from sklearn.linear_model  import LinearRegression

metamodel = XGBClassifier(max_depth = 8, eta = 0.2,n_estimators = 500, colsample_bytree = 0.7,subsample = 0.7, seed = 0)

In [None]:
metamodel.fit(first_level, y_val1, eval_set = [(first_level,y_val1),(first_level_val, y_val2)], verbose = 20, early_stopping_rounds = 120)

In [None]:
ensemble_pred = metamodel.predict(first_level_val)
test_pred = metamodel.predict(first_level_test)

In [None]:
print("Val F1-Score :", f1_score(y_val2, ensemble_pred, average = "weighted"))

In [None]:
model_performance_sc_plot(ensemble_pred, y_val2, "Validation")

In [None]:
# xgbRegressor.fit(X, Y, verbose = 20,)
# test_pred = xgbRegressor.predict(x_test)


In [None]:
# Cast the ensemble's test predictions to int32 codes and map them back to
# the original Severity labels with the fitted LabelEncoder.
predicted_codes = test_pred.astype(np.int32)
y_pred = lb.inverse_transform(predicted_codes)

In [None]:
ids = []
for i in range(test.shape[0]):
    ids.append(test.loc[i,"Accident_ID"])

In [None]:
sub = pd.DataFrame(ids, columns= ["Accident_ID"])

In [None]:
sub["Severity"] = y_pred

In [None]:
sub.head()

In [None]:
sub.to_csv("Submission.csv",index = False)