In [1]:
import pandas as pd
import numpy as np
#%pip install scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
#%pip install shap
import shap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import data 
df = pd.read_csv('AirlineOccurences.csv')
print(df.head())
print("----------------------------------------")
df.info()
print("----------------------------------------")
# view if there's any null value in data
print(df.isnull().sum())
print("----------------------------------------")
#count of unique values in each column
print(df.nunique())

                                              Report  \
0  MECHANICAL / LANDING GEAR GND FAIL MSG AFTER T...   
1  THE NOSE LANDING GEAR DID NOT EXTEND FULLY DUR...   
2  THE LEFT SIDE HYDRAULIC SYSTEM FILTER BOWL ASS...   
3  AIRCRAFT WAS ON ROLLOUT DURING A NORMAL LANDIN...   
4  UPON TAKEOFF ROLL BUT PRIOR TO REACHING 80 KNO...   

                    Part Failure  Occurence Nature condition  \
2    HYD FILTER FAILED            OTHER                        
3          LEFT COLLAPSED         OTHER                        

  Occurence Precautionary Procedures  
0           OTHER                     
1           ABORTED APPROACH          
2           ABORTED APPROACH          
3           OTHER                     
4           ABORTED TAKEOFF           
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100028 entries, 0 to 100027
Data columns (total 4 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------         

In [3]:
# count of each occurence : the label that y will be predicting
df['Occurence Nature condition'].value_counts()

Occurence Nature condition
OTHER                         82172
NO TEST                        3577
SMOKE/FUMES/ODORS/SPARKS       2166
FLUID LOSS                      975
FLT CONT AFFECTED               320
OVER TEMP                       316
VIBRATION/BUFFET                228
INADEQUATE Q C                  223
AFFECT SYSTEMS                  202
F.O.D.                          191
PARTIAL RPM/PWR LOSS            152
MULTIPLE FAILURE                107
FLAME/FIRE                       65
ENGINE STOPPAGE                  56
SIGNIFICANT FAILURE REPORT       53
INFLIGHT SEPARATION              47
ENGINE FLAMEOUT                  38
ELECT. POWER LOSS-50 PC          27
FLT. ATTITUDE INST.              21
Name: count, dtype: int64

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shuyiyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
#%pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# preprocessing text data using stopwords and porterstemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

#function to clean text data
def preprocess_text(text):
    # removing all punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # converting all text to lowercase
    text = text.lower()
    # removing stoping words from text data
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # apply stemmer
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

In [6]:
# cleaning and preprocessing text data
df['Report'] = df['Report'].str.strip().str.lower()
df['Part Failure'] = df['Part Failure'].str.strip().str.lower()
df['Occurence Nature condition'] = df['Occurence Nature condition'].str.strip().str.lower()
df['Occurence Precautionary Procedures'] = df['Occurence Precautionary Procedures'].str.strip().str.lower()

In [7]:
# split the data into training and testing sets
X = df[['Report', 'Part Failure']]
y = df['Occurence Nature condition']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [8]:
# apply TF-IDF vectors for the text data
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train.apply(lambda x: ' '.join(x), axis=1))
X_test_vec = vectorizer.transform(X_test.apply(lambda x: ' '.join(x), axis=1))

In [9]:
# train a linear SVM model
model = LinearSVC()
model.fit(X_train_vec, y_train)

In [10]:
# make predictions on the train set
y_predtrain_SVM = cross_val_predict(estimator = model, X = X_train_vec, y = y_train, cv = 10)

# make predictions on the test set
y_pred_SVM = model.predict(X_test_vec)

In [11]:
print('Validation Report (SVM):\n ', classification_report(y_train, y_predtrain_SVM, zero_division=True))

Validation Report (SVM):
                              precision    recall  f1-score   support

            affect systems       0.71      0.18      0.29       140
   elect. power loss-50 pc       0.67      0.17      0.28        23
           engine flameout       0.91      0.37      0.53        27
           engine stoppage       0.92      0.35      0.51        34
                    f.o.d.       0.71      0.36      0.48       130
                flame/fire       0.68      0.37      0.48        51
         flt cont affected       0.60      0.26      0.36       223
       flt. attitude inst.       1.00      0.13      0.24        15
                fluid loss       0.73      0.66      0.69       694
            inadequate q c       0.55      0.11      0.18       167
       inflight separation       0.86      0.17      0.28        36
          multiple failure       0.89      0.10      0.18        78
                   no test       0.74      0.59      0.66      2476
                    

In [12]:
print('Evaluation Report (SVM):\n', classification_report(y_test, y_pred_SVM, zero_division=True))


Evaluation Report (SVM):
                             precision    recall  f1-score   support

            affect systems       0.86      0.19      0.32        62
   elect. power loss-50 pc       0.00      0.00      0.00         4
           engine flameout       0.83      0.45      0.59        11
           engine stoppage       1.00      0.18      0.31        22
                    f.o.d.       0.71      0.36      0.48        61
                flame/fire       0.58      0.50      0.54        14
         flt cont affected       0.60      0.28      0.38        97
       flt. attitude inst.       0.67      0.33      0.44         6
                fluid loss       0.72      0.67      0.69       281
            inadequate q c       0.58      0.20      0.29        56
       inflight separation       0.00      0.00      0.00        11
          multiple failure       0.91      0.34      0.50        29
                   no test       0.75      0.63      0.68      1101
                     

In [14]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Fit the Logistic Regression model
lr_model.fit(X_train_vec, y_train)

# Make predictions on the train set
y_predtrain_lr = cross_val_predict(estimator=lr_model, X=X_train_vec, y=y_train, cv=10)

# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test_vec)

# Validation Report (Logistic Regression)
print('Validation Report (Logistic Regression):\n', classification_report(y_train, y_predtrain_lr, zero_division=True))

# Evaluation Report (Logistic Regression)
print('Evaluation Report (Logistic Regression):\n', classification_report(y_test, y_pred_lr, zero_division=True))

Validation Report (Logistic Regression):
                             precision    recall  f1-score   support

            affect systems       0.56      0.04      0.07       140
   elect. power loss-50 pc       1.00      0.00      0.00        23
           engine flameout       1.00      0.00      0.00        27
           engine stoppage       1.00      0.00      0.00        34
                    f.o.d.       0.81      0.19      0.31       130
                flame/fire       1.00      0.04      0.08        51
         flt cont affected       0.61      0.15      0.24       223
       flt. attitude inst.       1.00      0.00      0.00        15
                fluid loss       0.72      0.59      0.65       694
            inadequate q c       0.33      0.01      0.01       167
       inflight separation       1.00      0.00      0.00        36
          multiple failure       1.00      0.00      0.00        78
                   no test       0.75      0.53      0.62      2476
     

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train_vec, y_train)
y_prediction_rf = cross_val_predict(estimator=rf_model, X=X_train_vec, y=y_train, cv=10)
y_pred_rf = rf_model.predict(X_test_vec)



In [18]:
print('Validation Report (Random Forest):\n ', classification_report(y_train, y_prediction_rf, zero_division=True))


Validation Report (Random Forest):
                              precision    recall  f1-score   support

            affect systems       0.50      0.01      0.01       140
   elect. power loss-50 pc       1.00      0.13      0.23        23
           engine flameout       1.00      0.11      0.20        27
           engine stoppage       1.00      0.26      0.42        34
                    f.o.d.       0.82      0.18      0.29       130
                flame/fire       1.00      0.18      0.30        51
         flt cont affected       0.93      0.12      0.21       223
       flt. attitude inst.       1.00      0.13      0.24        15
                fluid loss       0.91      0.27      0.42       694
            inadequate q c       0.45      0.03      0.06       167
       inflight separation       1.00      0.17      0.29        36
          multiple failure       1.00      0.13      0.23        78
                   no test       0.86      0.49      0.62      2476
          

In [17]:
print('Evaluation Report (Random Forest):\n', classification_report(y_test, y_pred_rf, zero_division=True))

Evaluation Report (Random Forest):
                             precision    recall  f1-score   support

            affect systems       1.00      0.05      0.09        62
   elect. power loss-50 pc       1.00      0.00      0.00         4
           engine flameout       1.00      0.09      0.17        11
           engine stoppage       1.00      0.18      0.31        22
                    f.o.d.       0.92      0.18      0.30        61
                flame/fire       0.00      0.00      0.00        14
         flt cont affected       1.00      0.13      0.24        97
       flt. attitude inst.       1.00      0.33      0.50         6
                fluid loss       0.94      0.27      0.42       281
            inadequate q c       0.75      0.05      0.10        56
       inflight separation       0.50      0.09      0.15        11
          multiple failure       1.00      0.41      0.59        29
                   no test       0.87      0.52      0.65      1101
           