# <center> Air Pressure System (APS) Sensor Failure Prediction </center>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pymongo

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix,classification_report

import warnings
warnings.filterwarnings('ignore')

### Reading the Dataset

In [None]:
# Load the APS training set; the literal string "na" is read as a missing value.
df = pd.read_csv(
    filepath_or_buffer='aps_failure_training_set1.csv',
    na_values="na",
)

In [None]:
# Display the DataFrame that was just loaded.
df

In [None]:
# (rows, columns) of the DataFrame.
df.shape

In [None]:
# Number of rows for each distinct value of the 'class' column.
df['class'].value_counts()

In [None]:
# Split the column names by dtype: object-dtype ('O') columns are treated as
# categorical, every other dtype as numerical.
numeric_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'O']

print(f'We have {len(numeric_features)} numerical features : {numeric_features}')
print(f'\nWe have {len(categorical_features)} categorical features : {categorical_features}')

### Checking Missing Values

In [None]:
# Percentage of missing entries per column, sorted from the most to the least
# incomplete. The result is a one-column frame whose column label is 0.
missing = (
    df.isna()
    .sum()
    .div(len(df))
    .mul(100)
    .to_frame()
    .sort_values(by=0, ascending=False)
)

# One bar per column; the x tick labels are hidden.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(missing.index, missing[0].to_numpy())
ax.set_xticks([])
ax.set_ylabel("Percentage missing")
plt.show()

### Removing columns having more than 70% missing values

In [None]:
# Rows of `missing` whose missing-value percentage is strictly above 70.
dropcols = missing.loc[missing[0].gt(70)]
dropcols

In [None]:
# Remove the columns listed in `dropcols` from `df` in place.
df.drop(columns=dropcols.index.tolist(), inplace=True)

In [None]:
# (rows, columns) of `df` at this point in the notebook.
df.shape

### Check the overall percentage of missing values in the dataset after dropping columns with more than 70% missing values

In [None]:
# Number of missing entries in each column.
missing_values_count = df.isnull().sum()
# Total number of cells (rows * columns). np.prod is used because the
# np.product alias was deprecated and then removed in NumPy 2.0.
total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
print(f"Percentage of total missing cells in the data {(total_missing/total_cells) * 100}%")

In [None]:
# Columns whose dtype is not object ('O') are plotted as numeric features.
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']

# Three plots per row. The grid keeps the original 60 rows as a minimum so the
# layout is unchanged for up to 180 features, and grows when there are more
# (a fixed 60x3 grid would raise once the subplot index exceeds 180).
n_cols = 3
n_rows = max(60, (len(numeric_features) + n_cols - 1) // n_cols)

plt.figure(figsize=(15, 100))
for i, col in enumerate(numeric_features):
    plt.subplot(n_rows, n_cols, i + 1)
    # NOTE(review): seaborn documents distplot as deprecated in favour of
    # histplot/displot; kept here to preserve the current plot appearance.
    sns.distplot(x=df[col], color='indianred')
    plt.xlabel(col, weight='bold')

# Lay out the figure once, after every subplot exists, instead of repeating
# the whole layout pass on each loop iteration.
plt.tight_layout()

### Visualization of unique values in Target variable

In [None]:
# Count the rows labelled 'pos' and 'neg' in the target column.
class_labels = df['class']
pos = int((class_labels == 'pos').sum())
neg = int((class_labels == 'neg').sum())
print(f"Positive: {pos}, Negative: {neg}")

# Count plot of the target column.
sns.catplot(
    data=df,
    x="class",
    kind="count",
    palette="winter_r",
    alpha=.6,
)
plt.show()

## Analysis Report

- The Target column is highly imbalanced
- If the imbalance is not treated beforehand, it will degrade the performance of the classifier model.
- Resampling of data can be used to deal with the imbalanced dataset. There are two types of sampling techniques i.e., Oversampling and Undersampling.
- SMOTE is one of the oversampling techniques where the synthetic samples are generated for minority class.
- SMOTE can be combined with Tomek links (SMOTE-Tomek), which helps remove noise from the data: when a sample from the minority class and a sample from the majority class are very similar, they may lead to misclassification. Such a pair forms a Tomek link and is considered noise in the dataset.

In [None]:
# Target: the 'class' column. Features: every other column of `df`.
y = df['class']
X = df.drop(columns='class')

In [None]:
# Encode the target labels as integers: 'pos' -> 1, 'neg' -> 0.
label_codes = {'pos': 1, 'neg': 0}
y = y.replace(label_codes)