In [82]:
# https://www.kaggle.com/code/michalgnacik/tabularplayground-aug2022-xgboostattempt - Reference for Feature engineering

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import os
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as Pipeline2
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# data scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# dimensionality reduction
from sklearn.decomposition import PCA

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import StackingClassifier

## Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score

# Hyperparameter tuning / search
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

# To handle imbalanced ds
from sklearn.utils import class_weight

# Oversampling 
from imblearn.over_sampling import SMOTE, SVMSMOTE
from imblearn.combine import SMOTETomek

# Ploting Other
import plotly.express as px
import seaborn as sns

# Dimensionality reduction
from sklearn.decomposition import PCA

# Dumping model
import pickle
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

In [83]:
# Read the train and test splits into separate frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-aug-2022/train.csv',
        '/kaggle/input/tabular-playground-series-aug-2022/test.csv',
    )
)

In [84]:
# Keep an untouched deep copy of the training frame before later cells drop and add columns.
original = df_train.copy(deep = True)
df_train.head()

In [85]:
# Remove the row identifier from the training frame. Rebinding with
# errors='ignore' (rather than an in-place drop) keeps this cell re-runnable:
# a second execution no longer raises KeyError because 'id' is already gone.
df_train = df_train.drop(columns=['id'], errors='ignore')
# attributes = ['attribute_0','attribute_1','attribute_2','attribute_3']
# productCode = df_train['product_code'].unique()
# attribute_0 = df_train['attribute_0'].unique()
# attribute_1 = df_train['attribute_1'].unique()
# attribute_2 = df_train['attribute_2'].unique()
# attribute_3 = df_train['attribute_3'].unique()

In [8]:
# failure = {product:pd.DataFrame() for product in productCode}
# for product,_ in (failure.items()):
#     failure[product]['failureCount'] =df_train[df_train['product_code']==product].groupby('failure', as_index = False).agg({'failure':'count'})

In [86]:
# Columns compared between train and test: every test column after the first two.
sel_columns = df_test.columns[2:]
sel_columns

In [87]:
# For each selected column, overlay the train and test histograms and, on a
# secondary y-axis, the fraction of training rows per bin that have failure == 1.
sel_columns = df_test.columns[2:]
num_bins = 100
# squeeze=False guarantees a 2-D axes array even when only one row is needed.
fig, axs = plt.subplots(sel_columns.shape[0]//3+1, 3, figsize=(15,20), squeeze=False)
for k, col in enumerate(sel_columns):
    panel = axs[k//3][k%3]
    try:
        # Shared bin edges so the two histograms are directly comparable.
        bins_min = np.min([df_train[col].min(), df_test[col].min()])
        bins_max = np.max([df_train[col].max(), df_test[col].max()])
        bins = np.linspace(bins_min, bins_max, num_bins)
    except TypeError:
        # Values that cannot be reduced/interpolated numerically: use a fixed bin count.
        bins = 5
    df_train[col].hist(ax=panel, alpha=0.7, bins=bins, color="red", label="train", density=True)
    df_test[col].hist(ax=panel, alpha=0.7, bins=bins, color="blue", label="test", density=True)
    panel.legend()
    panel.set_title(col)
    clean_col = df_train[col].dropna()
    try:
        all_val, bin_ends = np.histogram(clean_col, bins=bins)
        failure_val, _ = np.histogram(df_train[col][df_train.failure == 1], bins=bins)
        # Bins with no rows have no defined failure fraction: leave them as NaN
        # (not drawn) instead of dividing by zero and emitting runtime warnings.
        failure_rate = np.divide(
            failure_val,
            all_val,
            out=np.full(all_val.shape, np.nan),
            where=all_val > 0,
        )
        ax = panel.twinx()
        ax.scatter(
            bin_ends[:-1],
            failure_rate,
            color="green",
            s=40,
            label="prob of failure")
        ax.legend()
    except TypeError:
        # Best effort: columns that cannot be histogrammed get no failure overlay.
        pass

# Lay out only after everything is drawn; calling it on the empty grid has no
# effect on the titles and secondary axes added afterwards.
fig.tight_layout()
    

In [88]:
## Count the missing values in each column of the training frame
cols = df_train.columns
print("Num of NaNs in each column of dataset")
train_nan_counts = df_train[cols].isna().sum()
for col, nan_count in train_nan_counts.items():
    print(f'{col} contains {nan_count} NaN values')

In [89]:
# Count the missing values in each column of the test frame.
cols = df_test.columns
print("Num of NaNs in each column of dataset")
test_nan_counts = df_test[cols].isna().sum()
for col, nan_count in test_nan_counts.items():
    print(f'{col} contains {nan_count} NaN values')

In [90]:
def add_stats(df):
    """Add row-wise summary statistics of the measurement columns, in place.

    The columns from "measurement_3" through "measurement_16" (label slice,
    both ends included) are reduced per row into three new columns:
    "mean_measurement", "median_measurement" and "std_measurement".

    Args:
        df: DataFrame containing the measurement columns; it is modified.

    Returns:
        None.
    """
    measurements = df.loc[:, "measurement_3": "measurement_16"]
    reducers = {
        "mean_measurement": measurements.mean,
        "median_measurement": measurements.median,
        "std_measurement": measurements.std,
    }
    for stat_column, reduce_rows in reducers.items():
        df[stat_column] = reduce_rows(axis=1)

In [91]:
# Add the row-wise measurement statistics to both splits.
for frame in (df_train, df_test):
    add_stats(frame)

In [92]:
def emphasis_missing(df):
    """Add boolean missing-value indicator columns, in place.

    "m_3_missing" is True where measurement_3 is NaN and "m_5_missing" is
    True where measurement_5 is NaN.

    Args:
        df: DataFrame with measurement_3 and measurement_5 columns; it is modified.

    Returns:
        None.
    """
    indicator_sources = (
        ('m_3_missing', 'measurement_3'),
        ('m_5_missing', 'measurement_5'),
    )
    for indicator, source in indicator_sources:
        df[indicator] = df[source].isna()
    

# Add the missing-value indicator columns to both splits.
for frame in (df_train, df_test):
    emphasis_missing(frame)

In [94]:
# products = {product:pd.DataFrame() for product in productCode}
# for product,_ in (products.items()):
#     products[product] = df_train[df_train['product_code']==product].reset_index(drop=True)

In [93]:
# For every training column that contains NaN, plot how many of the rows
# where that column is missing fall into each failure class (0 / 1).
nullColumns = df_train.columns[df_train.isna().any()].tolist()

# squeeze=False keeps axs two-dimensional even if a single row is enough.
fig, axs = plt.subplots(len(nullColumns)//3+1, 3, figsize=(15,20), squeeze=False)
plt.suptitle('Missing Data by failure class')
for k,col in enumerate(nullColumns):
    panel = axs[k//3][k%3]
    # value_counts + reindex always yields exactly two values in [0, 1] order,
    # even when one class never occurs among the missing rows. Aggregating the
    # grouping key itself (the previous approach) is handled differently
    # across pandas versions.
    failure_counts = (
        df_train.loc[df_train[col].isna(), 'failure']
        .value_counts()
        .reindex([0, 1], fill_value=0)
    )
    sns.barplot(ax=panel, x=[0,1], y=failure_counts.to_numpy(), color='b')
    panel.set_title(f" Failure counts if {col} is missing")

# Lay out after drawing and leave room at the top for the figure title.
fig.tight_layout(rect=[0, 0, 1, 0.97])

In [95]:
# Visualise where values are missing across the training frame.
msno.matrix(df_train)

In [110]:
def drawCorrelationMatrix(df):
    """Draw an annotated lower-triangle heatmap of the pairwise correlations in df.

    Only numeric and boolean columns are correlated. Passing string columns to
    ``DataFrame.corr`` raises on pandas versions where ``numeric_only``
    defaults to False, so they are filtered out explicitly.

    Args:
        df: DataFrame whose numeric/boolean columns are correlated.

    Returns:
        None. The heatmap is drawn on a new matplotlib figure.
    """
    cmap = sns.diverging_palette(250, 15, s=75, l=40,
                                 n=9, center="light", as_cmap=True)
    matrix = df.select_dtypes(include=["number", "bool"]).corr()
    # Hide the diagonal and the upper triangle: they duplicate the lower one.
    mask = np.triu(np.ones_like(matrix, dtype=bool))
    plt.figure(figsize=(16,12))
    plt.title("Correlation heatmap between all parameters")
    sns.heatmap(matrix, mask=mask, center=0, annot=True,
                fmt='.2f', square=True, cmap=cmap)
    
def getRedundantPairs(df):
    '''Return the set of (column_i, column_j) label pairs with j <= i.

    These are the diagonal and lower-triangular positions of a correlation
    matrix built over df's columns.
    '''
    names = df.columns
    return {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def getTopAbsCorrelations(df, n=5, method='pearson', threshold=0.3):
    '''Return the features that take part in at least one strongly correlated pair.

    The absolute correlation of every pair of distinct numeric/boolean columns
    is computed; pairs with a value of at least ``threshold`` are printed and
    the features appearing in them are returned, ordered by descending
    correlation of the first pair in which each appears.

    Self-correlations are excluded by comparing labels, not by testing
    ``< 1.0``. A perfectly correlated pair of distinct columns is therefore
    kept, and a diagonal entry that is not exactly 1.0 cannot slip through.

    Args:
        df: DataFrame whose columns are correlated.
        n: Kept for backward compatibility; it does not limit the result.
        method: 'pearson', 'spearman' or 'kendall' (passed to DataFrame.corr).
        threshold: Minimum absolute correlation for a pair to be selected.

    Returns:
        List of column labels involved in at least one selected pair.
    '''
    numeric = df.select_dtypes(include=["number", "bool"])
    au_corr = numeric.corr(method=method).abs().unstack()
    au_corr = au_corr.sort_values(ascending=False)

    first_label = au_corr.index.get_level_values(0)
    second_label = au_corr.index.get_level_values(1)
    is_distinct_pair = np.asarray(first_label != second_label)
    is_strong = (au_corr >= threshold).to_numpy()  # NaN correlations compare False

    high_corr_pairs = au_corr[is_distinct_pair & is_strong]
    print(high_corr_pairs)
    features_to_merge = high_corr_pairs.index.get_level_values(0).unique().tolist()
    print("{} features with absolute correlation >= {} in given dataset".format(
        len(features_to_merge), threshold))
    return features_to_merge

In [107]:
# Correlation heatmap of the training frame (including the columns added above).
drawCorrelationMatrix(df_train)

In [111]:
# Restrict the search for correlated features to the measurement_0 .. measurement_17 columns (label slice, both ends included).
features_to_merge = getTopAbsCorrelations(df_train.loc[:,"measurement_0":"measurement_17" ])

In [112]:
# Display the features selected for merging.
features_to_merge

In [113]:
# Pipeline that collapses the correlated measurements into one feature:
# KNN-impute missing values, mean-center (with_std=False: no rescaling), then
# keep a single principal component. It is fitted on the training frame only.
numerical_transformer = KNNImputer(n_neighbors=5)  # KNN imputation (alternative tried: SimpleImputer(strategy='mean'))
scaler = StandardScaler(with_std=False)
pca = PCA(n_components=1, svd_solver='full', random_state=0)
pipeline = Pipeline([
    ('preprocess', numerical_transformer),
    ('scaler', scaler),
    ('pca', pca)])


pipeline.fit(df_train[features_to_merge])


In [114]:
# Apply the pipeline fitted on the training frame to both splits (no refit on test).
measure_merge_train = pipeline.transform(df_train[features_to_merge])
measure_merge_test = pipeline.transform(df_test[features_to_merge])

In [115]:
# Store the merged component as a new column in each frame.
df_train["measurement_merged"] = measure_merge_train
df_test["measurement_merged"] = measure_merge_test

In [116]:
# pop() removes the target column and the assignment re-adds it, which moves 'failure' to the last position.
df_train["failure"] = df_train.pop('failure')

In [119]:
# Drop the features whose absolute correlation with the target is at most 0.007.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# string columns on pandas versions where numeric_only defaults to False.
numeric_train = df_train.select_dtypes(include=["number", "bool"])
correlation_val = np.abs(numeric_train.corr()["failure"]).sort_values()
drop_columns = list(correlation_val[correlation_val <=0.007].index)

# Rebind instead of dropping in place so both frames are updated the same way
# and no earlier reference to them is silently mutated.
df_train = df_train.drop(columns=drop_columns)
df_test = df_test.drop(columns=drop_columns)

In [120]:
# Separate the predictors from the target column.
X = df_train.drop(columns=['failure'])
y = df_train['failure']

In [121]:
# Hold out 20% of the training rows for validation, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified on y — consider stratify=y since the notebook treats the target as imbalanced.
seed = 1
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

In [122]:
# Split the predictors by dtype; each group gets its own preprocessing branch below.
# NOTE(review): boolean columns (e.g. the *_missing flags) presumably land in the non-numeric group — confirm this is intended.
num_columns =  X.select_dtypes(include=["number"]).columns # numeric columns
non_num_columns = X.select_dtypes(exclude=["number"]).columns

In [123]:
# Numeric branch: fill missing values by KNN imputation (alternative tried: SimpleImputer(strategy='mean')).
numerical_transformer = KNNImputer(n_neighbors=5)

# Non-numeric branch: ordinal-encode first (categories not seen during fit are
# encoded as -1), then replace any remaining NaN with the most frequent value.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)), 
        ("imputer", SimpleImputer(strategy="most_frequent"))])

# Route each column group to its branch; columns not listed in either group are not passed on.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_columns),
        ('cat', categorical_transformer, non_num_columns)
    ])

In [124]:
# Per-sample weights that balance the classes of y_train.
# NOTE(review): classes_weights is not used by any cell visible here (the model below relies on SMOTE instead) — confirm whether it is still needed.
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

In [126]:
# Fit the full model pipeline: preprocessing -> SMOTE oversampling ->
# standardisation -> logistic regression, then report AUC and F1 on the
# training and validation splits.
random_state = 42
clf = LogisticRegression(random_state=random_state)
scaler = StandardScaler()
sampler = SMOTE(random_state=0)
# imblearn's Pipeline is required here: the sampler step only resamples during fit.
# NOTE(review): SMOTE runs before scaling, so its neighbour search works on
# unscaled features — consider moving 'scaler' ahead of 'sampler'.
pipeline = Pipeline2([
    ('preprocess', preprocessor),
    ('sampler', sampler),
    ('scaler', scaler),
    ('clf', clf)
])
pipeline.fit(X_train, y_train)

## Train scores
y_pred = pipeline.predict(X_train)
probs_train = pipeline.predict_proba(X_train)[:, 1]
auc_train = roc_auc_score(y_train, probs_train)
f1score_train = f1_score(y_train, y_pred)
print(f"AUC score train: {auc_train}, f1 score train: {f1score_train}")

## Validation scores (rows held out from fitting)
y_pred = pipeline.predict(X_val)
# Stored under its own name so the training probabilities are not overwritten.
probs_val = pipeline.predict_proba(X_val)[:, 1]
auc_val = roc_auc_score(y_val, probs_val)
f1score_val = f1_score(y_val, y_pred)
# These are validation metrics; the separate test split has no labels in this notebook.
print(f"AUC score validation: {auc_val}, f1 score validation: {f1score_val}")

In [127]:
# Persist the fitted pipeline and load it back. Context managers guarantee
# both file handles are flushed and closed (the bare open() calls were never
# closed explicitly).
filename = '/kaggle/working/finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Only unpickle files produced by this notebook: pickle.load executes
# arbitrary code when given untrusted data.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [130]:
# Score the test frame with the reloaded pipeline, keeping the second predict_proba column
# (presumably the probability of failure == 1 for a 0/1 target — confirm via loaded_model.classes_).
y_pred_proba = loaded_model.predict_proba(df_test)[:, 1]

In [131]:
# Build the submission frame: ids come from the sample submission file,
# predicted probabilities are attached to them by position.
df_sample_sub = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2022/sample_submission.csv")
df_submission = pd.DataFrame({
    "id": df_sample_sub.id,
    "failure": y_pred_proba,
})

In [132]:
# Write the submission file to the current working directory, without the index column.
df_submission.to_csv("submission.csv", index=False)