In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import skew,boxcox
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [2]:


def remove_skewness(data) :
    columns = data.columns

    # removing the skewness from the data set
    skew_threshold = 0.5

    for col in columns : 
        skewness_before = skew(data[col])

        if abs(skewness_before) > skew_threshold:
            # Apply Box-Cox transformation and find the best lambda
            min_value = data[col].min()
            if (min_value<=0):
                data[col] += (-min_value+1)
            transformed_data, lambda_best_fit = boxcox(data[col])  # Adding 1 to avoid zero values
            data[col] = transformed_data  # Replace the original feature with the transformed data
        
        skewness_after = skew(data[col])

    return data


def oneHotEncoding(data):
    # performing one hot encoding on discrete features
    discrete_features = ['cp_dose','cp_time','cp_type']
    data = pd.get_dummies(data,columns=discrete_features,dtype=int)
    return data 

def remove_outliers(data,Y):
    s = set()
    columns = data.columns

    for col in columns:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1

        # Define lower and upper bounds for outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        for index, row in data.iterrows():
            if index in s:
                continue
            if (row[col]<lower_bound or row[col]>upper_bound):
                data = data.drop(index,axis=0)
                
                Y = Y.drop(index,axis=0)
                
                s.add(index)
    data = data.reset_index(drop = True)
    Y = Y.reset_index(drop = True)
    data = pd.DataFrame(data)
    Y = pd.DataFrame(Y)
    return data, Y

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tqdm import tqdm

# Load data
train_features = pd.read_csv('train_features.csv')
train_targets = pd.read_csv('train_targets_scored.csv')
train_features = train_features.drop('sig_id', axis=1)
train_targets = train_targets.drop('sig_id', axis=1)

train_features, test_features, train_scored, test_scored = train_test_split(
    train_features, train_targets, test_size=0.2, random_state=42
)

train_features = train_features.reset_index(drop=True)
train_targets = train_targets.reset_index(drop=True)
train_scored = train_scored.reset_index(drop=True)
test_scored = test_scored.reset_index(drop=True)

train_features = train_features[:5000]
test_features = test_features[:5000]
train_scored = train_scored[:5000]
test_scored = test_scored[:5000]

# Preprocess data
# train_features = remove_skewness(train_features)
train_features = oneHotEncoding(train_features)
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

# K-Fold Cross Validation
kfold = 3
kf = KFold(n_splits=kfold, shuffle=True, random_state=42)

# Neural Network parameters
input_dim = train_features.shape[1]
output_dim = train_scored.shape[1]
epochs = 10
batch_size = 32

# List of PCA values
pca_values = [50, 100, 200, 500]

for pca_value in pca_values:
    print(f"PCA: {pca_value}")
    avg_training = []
    avg_validation = []

    for train_index, test_index in tqdm(kf.split(train_features), total=kfold, desc="KFold Progress"):
        cross_val_scores = []
        train_cross_val_score = []

        X_train, X_test = train_features[train_index], train_features[test_index]
        y_train, y_test = train_scored.iloc[train_index].values, train_scored.iloc[test_index].values

        # Apply PCA
        pca = PCA(n_components=pca_value)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

        # Build the neural network model
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(pca_value,)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy')

        # Train the model
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0)

        # Make predictions
        y_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

        # Calculate cross-entropy loss
        loss = np.mean(tf.keras.losses.binary_crossentropy(y_test, y_pred))
        loss_train = np.mean(tf.keras.losses.binary_crossentropy(y_train, y_train_pred))

        cross_val_scores.append(loss)
        train_cross_val_score.append(loss_train)

        avg_training.append(np.mean(train_cross_val_score))
        avg_validation.append(np.mean(cross_val_scores))

    # Print or use the average training and validation scores as needed
    print("Average Training Loss:", np.mean(avg_training))
    print("Average Validation Loss:", np.mean(avg_validation))
    print("="*50)


PCA: 50


KFold Progress:   0%|          | 0/3 [00:00<?, ?it/s]



KFold Progress:  33%|███▎      | 1/3 [00:03<00:07,  3.81s/it]



KFold Progress:  67%|██████▋   | 2/3 [00:06<00:03,  3.44s/it]



KFold Progress: 100%|██████████| 3/3 [00:10<00:00,  3.35s/it]


Average Training Loss: 0.015783455
Average Validation Loss: 0.018885834
PCA: 100


KFold Progress:   0%|          | 0/3 [00:00<?, ?it/s]



KFold Progress:  33%|███▎      | 1/3 [00:03<00:06,  3.11s/it]



KFold Progress:  67%|██████▋   | 2/3 [00:06<00:03,  3.06s/it]



KFold Progress: 100%|██████████| 3/3 [00:09<00:00,  3.06s/it]


Average Training Loss: 0.014804944
Average Validation Loss: 0.018747715
PCA: 200


KFold Progress:   0%|          | 0/3 [00:00<?, ?it/s]



KFold Progress:  33%|███▎      | 1/3 [00:03<00:07,  3.52s/it]



KFold Progress:  67%|██████▋   | 2/3 [00:06<00:03,  3.43s/it]



KFold Progress: 100%|██████████| 3/3 [00:10<00:00,  3.39s/it]


Average Training Loss: 0.0136337085
Average Validation Loss: 0.018468352
PCA: 500


KFold Progress:   0%|          | 0/3 [00:00<?, ?it/s]



KFold Progress:  33%|███▎      | 1/3 [00:04<00:08,  4.42s/it]



KFold Progress:  67%|██████▋   | 2/3 [00:08<00:04,  4.41s/it]



KFold Progress: 100%|██████████| 3/3 [00:14<00:00,  4.85s/it]

Average Training Loss: 0.011711046
Average Validation Loss: 0.018377213



