In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In the first section we will visualize and explain what PCA is and how it reduces the number of columns in a dataset, a remedy for the problem popularly known as the 'curse of dimensionality'. Then we will visualize a high-dimensional dataset using t-SNE to see whether the data is actually separable. Lastly, we will apply PCA followed by t-SNE to visualize the data.

In the second section we will take a conventional approach and build a model using XGBoost's XGBClassifier.

In [None]:
# Read the training CSV and preview its first rows.
train_csv_path = '../input/anomaly-detection/Participants_Data_WH18/Train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Read the competition test CSV and preview its first two rows.
test_csv_path = "../input/anomaly-detection/Participants_Data_WH18/Test.csv"
test = pd.read_csv(test_csv_path)
test.head(2)

In [None]:
# Count how many rows belong to each value of the "Class" target column.
train["Class"].value_counts()

In [None]:
# Histogram of the "Class" target column to visualise the class balance.
plt.hist(train["Class"])

In [None]:
# Draw one histogram per column for the first three columns of the frame.
leading_columns = train.columns[:3]
for column_name in leading_columns:
    plt.hist(train[column_name])
    plt.title(column_name)
    plt.show()

# Resampling the Data

Here we will first set aside a test set from our original dataset and then resample the remaining training data using SMOTE. This gives us a dataset which is untouched and on which we can test our final model. We will also create a validation dataset from our training data for use with cross-validation techniques.

In [None]:
# SMOTE synthesises minority-class samples at random; a fixed seed keeps the
# resampled training set identical across kernel restarts.
Sm = SMOTE(random_state=52)

# Feature matrix (every column except the target) and the target vector.
X = train.drop("Class", axis = 1)
y = train["Class"]

Splitting the original Dataset to create a Test set

In [None]:
# Hold out 10% of the original rows, stratified on the target so both parts
# keep the same class ratio. The fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.1, stratify = y, random_state = 52
)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

Resampling the Training Dataset (oversampling is used) to create a balance in the train set.

In [None]:
# Oversample the training part only; note that `y_train` is rebound to the
# resampled labels, so it no longer matches `X_train` after this cell runs.
resampled_features, resampled_labels = Sm.fit_resample(X_train, y_train)
x_train, y_train = resampled_features, resampled_labels
(x_train.shape, y_train.shape, y_train.value_counts())

In [None]:
# Split the SMOTE-balanced data into train/validation parts (seeded so the
# split is reproducible).
# NOTE(review): this split happens after oversampling, so the validation part
# contains synthetic rows derived from training rows; scores on it are likely
# optimistic compared with the untouched hold-out (X_valid, y_valid).
x_train_re, x_valid_re, y_train_re, y_valid_re = train_test_split(
    x_train, y_train, test_size = 0.1, stratify = y_train, random_state = 52
)

Training a simple Classifier using Random Forest and Catboost

In [None]:
from sklearn.ensemble import RandomForestClassifier
import catboost
from sklearn.metrics import auc, roc_curve

In [None]:
def metric(preds, target):
    """Return the area under the ROC curve for `preds` scored against `target`.

    The ROC curve is built with `roc_curve(target, preds)` and integrated
    with `auc`.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(target, preds)
    roc_area = auc(false_positive_rate, true_positive_rate)
    return roc_area

In [None]:
# Baseline random forest trained on the SMOTE-balanced training split
# (seeded so the reported scores are reproducible).
Rf = RandomForestClassifier(random_state=52)
model_Rf = Rf.fit(x_train_re, y_train_re)

# ROC AUC ranks samples by score, so feed it the positive-class probability
# rather than hard 0/1 labels (hard labels collapse the curve to one point).
preds = model_Rf.predict_proba(x_valid_re)[:, -1]
print(metric(preds, y_valid_re))

# Also score the untouched hold-out set, which contains no synthetic rows.
holdout_preds = model_Rf.predict_proba(X_valid)[:, -1]
print("Untouched hold-out AUC:", metric(holdout_preds, y_valid))


# PCA for Dimensionality Reduction

Principal Component Analysis (PCA) is a technique used to reduce the dimensionality of a dataset; in short, it reduces the number of columns. We do not simply throw columns away: the reduction is done systematically, using the eigenvalues and eigenvectors of the data to extract the principal component axes. These axes are very good at explaining the variance in the dataset without losing much of its information.

In short, these axes, or principal components, help us represent our high-dimensional data with nearly equivalent information in a lower-dimensional space. PCA is a very popular dimensionality-reduction technique. We will apply PCA to our dataset and also use the t-SNE algorithm to visualize the reduced dataset.

For this we will use our original Dataset.

In [None]:
# Three-component PCA with a fixed seed; fitted in a later cell.
pca = PCA(n_components=3, random_state=52)

In [None]:
# Fit the PCA on the full feature matrix X and project X onto the 3 components.
pca_result = pca.fit_transform(X)

In [None]:
# Fraction of the total variance explained by each of the fitted components.
print(pca.explained_variance_ratio_)

In [None]:
# Work on a copy of the training frame and attach one column per PCA component.
train_copy = train.copy()
pca_column_names = ('pca-one', 'pca-two', 'pca-three')
for component_index, column_name in enumerate(pca_column_names):
    train_copy[column_name] = pca_result[:, component_index]

In [None]:
# Shuffle the row order so neither class is systematically drawn on top.
rndperm = np.random.permutation(train_copy.shape[0])
shuffled_rows = train_copy.loc[rndperm, :]
class_palette = sns.color_palette("hls", 2)

# First two principal components, coloured by class.
plt.figure(figsize=(10,8))
sns.scatterplot(
    data=shuffled_rows,
    x="pca-one",
    y="pca-two",
    hue="Class",
    palette=class_palette,
    legend="full",
    alpha=0.3,
)

3D version of the same plot:

In [None]:
# 3D scatter of the three principal components, coloured by class.
# `Figure.gca(projection=...)` is deprecated and rejected by newer Matplotlib
# releases; `add_subplot(..., projection='3d')` is the supported way to create
# the 3D axes.
ax = plt.figure(figsize=(10,8)).add_subplot(111, projection='3d')

# Build the shuffled view once instead of re-indexing the frame per argument.
shuffled_rows = train_copy.loc[rndperm, :]
ax.scatter(
    xs=shuffled_rows["pca-one"],
    ys=shuffled_rows["pca-two"],
    zs=shuffled_rows["pca-three"],
    c=shuffled_rows["Class"],
    cmap='tab10'
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

# t-SNE

In [None]:
import time

# t-SNE is stochastic; a fixed seed keeps the embedding reproducible.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=52)
tsne_results = tsne.fit_transform(X)
# Report the elapsed time so the timer started above is actually used.
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Store the two t-SNE coordinates next to the original columns.
for axis_index, column_name in enumerate(('tsne-2d-one', 'tsne-2d-two')):
    train_copy[column_name] = tsne_results[:, axis_index]

# Scatter of the t-SNE embedding, coloured by class.
class_palette = sns.color_palette("hls", 2)
plt.figure(figsize=(10,8))
sns.scatterplot(
    data=train_copy,
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue="Class",
    palette=class_palette,
    legend="full",
    alpha=0.3,
)

# Using PCA and t-SNE together

We will now use the reduced dimensions from the PCA to visualize the data using t-SNE.

In [None]:
# Reduce X to 50 principal components and report how much variance they keep.
pca_50 = PCA(n_components=50)
pca_result_50 = pca_50.fit_transform(X)
explained_by_50 = np.sum(pca_50.explained_variance_ratio_)
summary_template = 'Cumulative explained variation for 50 principal components: {}'
print(summary_template.format(explained_by_50))

In [None]:
# Run t-SNE on the 50 PCA components and time it.
# t-SNE is stochastic; a fixed seed keeps the embedding reproducible.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
time_start = time.time()
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=1000, random_state=52)
tsne_pca_results = tsne.fit_transform(pca_result_50)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Store the coordinates of the t-SNE embedding computed on the PCA output.
train_copy['tsne-pca50-one'] = tsne_pca_results[:,0]
train_copy['tsne-pca50-two'] = tsne_pca_results[:,1]

# Side-by-side comparison: PCA, t-SNE on raw features, t-SNE on PCA output.
panel_columns = [
    ("pca-one", "pca-two"),
    ("tsne-2d-one", "tsne-2d-two"),
    ("tsne-pca50-one", "tsne-pca50-two"),
]
plt.figure(figsize=(16,4))
panel_axes = []
for slot, (x_column, y_column) in enumerate(panel_columns, start=1):
    panel = plt.subplot(1, 3, slot)
    sns.scatterplot(
        x=x_column, y=y_column,
        hue="Class",
        palette=sns.color_palette("hls", 2),
        data=train_copy,
        legend="full",
        alpha=0.3,
        ax=panel
    )
    panel_axes.append(panel)
ax1, ax2, ax3 = panel_axes
ax3

# Normal Approach

In [None]:
# Reload the training CSV from disk so this section starts from the raw data.
train_csv_path = '../input/anomaly-detection/Participants_Data_WH18/Train.csv'
train = pd.read_csv(train_csv_path)
train.head(2)

Let's check whether there are any duplicate columns present in the dataset.

In [None]:
# Drop columns whose values duplicate an earlier column (first one is kept).
# Selecting with a boolean mask, instead of transposing the frame back and
# forth, keeps each surviving column's original dtype.
train = train.loc[:, ~train.T.duplicated()]

In [None]:
# (rows, columns) of the frame after duplicate columns were removed.
train.shape

Let's check whether any column contains only one unique value; if that is the case, we simply delete it.

In [None]:
# Wide default figure so three box plots fit side by side (affects later figures too).
plt.rcParams['figure.figsize'] = 20,6

# One box plot per feature, grouped by class. x/y are passed by keyword
# because newer seaborn releases no longer accept them positionally.
for position, feature_name in enumerate(["feature_1", "feature_2", "feature_3"], start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(x=train["Class"], y=train[feature_name])


In [None]:
# Count, for every column, how many entries are exactly zero.
is_zero = train == 0
zero_counts = is_zero.astype(int).sum(axis=0)
df = pd.DataFrame(zero_counts)

In [None]:
# Display the per-column zero counts (one row per column of `train`).
df

In [None]:
# Columns with more than this many zero entries are treated as (almost) all-zero.
# NOTE(review): 1761 is presumably tied to the number of rows in Train.csv -- confirm.
ZERO_COUNT_THRESHOLD = 1761

all_zero = df[df[0] > ZERO_COUNT_THRESHOLD].index
# Never drop the target, even if most of its values are zero.
all_zero = all_zero.drop("Class", errors="ignore")

# Reassign instead of dropping in place, and ignore already-removed columns,
# so re-running this cell does not raise a KeyError.
train = train.drop(columns=all_zero, errors="ignore")

In [None]:
# Summary of the remaining columns, their dtypes and non-null counts.
train.info()

Building the model

In [None]:
# Features are every remaining column except the target.
X = train.drop("Class", axis = 1)
y = train["Class"]

# 80/20 stratified split; the fixed seed makes the split reproducible.
X_train, X_valid , y_train, y_valid = train_test_split(
    X, y, test_size = 0.2, stratify = y, random_state = 52
)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted tree classifier for the imbalanced binary target.
# `verbosity=0` replaces the deprecated `silent=True` flag.
model = XGBClassifier(
    verbosity=0,
    booster='gbtree',
    scale_pos_weight=5,   # weight applied to the positive class
    learning_rate=0.01,
    colsample_bytree=0.7,
    subsample=0.5,
    max_delta_step=3,
    reg_lambda=2,
    objective='binary:logistic',
    n_estimators=818,
    max_depth=8,
)

In [None]:
%%time
# Fit the classifier, monitoring log loss on the validation split and stopping
# once it has not improved for 50 rounds.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` and
# `eval_metric` on the constructor rather than in fit() -- confirm against the
# installed version.
# NOTE(review): the same (X_valid, y_valid) split is used for early stopping
# and for scoring below, so that score may be optimistic.
eval_set = [(X_valid, y_valid)]
eval_metric = ["logloss"]
model.fit(X_train, y_train,early_stopping_rounds=50, eval_metric=eval_metric, eval_set=eval_set)

Evaluating the model

In [None]:
# Probability of the last class column for every validation row.
class_probabilities = model.predict_proba(X_valid)
predictions = class_probabilities[:, -1]

# ROC AUC of those probabilities against the validation labels.
score = roc_auc_score(y_valid, predictions)
score

Using the PCA to remove some columns from the Dataset

In [None]:
# Project the cleaned feature matrix onto its first three principal components.
pca_3 = PCA(n_components=3)
pca_result_3 = pca_3.fit_transform(X)
# Report the variance kept by *this* 3-component fit (the previous message
# printed the figure of the unrelated 50-component PCA).
print('Cumulative explained variation for 3 principal components: {}'.format(np.sum(pca_3.explained_variance_ratio_)))

In [None]:
# Wrap the three PCA components in a DataFrame with readable column names.
pca_column_names = ["pca1", 'pca2', 'pca3']
df_3 = pd.DataFrame(data=pca_result_3, columns=pca_column_names)

df_3.head(2)

In [None]:
# 80/20 stratified split of the PCA features; the fixed seed makes the split
# reproducible.
X_train, X_valid , y_train, y_valid = train_test_split(
    df_3, y, test_size = 0.2, stratify = y, random_state = 52
)

In [None]:
%%time
# Refit the same `model` object on the 3-component PCA features, with early
# stopping on validation log loss after 50 rounds without improvement.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` and
# `eval_metric` on the constructor rather than in fit() -- confirm against the
# installed version.
eval_set = [(X_valid, y_valid)]
eval_metric = ["logloss"]
model.fit(X_train, y_train,early_stopping_rounds=50, eval_metric=eval_metric, eval_set=eval_set)

In [None]:
# Probability of the last class column for every validation row (PCA features).
class_probabilities = model.predict_proba(X_valid)
predictions = class_probabilities[:, -1]

# ROC AUC of the PCA-based model on the validation labels.
score2 = roc_auc_score(y_valid, predictions)
score2