In [None]:
# Libraries to handle dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB

# Silence every warning for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Remove pandas' display truncation on both axes so wide/long frames print in full
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Reading Datasets

In [None]:
# Single place to point the notebook at the competition files.
DATA_DIR = '../input/santander-customer-satisfaction'

# Train Dataset (the ID column becomes the row index)
train = pd.read_csv(DATA_DIR + '/train.csv', index_col=['ID'])

# Test Dataset (first column, ID, becomes the row index)
test = pd.read_csv(DATA_DIR + '/test.csv', index_col=0)

# Sample submission, kept with its default integer index
samp_sub = pd.read_csv(DATA_DIR + '/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame to sanity-check the load.
train.head()

In [None]:
# List the distinct values present in the TARGET column.
train['TARGET'].unique()

In [None]:
# (rows, columns) of the training frame; ID is the index, so it is not counted as a column.
train.shape

In [None]:
# Count NaNs per column and list only the columns that contain any; an empty
# result means the training data has no missing values.
# (Calling .info() on the isna() mask cannot answer this: the mask holds only
# True/False, so every cell is reported as non-null.)
missing_per_column = train.isna().sum()
missing_per_column[missing_per_column > 0]

In [None]:
# Show the dtype pandas inferred for each column of the training frame.
train.dtypes      # DataTypes

# Split DataSet

In [None]:
# The label is the last column; every column before it is a feature.
X, Y = train.iloc[:, :-1], train.iloc[:, -1]

In [None]:
# Confirm that features and target have the same number of rows.
X.shape, Y.shape

# Standardization

In [None]:
# Learn the per-feature scaling statistics from the training features only,
# then apply them to those same features.
scaler = StandardScaler().fit(X)
Xscaled = scaler.transform(X)

In [None]:
# Reuse the scaler fitted on the training features (no refit on test) so both
# sets are scaled with the same statistics.
testscaled = scaler.transform(test)

In [None]:
# Eyeball the scaled train and test arrays, one after the other.
print(Xscaled, testscaled, sep='\n')

# MODEL :: PCA 

In [None]:
# PCA with default settings (no n_components limit); the cells below use it to
# inspect how variance is spread across components.
pca = PCA()

In [None]:
# Fit the PCA on the scaled training features and keep their projection.
principalComponents = pca.fit_transform(Xscaled)

In [None]:
# Variance captured by each principal component.
print(pca.explained_variance_)

In [None]:
# Total variance across all fitted components.
print(pca.explained_variance_.sum())

In [None]:
# Fraction of the total variance captured by each component.
print(pca.explained_variance_ratio_)

In [None]:
# Same ratios expressed as percentages.
print(pca.explained_variance_ratio_ * 100)

In [None]:
# Number of components the fitted PCA produced.
len(pca.explained_variance_ratio_)    # Checking Length

# Visualization

In [None]:
### Scree Plot: cumulative explained variance (%) against number of components
y = np.cumsum(pca.explained_variance_ratio_ * 100)
# Build the x-axis from the data instead of hard-coding the component count,
# so the plot still works if the number of features changes.
x = np.arange(1, len(y) + 1)
plt.plot(x, y)
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance (%)')
plt.title('Cumulative explained variance')
plt.show()

In [None]:
# Sum of the explained-variance ratios over all components.
pca.explained_variance_ratio_.sum()

In [None]:
# Per-component explained variance as a percentage.
temp = pca.explained_variance_ratio_ * 100

In [None]:
# Percentage of variance explained by the first 107 components together.
temp[:107].sum()

In [None]:
# Reduced PCA that keeps 107 components (the count examined in the cell above).
# random_state is fixed for repeatability; it only has an effect if scikit-learn
# selects a randomized/arpack solver.
new_pca = PCA(n_components=107, random_state = 2021)

In [None]:
# Fit the reduced PCA on the scaled training features. The projected array is
# only displayed as the cell output here; it is not assigned to a variable.
new_pca.fit_transform(Xscaled)

In [None]:
# Project the scaled test features with the PCA fitted on the training data.
# NOTE(review): this overwrites `testscaled` with its own projection, so the cell
# is not re-runnable on its own; a second run would feed already-projected data
# back into new_pca. Re-run the scaling cell first if this needs repeating.
testscaled = new_pca.transform(testscaled)

In [None]:
# Inspect the PCA-projected test features.
testscaled

In [None]:
# Project the training features with new_pca, the same fitted 107-component PCA
# that projects the test set. Slicing columns out of the separately fitted full
# `pca` instead could leave train and test in component spaces that do not
# match exactly (different solver / approximation), which would undermine the
# classifier trained on one and applied to the other.
PCX = pd.DataFrame(new_pca.transform(Xscaled))

# MODEL :: Gaussian Naive Bayes

In [None]:
# Train a Gaussian Naive Bayes classifier on the PCA-reduced training features.
gaussian = GaussianNB().fit(PCX, Y)
gaussian      # show the fitted estimator as the cell output

In [None]:
# Class-membership probabilities for every test row: one column per class, in
# the order of gaussian.classes_. Despite its name, y_pred holds probabilities,
# not hard labels.
y_pred = gaussian.predict_proba(testscaled)

In [None]:
# Class probabilities for the first test row.
y_pred[0]

In [None]:
# Check the layout of the sample submission file.
samp_sub.head()

In [None]:
# Build the submission directly from the probability column that is needed.
# predict_proba yields one column per class; column 1 belongs to the second
# entry of gaussian.classes_ (presumably TARGET == 1; confirm against
# train['TARGET'].unique()). The column name is taken from the sample file.
# Row IDs come from `test`, the frame the predictions were made for, so each
# probability stays attached to its own row regardless of the sample file's order.
submit = pd.DataFrame(
    {samp_sub.columns[1]: y_pred[:, 1]},
    index=pd.Index(test.index, name='ID'),
)
submit.head()

In [None]:
# (rows, columns) of the submission frame.
submit.shape

In [None]:
# Write the submission file; the frame's index is written as the first CSV column.
submit.to_csv("GaussianNB_pca.csv")