In [None]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import zscore
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by later cells.
warnings.filterwarnings("ignore")
# Apply seaborn's default styling; color_codes=True remaps matplotlib's
# shorthand colour codes ('b', 'g', 'r', ...) to the seaborn palette.
sns.set(color_codes=True)

In [None]:
# Read the training table from the input directory and preview its first rows.
train_csv_path = "../input/santander-customer-satisfaction/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Print pandas' structural summary of the training frame (index, dtypes, memory usage).
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Count missing values per column and list only the columns that have any.
nulls = df.isnull().sum()
print(nulls.loc[nulls.gt(0)])

In [None]:
# Total number of missing cells across the whole training frame.
df.isnull().sum().sum()

In [None]:
# Same total via isna(); pandas documents isnull() as an alias of isna(),
# so this reports the same number as an isnull()-based count.
df.isna().sum().sum()

In [None]:
# Per-column missing-value counts, restricted to columns with at least one gap.
nas = df.isna().sum()
has_missing = nas > 0
print(nas[has_missing])

In [None]:
# Show the dtype pandas inferred for every column.
df.dtypes

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Keep the per-column dtypes in a variable and print them as plain text.
coltypes = df.dtypes
print(coltypes)

In [None]:
# Pie chart of how often each TARGET value occurs; slices are labelled with
# their share as a percentage to two decimals.
target_counts = df['TARGET'].value_counts()
target_counts.plot.pie(autopct='%1.2f%%', shadow=True)

In [None]:
# Separate the rows by TARGET value and report the size of each subset.
target_values = df['TARGET']
dfTarget_0 = df.loc[target_values == 0]
dfTarget_1 = df.loc[target_values == 1]
print(dfTarget_0.shape, dfTarget_1.shape, sep="\n")

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Draw TARGET == 1 rows with replacement until there are as many of them as
# there are TARGET == 0 rows in the full frame.
# NOTE(review): the duplication happens before the train/test split further
# down, so copies of one original row can reach both partitions unless that
# split groups rows by their original index label.
majority_count = (df['TARGET'] == 0).sum()
dfTarget_1_up_sampled = resample(
    dfTarget_1,
    replace=True,
    n_samples=majority_count,
    random_state=1,
)
print(dfTarget_1_up_sampled.shape)

In [None]:
# Stack the TARGET == 0 rows on top of the resampled TARGET == 1 rows.
# Row labels are kept as they are, so duplicated rows share a label.
frames_to_stack = [dfTarget_0, dfTarget_1_up_sampled]
df_upsampled = pd.concat(frames_to_stack, axis=0)
print(df_upsampled.shape)

In [None]:
# Feature matrix: everything except the label and the row identifier.
non_feature_columns = ['TARGET', 'ID']
X = df_upsampled.drop(columns=non_feature_columns)
# Label vector.
Y = df_upsampled.loc[:, 'TARGET']

In [None]:
# Shapes of the feature matrix and the label vector, one per line.
print(X.shape, Y.shape, sep="\n")

In [None]:
# Alternative balancing strategy, currently disabled: generate minority-class
# samples with SMOTE instead of duplicating existing rows.
#oversample = SMOTE()
#X, Y = oversample.fit_resample(X, Y)

In [None]:
# Re-check the shapes of X and Y before scaling.
print(f"{X.shape}\n{Y.shape}")

In [None]:
# Learn per-column mean and standard deviation from X, then standardise X
# with those statistics.
scaler = StandardScaler().fit(X)
XScaled = scaler.transform(X)

In [None]:
# Fit a PCA that keeps all components, to inspect the explained-variance
# spectrum of the standardised features. The bare name echoes the estimator.
pca = PCA().fit(XScaled)
pca

In [None]:
# Number of principal components held by the fitted PCA.
len(pca.explained_variance_)

In [None]:
# Variance captured by each component, first in absolute terms and then as a
# fraction of the total.
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

In [None]:
# Running total of the explained-variance fractions, component by component.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Line plot of cumulative explained variance (in percent) against the number
# of components kept, counted from 1.
x = np.arange(len(pca.explained_variance_)) + 1
y = (100 * pca.explained_variance_ratio_).cumsum()
fig, ax = plt.subplots()
ax.plot(x, y)
plt.show()

In [None]:
# Smallest number of leading components whose cumulative explained-variance
# fraction reaches the threshold. argmax on the boolean array gives the index
# of the first True, and the +1 turns that index into a count.
variance_threshold = 0.995
cumsum = pca.explained_variance_ratio_.cumsum()
d = 1 + np.argmax(cumsum >= variance_threshold)
d

In [None]:
# Refit PCA keeping only the leading components needed to reach the 99.5 %
# cumulative-variance threshold. `d` is the count derived from the cumulative
# explained-variance curve of the full decomposition, so the size follows the
# data instead of a hand-copied literal that goes stale when anything
# upstream changes. int() turns the NumPy scalar into a plain Python int.
# random_state fixes the seed used if scikit-learn selects a randomized
# solver, so re-running the notebook gives the same components.
pca = PCA(n_components=int(d), random_state=100)
pca.fit(XScaled)

In [None]:
# Cumulative explained-variance fraction of the components kept by the
# current PCA; the last entry is the total fraction retained.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Project the standardised features with the PCA that is already fitted.
# transform (rather than fit_transform) avoids a second, redundant
# decomposition and guarantees the projection uses exactly the components
# whose explained variance was inspected beforehand.
X_pca = pca.transform(XScaled)

In [None]:
# Wrap the projected array in a DataFrame to preview the first rows.
pca_df = pd.DataFrame(X_pca)
pca_df.head()

In [None]:
# Hold out 30 % of the data while keeping every copy of an upsampled row on
# the same side of the split. Y still carries the row labels of the original
# frame, and rows drawn with replacement share a label with their source row,
# so those labels serve as groups. A plain random split would put identical
# feature vectors in both partitions and inflate every score computed on
# x_test. test_size is the fraction of groups (distinct original rows), so
# the row-level ratio is only approximately 70/30.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=100)
train_idx, test_idx = next(splitter.split(X_pca, Y, groups=Y.index.to_numpy()))

# Same output types as before: NumPy arrays for features, Series for labels.
x_train, x_test = X_pca[train_idx], X_pca[test_idx]
y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]


In [None]:
# Shapes of the four partitions, one per line: train features, train labels,
# test features, test labels.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

In [None]:
# Fit a 200-tree random forest on the PCA-projected training partition.
# random_state pins the bootstrap draws and per-split feature sampling, so a
# restart-and-run-all reproduces the same model and scores. n_jobs=-1 only
# builds the trees in parallel and does not change the fitted forest.
randomForest = RandomForestClassifier(n_estimators=200, random_state=100, n_jobs=-1)
randomForest = randomForest.fit(x_train, y_train)

In [None]:
# Mean accuracy of the forest on the held-out partition.
score = accuracy_score(y_test, randomForest.predict(x_test))
print(score)

In [None]:
# Predicted class label for every row of the held-out partition.
y_predict = randomForest.predict(x_test)

In [None]:
# Confusion matrix (rows are true labels, columns are predicted labels, in
# the order 0, 1) followed by per-class precision / recall / F1.
class_labels = [0, 1]
test_confusion = confusion_matrix(y_test, y_predict, labels=class_labels)
test_report = classification_report(y_test, y_predict)
print(test_confusion)
print(test_report)

In [None]:
# Draw the confusion matrix as an annotated heatmap.
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])

# Label rows with the true class and columns with the predicted class.
row_names = ["0", "1"]
column_names = ["Predict 0", "Predict 1"]
df_cm = pd.DataFrame(cm, index=row_names, columns=column_names)

fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g', ax=ax)

In [None]:
# Load the test set from the same input directory, addressed the same
# (relative) way as the training file, so both reads succeed or fail
# together wherever the notebook is run.
test_df = pd.read_csv("../input/santander-customer-satisfaction/test.csv")
# Preview a few rows instead of echoing the whole frame.
test_df.head()

In [None]:
# Start the submission frame with the identifiers of the test rows.
resultDataSet = pd.DataFrame({'ID': test_df['ID']})
resultDataSet.head()

In [None]:
# Remove the identifier so only the remaining columns are passed on.
# errors='ignore' keeps the cell idempotent: running it a second time is a
# no-op instead of raising KeyError because 'ID' is already gone.
test_df = test_df.drop(columns=['ID'], errors='ignore')

In [None]:
# Total number of missing cells in the test frame.
test_df.isnull().sum().sum()

In [None]:
# Same total via isna(); pandas documents isnull() as an alias of isna().
test_df.isna().sum().sum()

In [None]:
# Standardise the test features with the statistics the scaler learned from
# the training features. Re-fitting here would centre and scale the test set
# by its own mean and variance, putting it on a different scale from the
# data the model was trained on (and overwriting the fitted scaler).
test_Scaled = scaler.transform(test_df)

In [None]:
# Project the test features onto the principal axes learned from the
# training data. Calling fit_transform here would compute a new basis from
# the test set, so column k would no longer mean the same thing as the
# column k the random forest was trained on.
test_pca = pca.transform(test_Scaled)

In [None]:
# Wrap the projected test array in a DataFrame to preview the first rows.
testpca_df = pd.DataFrame(test_pca)
testpca_df.head()

In [None]:
# Predicted class label for every test row.
# NOTE(review): this yields hard 0/1 labels; if the submission is scored on
# ranking quality (e.g. AUC - confirm against the competition rules),
# predict_proba(...)[:, 1] would be the appropriate output.
test_predict = randomForest.predict(testpca_df)

In [None]:
# Number of test rows fed to the model.
len(testpca_df)

In [None]:
# Number of predictions produced; it should equal the number of test rows.
len(test_predict)


In [None]:
# Add the predictions as the TARGET column next to the identifiers, then
# echo the resulting frame.
resultDataSet = resultDataSet.assign(TARGET=test_predict)
resultDataSet

In [None]:
# Write the submission file to the working directory; index=False leaves the
# row index out so the file contains only the frame's own columns.
resultDataSet.to_csv('submission.csv', index=False)