# Importing the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
# Read the training CSV into the DataFrame used by the rest of the notebook.
train_csv = '../input/santander-customer-transaction-prediction/train.csv'
data = pd.read_csv(train_csv)

# Exploratory Data Analysis

Basic details of the data:
 
- Shape
- Features
- Null values
- Data types
- Sample of the data
- Statistical description

In [None]:
# Report the dimensions of the training data (shape is (rows, columns)).
row_count, column_count = data.shape
print("The number of columns in the data", column_count)
print("The number of rows in the data", row_count)

In [None]:
# Print a per-column summary of the training data.
# verbose=True lists every column instead of a truncated summary, and
# show_counts=True includes the non-null count for each column.
print("The data types of the data are as follows:")
data.info(verbose=True, show_counts=True)

In [None]:
# Preview the first rows of the training data; as the last expression of the
# cell, the returned frame is what the notebook displays.
data.head()

In [None]:
# Summary statistics for the training data; include='all' asks pandas to
# describe every column rather than only the numeric ones.
data.describe(include='all')

# Understanding the data visually

In [None]:
# Convert the target to a categorical column; later cells keep using it in
# this form.
data['target'] = data['target'].astype('category')

# Count the rows per target value. The column is named explicitly because the
# default name produced by value_counts().to_frame() differs between pandas
# versions ('target' before 2.0, 'count' from 2.0 on).
target = data['target'].value_counts().to_frame(name='count')

fig, ax = plt.subplots(figsize=(15, 8))
# Use plain integer x positions so the bars land on the ticks set below.
ax.bar(target.index.astype(int), target['count'])
ax.set_xticks([0, 1])
ax.set_xticklabels(['No transaction', 'transaction'], fontsize=20)

# Annotate each bar with its count, centred slightly above the bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        height + 10,
        f"{int(height)}",
        ha='center',
        fontsize=20,
    )

ax.set_title('Count of Target', fontsize=20)
ax.set_ylabel('Count')

plt.show()

In [None]:
def range_val(X):
    """Return the value range (max - min) of every float64 column of *X*.

    Args:
        X: A pandas DataFrame.

    Returns:
        A dict mapping each float64 column name to ``max - min`` of that
        column. Columns of any other dtype are skipped.

    The pandas reductions are used instead of the builtin ``min``/``max``:
    they are vectorised and skip NaN values, whereas the builtins iterate in
    Python and return NaN whenever the first element is NaN.
    """
    ran = {}
    for col in X.columns:
        series = X[col]
        if series.dtype == 'float64':
            ran[col] = series.max() - series.min()
    return ran

In [None]:
# Range (max - min) of every float column, reshaped to one row per column
# (melt yields the columns 'variable' and 'value').
col_range = range_val(data)
col_range1 = pd.DataFrame(col_range, index=['range']).melt()

# One positional tick label per plotted column, derived from the data
# instead of a hard-coded count.
labels = np.arange(len(col_range1))

ax = col_range1.plot(kind='bar', figsize=(30, 10))
ax.xaxis.set_ticklabels(labels, fontsize=6, rotation=30)

ax.set_title('Range of values across columns', fontsize=20)
ax.set_xlabel('Columns')
ax.set_ylabel('values')

plt.show()

In [None]:
def min_max(X):
    """Return ``[min, max]`` for every float64 column of *X*.

    Args:
        X: A pandas DataFrame.

    Returns:
        A dict mapping each float64 column name to a two-element list
        ``[minimum, maximum]``. Columns of any other dtype are skipped.

    The pandas reductions are used instead of the builtin ``min``/``max``:
    they are vectorised and skip NaN values, whereas the builtins iterate in
    Python and return NaN whenever the first element is NaN.
    """
    minmax = {}
    for col in X.columns:
        series = X[col]
        if series.dtype == 'float64':
            minmax[col] = [series.min(), series.max()]
    return minmax

In [None]:
# Min and max of every float column, as a 2-row frame (rows 'Min' and 'Max').
col_minmax = min_max(data)
col_minmax1 = pd.DataFrame(col_minmax, index=['Min', 'Max'])

fig, ax = plt.subplots(figsize=(30, 8))
# whis=[0, 100] puts the whiskers at the 0th and 100th percentiles, i.e. at
# each column's min and max.
ax.boxplot(col_minmax1, showfliers=False, showbox=True, showmeans=False, whis=[0, 100])
ax.axhline(0, label="zeroline")

# One positional tick label per plotted column, derived from the data
# instead of a hard-coded count.
labels = np.arange(col_minmax1.shape[1])
ax.xaxis.set_ticklabels(labels, fontsize=6, rotation=30)

# Draw the legend so the "zeroline" label is actually shown.
ax.legend()

ax.set_title('Min and Max values across columns', fontsize=20)
ax.set_xlabel('Columns')
ax.set_ylabel('values')

plt.show()

In [None]:
# Split the rows by target value so the two groups can be compared.
target_values = data['target']
tran = data[target_values == 1]  # rows whose target is 1
non = data[target_values == 0]   # rows whose target is 0

I plot the distribution of each column separately for target = 1 and target = 0. \
This is done to see whether the distributions differ considerably between transactions and non-transactions.

In [None]:
# Columns at positions 2..101 are compared in this figure.
# NOTE(review): assumes the first two columns are the ID and the target — confirm.
features100 = data.columns[2:102]

fig, ax = plt.subplots(nrows=10, ncols=10, figsize=(30, 30))

# One subplot per feature: overlay the density-normalised histograms of the
# target == 1 rows and the target == 0 rows.
for x, col in zip(ax.flat, features100):
    x.hist(tran[col], density=True, label='1')
    x.hist(non[col], density=True, alpha=0.7, label='0')
    # Name the feature on its panel; otherwise the panels cannot be told apart.
    x.set_title(col, fontsize=10)
    x.legend()

# Tighten the layout once the panels (and their titles) exist, then add the
# figure title above the grid.
fig.tight_layout()
fig.suptitle('Distribution of data with respect to target transaction and non transaction', fontsize=20, y=1.03)

plt.show()

In [None]:
# All columns from position 102 onwards are compared in this figure.
features200 = data.columns[102:]

fig, ax = plt.subplots(10, 10, figsize=(30, 30))

# One subplot per feature: overlay the density-normalised histograms of the
# target == 1 rows and the target == 0 rows.
for x, col in zip(ax.flat, features200):
    x.hist(tran[col], density=True, label='1')
    x.hist(non[col], density=True, alpha=0.7, label='0')
    # Name the feature on its panel; otherwise the panels cannot be told apart.
    x.set_title(col, fontsize=10)
    x.legend()

# Tighten the layout once the panels (and their titles) exist, then add the
# figure title above the grid.
fig.tight_layout()
fig.suptitle('Distribution of data with respect to target transaction and non transaction', fontsize=20, y=1.03)
plt.show()

We do observe slight differences between the distributions, but nothing noteworthy.\
This variation could be attributed to randomness in the collected data.

In [None]:
# For every float column, record its most frequent value and how many times
# that value occurs.
col = {}

for x in data.columns:

    if data[x].dtype == 'float64':
        # value_counts() sorts by descending count, so the first entry is the
        # most frequent value. Compute it once per column.
        counts = data[x].value_counts()
        col[x] = counts.index[0], counts.iloc[0]

# Rows 'value' and 'count', transposed so that each feature becomes one row.
duplicate = pd.DataFrame(col, index=['value', 'count'])
duplicate = duplicate.T

Here we find, for each column, the most frequent (duplicated) value and how often it occurs.\
We find that column var_68 has a single value whose count is unusually high. This column needs further analysis to see whether the rows with this duplicated value contain more transactions.

In [None]:
fig, ax = plt.subplots(figsize=(30, 10))

# One bar per feature; the height is the count of its most frequent value.
ax.bar(duplicate.index, duplicate['count'])

# Label the bars by position. The number of labels follows the table rather
# than a hard-coded count, so ticks and labels always match.
positions = np.arange(len(duplicate))
ax.set_xticks(positions)
ax.set_xticklabels(positions, rotation=30, fontsize=7)

# Write the most frequent value itself above each bar.
for x, value in zip(ax.patches, duplicate['value']):

    ax.text(x.get_x() + x.get_width() / 2, x.get_height() + 10, value, ha='center', rotation=90, fontsize=7)

ax.set_title('Highest duplicates and the value of dupicates', fontsize=20)
# NOTE(review): the y-limit of 1220 is hard-coded, presumably to leave
# headroom for the rotated labels — confirm it still covers the tallest bar.
plt.setp(ax, xlabel='variable', ylabel='count', ylim=[0, 1220])
plt.show()

In [None]:
# Select the rows where var_68 equals 5.0214 (presumably its most frequent
# value from the table above — confirm). A small absolute tolerance is used
# instead of ==, because a float parsed from CSV is not guaranteed to be
# bit-identical to the Python literal.
is_top_value = np.isclose(data['var_68'], 5.0214, rtol=0, atol=1e-6)

# Count those rows per target value and plot the counts.
ax = (
    data.loc[is_top_value, ['var_68', 'target']]
    .groupby('target')
    .count()
    .plot(kind='bar', figsize=(15, 8))
)
ax.set_title('Analysing the highest dupicate value with the target', fontsize=20)
ax.set_ylabel('count')
plt.show()

Observing this graph, we do not find much abnormality: the proportion of transactions (1) is comparable to that of the whole dataset.

Down below I display the correlation table.\
From the table it is very evident that the features do not have high correlation with the target variable.

In [None]:
# Correlation table including the target.
# 'target' was converted to a categorical column earlier and 'ID_code' is an
# identifier, so a plain data.corr() either silently drops the target (older
# pandas) or raises on the non-numeric columns (pandas >= 2.0). Drop the
# identifier and cast the target back to integers first.
corr = (
    data.drop(columns='ID_code')
    .assign(target=lambda frame: frame['target'].astype(int))
    .corr()
)
corr

# Model Building

First, I am importing the test dataset.\
Then importing all the required libraries

Next, I split the data into train and test sets.
I first apply Gaussian Naive Bayes to get a benchmark score.

Next, I apply a StandardScaler followed by a quantile transformation.
This maps each feature to an approximately Gaussian distribution centred at a mean of 0.

This helps improve the prediction. I ended up with an accuracy of about 0.923.

Finally, I apply the model to the test set and create the submission.

In [None]:
# Read the test CSV that the final predictions are made on.
test_csv = '../input/santander-customer-transaction-prediction/test.csv'
test = pd.read_csv(test_csv)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Features are every column except the identifier and the label.
X = data.drop(['target', 'ID_code'], axis=1)
y = data['target']

# Hold out 30% of the rows for evaluation. stratify=y keeps the class
# proportions equal in both parts, consistent with the stratified
# cross-validation used later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Baseline: Gaussian Naive Bayes fitted on the untransformed training
# features (fit() returns the estimator itself), then scored on the hold-out.
NB = GaussianNB().fit(X_train, y_train)
predict = NB.predict(X_test)

In [None]:
# Only the last expression of a notebook cell is displayed, so the confusion
# matrix is printed explicitly; the accuracy remains the cell's output.
print(confusion_matrix(y_test, predict))
accuracy_score(y_test, predict)

In [None]:
# Preprocessing steps and the cross-validation scheme for the pipeline.
scaler = StandardScaler()
# Map each feature onto a normal distribution. random_state fixes the random
# subsampling the transformer uses to estimate quantiles on large inputs, so
# repeated runs give the same result.
qa = QuantileTransformer(output_distribution='normal', random_state=50)
# 10-fold stratified cross-validation, repeated 3 times.
sfk = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=50)

In [None]:
# Scale -> quantile-transform -> Gaussian Naive Bayes.
# The pipeline gets its own GaussianNB instance: reusing the `NB` object
# would refit it here and silently overwrite the baseline model.
pipeline = Pipeline(steps=[('t', scaler), ('q', qa), ('m', GaussianNB())])
pipeline.fit(X_train, y_train)

# Accuracy of the pipeline under the repeated stratified k-fold scheme.
score = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=sfk)

In [None]:
# Report the mean and spread across all folds; the best single fold alone is
# an optimistic estimate. The maximum stays as the cell's displayed value.
print(f"CV accuracy: mean={score.mean():.4f}, std={score.std():.4f}")
score.max()

In [None]:
# Predict on the test features (identifier column removed) and keep the
# second probability column.
# NOTE(review): presumably that column is the probability of target == 1 —
# confirm against pipeline.classes_.
test_features = test.drop('ID_code', axis=1)
class_probabilities = pipeline.predict_proba(test_features)
predict = class_probabilities[:, 1]

In [None]:
# Load the submission template and replace its target column with the
# predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv —
# confirm.
submission_template = '../input/santander-customer-transaction-prediction/sample_submission.csv'
sample_submission = pd.read_csv(submission_template).assign(target=predict)

In [None]:
# Write the submission file first, then preview it. head() has to be the last
# expression of the cell, otherwise its result is discarded and nothing is
# displayed.
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head()

# Conclusion:

After understanding the data through EDA and applying Naive Bayes, I have managed to get an accuracy score of 0.923.

# Reference:

https://www.kaggle.com/blackblitz/gaussian-naive-bayes
