In [None]:
import numpy as np 
import pandas as pd 
import time
from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import utils_clf_models as classifier
import utils_best_search 

import matplotlib.pyplot as plt
import seaborn as sns
# Set the default figure size to 10x8 inches for the plots in this notebook.
sns.set(rc={'figure.figsize':(10, 8)})
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Importing the data

In [None]:
# Load the labelled training set.
df = pd.read_csv('/kaggle/input/santander-customer-satisfaction/train.csv')

# Relative frequency of each TARGET value.
target_share = df['TARGET'].value_counts(normalize=True)
target_share

In [None]:
# Share of rows with TARGET == 1, computed from the loaded data instead of
# hand-copied class counts so it cannot drift out of sync with the file.
(df['TARGET'] == 1).mean()

# Exploring the data

In [None]:
# Dimensions of the training frame as (rows, columns).
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
# Preview the first five rows.
df.head(5)

In [None]:
# Summary statistics of the training frame, using describe()'s defaults.
df.describe()

# Feature Importance

In [None]:
# Feature matrix: every column except the label, as a NumPy array.
# NOTE(review): no identifier column is dropped here -- confirm that is intended.
X = df.drop(columns=['TARGET']).to_numpy()
# Label vector.
y = df['TARGET'].to_numpy()

# Hold-out split stratified on the label, seeded for reproducibility
# (test size left at the scikit-learn default).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Label features with their real column names instead of "feature <i>"
# placeholders, so the importance plot is interpretable. The names come from
# the same column selection (and therefore the same order) used to build X.
feature_names = df.drop(['TARGET'], axis=1).columns.tolist()
assert len(feature_names) == X.shape[1], "feature names must line up with the columns of X"

# Seeded random forest whose impurity-based importances are inspected below.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

## Based on mean decrease in impurity

In [None]:
# --- Mean-decrease-in-impurity (MDI) importances of the fitted forest ---
start_time = time.time()
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees of the forest.
std = np.std([
    estimator.feature_importances_ for estimator in forest.estimators_], axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")

# Rank the features by importance, highest first. A stable sort keeps tied
# features in their original column order.
order = np.argsort(-importances, kind="stable")

# All features, ordered from most to least important.
imp_dict = {feature_names[i]: importances[i] for i in order}

# Keep the 10 most important features. The SAME index array selects the names,
# the importances and the error bars, so every bar is drawn with its own std
# (previously the bars were the first 10 names alphabetically, paired with the
# 10 largest stds of unrelated features).
top_k = 10
top_idx = order[:top_k]
top_std = std[top_idx]
forest_importances = pd.Series(
    importances[top_idx],
    index=[feature_names[i] for i in top_idx],
)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=top_std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

# Single Model Building & Evaluation

## 1- Naive Bayes

In [None]:
# Wrap the full feature matrix and labels in the project's Classifier helper
# (utils_clf_models), selecting the model with the key 'guass_nb'
# (presumably Gaussian Naive Bayes -- confirm against the helper module).
nb = classifier.Classifier(X, y, 'guass_nb')
# Helper-side preprocessing and train/test split; size=0.25 and state=72 are
# presumably the test fraction and the random seed -- TODO confirm.
nb.preprocess_split(size=0.25, state=72)
# Fit the model and produce predictions (per the method name; internals not visible here).
nb.fit_predict()
# Compute the evaluation metrics; printing=True presumably prints them.
nb.metrics(printing=True)
# Draw the ROC curve (per the method name).
nb.roc_curve()

## 2- K-Nearest Neighbors

In [None]:
# Same helper workflow as the other single models, with the model key 'knn'
# (presumably k-nearest neighbours -- confirm against utils_clf_models).
knn = classifier.Classifier(X, y, 'knn')
# Helper-side preprocessing and train/test split; size=0.25 and state=72 are
# presumably the test fraction and the random seed -- TODO confirm.
knn.preprocess_split(size=0.25, state=72)
# Fit the model and produce predictions (per the method name; internals not visible here).
knn.fit_predict()
# Compute the evaluation metrics; printing=True presumably prints them.
knn.metrics(printing=True)
# Draw the ROC curve (per the method name).
knn.roc_curve()

## 3- Logistic Regression

In [None]:
# Same helper workflow as the other single models, with the model key 'log_reg'
# (presumably logistic regression -- confirm against utils_clf_models).
log = classifier.Classifier(X, y, 'log_reg')
# Helper-side preprocessing and train/test split; size=0.25 and state=72 are
# presumably the test fraction and the random seed -- TODO confirm.
log.preprocess_split(size=0.25, state=72)
# Fit the model and produce predictions (per the method name; internals not visible here).
log.fit_predict()
# Compute the evaluation metrics; printing=True presumably prints them.
log.metrics(printing=True)
# Draw the ROC curve (per the method name).
log.roc_curve()

## 4- Decision Tree

In [None]:
# Same helper workflow as the other single models, with the model key 'tree'
# (presumably a decision tree -- confirm against utils_clf_models).
tree = classifier.Classifier(X, y, 'tree')
# Helper-side preprocessing and train/test split; size=0.25 and state=72 are
# presumably the test fraction and the random seed -- TODO confirm.
tree.preprocess_split(size=0.25, state=72)
# Fit the model and produce predictions (per the method name; internals not visible here).
tree.fit_predict()
# Compute the evaluation metrics; printing=True presumably prints them.
tree.metrics(printing=True)
# Draw the ROC curve (per the method name).
tree.roc_curve()

### What's next? Model Tuning (Best Parameter Search), Ensemble Learning, Model Finalization

# Ensemble Learning

## 1- Random Forest (since the single decision tree performed better than the other models, we start with a random forest)

In [None]:
# Re-split with a 30% hold-out for the ensemble models. Stratify on y so both
# partitions keep the same share of each class; without it the split of this
# imbalanced target is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Large random forest with per-bootstrap class re-weighting.
# n_jobs=-1 builds (and later predicts with) the 10,000 trees on all available
# cores instead of one; the seeded randomness is not affected by it.
rf = RandomForestClassifier(
    n_estimators=10000,
    random_state=42,
    class_weight='balanced_subsample',
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Evaluate on the hold-out partition.
pred = rf.predict(X_test)
print(classification_report(y_test, pred))

## 2- Bagging (with a tree as base estimator)

In [None]:
# Seeded bagging ensemble using scikit-learn's default base estimator,
# trained on the current training partition.
bag = BaggingClassifier(random_state=42).fit(X_train, y_train)

# Score the hold-out partition and print the per-class report.
pred = bag.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))

# Obtain the submission format on the test data

In [None]:
# Load the sample submission to see the expected output layout.
sample = pd.read_csv('/kaggle/input/santander-customer-satisfaction/sample_submission.csv')
# Show the size and the first rows only, rather than echoing the whole frame.
print(sample.shape)
sample.head()

In [None]:
# Load the unlabelled competition test set.
test_df = pd.read_csv('/kaggle/input/santander-customer-satisfaction/test.csv')
# Show the size and the first rows only, rather than echoing the whole frame.
print(test_df.shape)
test_df.head()

In [None]:
# Every column of the competition test frame, as a NumPy array.
X_final_test = test_df.to_numpy()

# NOTE(review): predict() returns hard class labels -- confirm the submission
# is not expected to contain probabilities (predict_proba) instead.
final_pred = rf.predict(X_final_test)

In [None]:
# Pair each test-set ID with its predicted label.
frame = dict(
    ID=pd.Series(test_df['ID']),
    TARGET=pd.Series(final_pred),
)
result = pd.DataFrame(data=frame)

# Relative frequency of each predicted label in the submission.
predicted_share = result['TARGET'].value_counts(normalize=True)
predicted_share

In [None]:
# Preview the first rows of the submission frame before writing it out.
result.head()

In [None]:
# Write the submission file: header row included, index column omitted.
result.to_csv(
    path_or_buf='santander_customer_satisfaction_submission.csv',
    index=False,
    header=True,
)