# Data science tutorial from scratch including EDA process, feature selection and algorithm

In [None]:
from IPython.display import Image
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, f1_score, recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.ensemble import BaggingClassifier, IsolationForest

**Correlation**

In [None]:
from sklearn.feature_selection import chi2
from scipy import stats  # Anova
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import pointbiserialr

**Algorithm**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
import statsmodels.api as sm

# Attachments for this tutorial
**[https://madhanprivate.blogspot.com/2020/03/datasciecne-tutorial-attachments.html](https://madhanprivate.blogspot.com/2020/03/datasciecne-tutorial-attachments.html)**

In [None]:
bank_data = pd.read_csv("/kaggle/input/banking/banking.csv")

In [None]:
bank_data.head()

# Description of the dataset
Attribute Information:

Input variables:
# bank client data:
1 - age (numeric)
2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self employed','services','student','technician','unemployed','unknown')
3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5 - default: has credit in default? (categorical: 'no','yes','unknown')
6 - housing: has housing loan? (categorical: 'no','yes','unknown')
7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# related with the last contact of the current campaign:
8 - contact: contact communication type (categorical: 'cellular','telephone')
9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# other attributes:
12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14 - previous: number of contacts performed before this campaign and for this client (numeric)
15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
# social and economic context attributes
16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
17 - cons.price.idx: consumer price index - monthly indicator (numeric)
18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
20 - nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target):
21 - y - has the client subscribed a term deposit? (binary: 'yes','no')

In [None]:
bank_data.info()

In [None]:
bank_data.shape

In [None]:
# pd.isnull(bank_data).sum()
bank_data.isnull().sum().plot()

In [None]:
bank_data.describe()

In [None]:
bank_data['job'].value_counts()

Value counts for all the features

In [None]:
# Print the frequency table of every column, each under a banner with the column name.
for column_name in bank_data.columns:
    print("#####", column_name, "#####")
    print(bank_data[column_name].value_counts())

In [None]:
bank_data.job.value_counts().plot(kind='bar', figsize=(8,10))
plt.show()

Separating object and non-object datatype values

In [None]:
bank_object_data = bank_data.select_dtypes(include="object")

In [None]:
bank_object_data.info()

In [None]:
bank_non_object_data = bank_data.select_dtypes(exclude="object")

In [None]:
bank_non_object_data.info()

In [None]:
bank_non_object_data.describe()

# Getting p-value, r-squared and adjusted r-squared value

In [None]:
stats_target = bank_non_object_data.loc[:,"y"]

In [None]:
 stats_input = bank_non_object_data.drop("y", axis=1)

In [None]:
# Add a constant column to the predictors before fitting.
stats_input_data = sm.add_constant(stats_input) 
# Ordinary least squares of stats_target on the constant-augmented predictors.
estimation = sm.OLS(stats_target, stats_input_data)
# `result` is the fitted-model object whose summary, p-values etc. are printed next.
result = estimation.fit()

In [None]:
# Print each OLS diagnostic under its own banner line.
ols_report = (
    ("@@@@summary@@@@\n\n", result.summary()),
    ("@@@@pvalues@@@@\n\n", result.pvalues),
    ("@@@@tvalues@@@@\n\n", result.tvalues),
    ("@@@@rsquared@@@@\n\n", result.rsquared),
    ("@@@@rsquared_adj@@@@\n", result.rsquared_adj),
)
for banner, value in ols_report:
    print(banner, value)

In [None]:
# List every attribute name exposed by the fitted OLS result, one per line.
print(*dir(result), sep="\n")

# Correlation
Pearson, Spearman, point-biserial, chi-square

# Pearson correlation
Both the input and the output are numerical values (works efficiently when there are no more than two categories, e.g. True/False, Male/Female, 0/1). Assumes the values are normally distributed with a linear relationship (check the pair plot to select the appropriate test).
# Spearman correlation
Both the input and the output are numerical values (works efficiently when there are no more than two categories, e.g. True/False, Male/Female, 0/1). Assumes the values are not normally distributed (monotonic relationship). The values are evaluated based on their ranks.
# Pearson chi-square test
Both the input and the output are categorical (text) values (works efficiently when there are more than two categories, e.g. address data, country names). **All categorical values should first be converted to numeric values using the label encoder.**
# Point-biserial correlation
The input should be categorical and the output a dichotomous numeric value (works efficiently when there are exactly two categories, e.g. True/False, Male/Female, 0/1). **All categorical values should first be converted to numeric values using the label encoder.**
# ANOVA
Both the input and the output may be numerical or categorical values, with any number of categories. It helps to find the relationship between multiple groups.

In [None]:
pearsonr(bank_non_object_data['age'], bank_data['y'])

In [None]:
# Pairwise Pearson correlation of the numeric columns.
# bank_data still contains text (object) columns at this point, which
# DataFrame.corr() refuses to process on pandas >= 2.0; the numeric-only
# frame gives the same table on every pandas version.
bank_non_object_data.corr()

In [None]:
spearmanr(bank_non_object_data['age'], bank_data['y'])

In [None]:
# chi_scores = chi2(bank_non_object_data['age'],bank_data['y'])
# chi_scores

In [None]:
# pointbiserialr(bank_object_data['education'], bank_data['y'])

In [None]:
# Anova
# One-way ANOVA on age between the y == 0 and y == 1 groups; only the p-value is printed.
age_by_outcome = [bank_data.loc[bank_data.y == outcome, "age"] for outcome in (0, 1)]
F, p = stats.f_oneway(*age_by_outcome)
print(p)

In [None]:
# sns.pairplot(bank_data)

Converting categorical values to discrete values

In [None]:
# Replace every text column with integer codes.
# NOTE(review): the same encoder instance is passed to apply(), so it is fitted
# again for each column; presumably only the last column's classes remain on
# `label` afterwards — keep one encoder per column if the codes must be mapped back.
label = LabelEncoder()
bank_object_data = bank_object_data.apply(label.fit_transform)

In [None]:
bank_object_data.head()

Merging the converted table and the actual table

In [None]:
bank_final = pd.concat([bank_object_data, bank_non_object_data], axis = 1)

In [None]:
bank_final.head()

Running the describe function by splitting the tables because we can't view all the values in a single stretch

In [None]:
bank_final.iloc[:, 0:15].describe()

In [None]:
# Summary statistics for the remaining columns.
# The first describe() call uses iloc[:, 0:15], i.e. columns 0..14, so this
# slice starts at 15 to avoid describing column 14 twice.
bank_final.iloc[:, 15:].describe()

Found some unusual values in the feature "Campaign", "age", "duration"

In [None]:
sns.distplot(bank_final['age'].dropna(),bins=30)

In [None]:
plt.boxplot(bank_final['age'])

In [None]:
# Natural-log transform of age, written back into the same column.
# The assignment is in place, so re-running this cell transforms the column again.
bank_final['age'] = np.log(bank_final['age'])
# Box plot of the transformed column.
plt.boxplot(bank_final['age'])

In [None]:
sns.distplot(bank_final['age'].dropna(),bins=30)

In [None]:
plt.boxplot(bank_final['duration'])

In [None]:
# Cube-root transform of duration, written back in place; the cell then shows the new maximum.
cube_root_exponent = 1/3
bank_final['duration'] = bank_final['duration'] ** cube_root_exponent
bank_final['duration'].max()

In [None]:
plt.boxplot(bank_final['duration'])

In [None]:
plt.boxplot(bank_final['campaign'])

In [None]:
# Trial transform only: the result goes into `dummy`, bank_final['campaign'] is not modified here.
dummy = pow(bank_final['campaign'],1/15)  # root of 15, but not working as expected
# Box plot of the trial-transformed values.
plt.boxplot(dummy)

In [None]:
dummy.hist()

In [None]:
bank_final['campaign'] = np.log(bank_final['campaign'])

In [None]:
plt.boxplot(bank_final['campaign'])

In [None]:
bank_final['campaign'].hist()

In [None]:
bank_final.corr() # Pearson Co-relation

In [None]:
bank_final.corr().nlargest(10,['y'])['y']

In [None]:
bank_final.corr().nsmallest(21,['y'])['y']

Splitting X and Y values for train and test

In [None]:
# Target vector is the "y" column; the feature matrix is every other column.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

In [None]:
X.max()

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size = 0.3, random_state = 1)

# ROC (Receiver Operating Characteristic)

In [None]:
# Baseline for the ROC plot: a "no skill" scorer that gives every test row a score of 0.
ns_probs = [0] * len(ytest)
# Logistic regression fitted on the training split.
model = LogisticRegression(solver='lbfgs').fit(xtrain, ytrain)
# Keep only column 1 of predict_proba, the probability of the positive outcome.
lr_probs = model.predict_proba(xtest)[:, 1]
# Area under the ROC curve for the baseline and for the model.
ns_auc, lr_auc = (roc_auc_score(ytest, scores) for scores in (ns_probs, lr_probs))
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# Curve coordinates; the third return value is not used.
ns_fpr, ns_tpr, _ = roc_curve(ytest, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(ytest, lr_probs)
# Dashed line for the baseline, dotted markers for the model.
plt.plot(ns_fpr, ns_tpr, label='No Skill', linestyle='--')
plt.plot(lr_fpr, lr_tpr, label='Logistic', marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Naive Bayes algorithm

In [None]:
# Fit Gaussian Naive Bayes on the training split, then predict the held-out rows.
model = GaussianNB().fit(xtrain, ytrain)
ypredictNaiveBaiyes = model.predict(xtest)

In [None]:
accuracy_score(ytest, ypredictNaiveBaiyes)

In [None]:
model.score(xtrain, ytrain)

In [None]:
confusion_matrix(ytest, ypredictNaiveBaiyes)

In [None]:
print(classification_report(ytest, ypredictNaiveBaiyes))

@@@ Check the attachment file @@@
# Classification report
Assume the problem of predicting the heart disease
# Precision
**Precision is the ability of a classifier not to label an instance positive that is actually negative. In the heart-disease example, it is how reliably the model avoids classifying a person as a heart patient when they do not have heart disease. Precision can be thought of as a measure of a classifier's exactness. A low precision can also indicate a large number of False Positives.**
For each class it is defined as the ratio of true positives to the sum of true and false positives.
TN / True Negative: when a case was negative and predicted negative
TP / True Positive: when a case was positive and predicted positive
FN / False Negative: when a case was positive but predicted negative
FP / False Positive: when a case was negative but predicted positive
Precision – Accuracy of positive predictions.
Precision = TP/(TP + FP)

# Recall (Sensitivity)
**Sensitivity tells us what percentage of people with heart disease were actually correctly identified.**
**Recall is the ability of a classifier to find all positive instances.
Recall can be thought of as a measure of a classifier's completeness. A low recall indicates many False Negatives.** For each class it is defined as the ratio of true positives to the sum of true positives and false negatives.
FN – False Negatives
Recall: Fraction of positives that were correctly identified.
Recall = TP/(TP+FN)

# Specificity
**Specificity tells us what percentage of people without heart disease were actually correctly identified.**
Specificity = TN/(TN+FP)

# F1 score
The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0. Generally speaking, F1 scores are lower than accuracy measures as they embed precision and recall into their computation. As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.
F1 Score = 2*(Recall * Precision) / (Recall + Precision)

# Accuracy score
The most common metric for classification is accuracy, which is the fraction of samples predicted correctly as shown below
Accuracy = (TP+TN)/(TP+TN+FP+FN)

In [None]:
# Image("/kaggle/input/classification-report/Classification_report.jpg")

In [None]:
# from PIL import Image as Imagepil
# image_obj = Imagepil.open("/kaggle/input/classification-report/Classification_report.jpg")
# rotated_image = image_obj.rotate(-90)
# rotated_image.save("rotate.jpg")
# Image("/kaggle/working/rotate.jpg")

# Checking bias of the output value

In [None]:
sns.countplot(x='y', data = bank_final)

# Using the stratified sampling technique to eliminate the bias

In [None]:
# Five stratified 70/30 shuffle splits of (X, Y); a Gaussian Naive Bayes model is
# re-fitted on each split and its accuracy, precision, recall and F1 are collected.
model2 = GaussianNB()
accuracy_scores_stratified_sampling = []
precision_scores = []
recall_scores = []
f1_scores = []
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(X, Y):
    # split() yields positional row numbers, so select with iloc rather than
    # relying on the frame having a default 0..n-1 index.
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    model2.fit(x_train, y_train)
    # Predict with the model fitted on THIS split (model2). The original code
    # called the unrelated global `model`, so the metrics did not describe model2.
    y_pred = model2.predict(x_test)
    accuracy_scores_stratified_sampling.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

In [None]:
print("accuracy_scores: ",accuracy_scores_stratified_sampling,"\nprecision_scores: ",precision_scores,"\nrecall_scores: ",recall_scores,"\nf1_scores: ",f1_scores)

# KNN algorithm
@@@ Check the attachment file @@@

In [None]:
# Candidate neighbour counts 1..24, plus one uninitialised float slot per
# candidate for the train and test accuracies filled in by the sweep.
k = np.arange(1, 25)
train_accuracy = np.empty(k.shape)
test_accuracy = np.empty(k.shape)

In [None]:
# Fit one KNN classifier per candidate neighbour count and record its accuracy
# on the training and the test split.
# The loop variable is deliberately NOT called `k`: reusing that name replaced
# the array of candidates with its last element once the loop finished.
for i, n_neighbors in enumerate(k):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(xtrain, ytrain)
    train_accuracy[i] = knn.score(xtrain, ytrain)
    test_accuracy[i] = knn.score(xtest, ytest)
    print(train_accuracy[i])
    print(test_accuracy[i])

In [None]:
print(test_accuracy, train_accuracy)

In [None]:
# Train and test accuracy against the number of neighbours, side by side.
# Both panels are drawn before the single plt.show(); calling show() between
# the two subplot() calls rendered them as two separate half-empty figures.
k = np.arange(1, 25)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(k, train_accuracy)
plt.xlabel('numbers of neighbor')
plt.ylabel('train accuracy')
plt.subplot(1, 2, 2)
plt.plot(k, test_accuracy)
plt.xlabel('numbers of neighbor')  # the second panel had no x label
plt.ylabel('test accuracy')
plt.tight_layout()
plt.show()

In [None]:
# Neighbour count(s) that reached the highest test accuracy; on ties every
# winner is kept in `x` and the first one is displayed.
best_score = test_accuracy.max()
idx = np.where(test_accuracy == best_score)
x = k[idx]
x[0]

In [None]:
# Final KNN model using the best neighbour count found above.
# NOTE(review): the sweep that chose x[0] built KNeighborsClassifier without `p`,
# while this model passes p=1 — confirm the change of distance metric is intended.
model=KNeighborsClassifier(n_neighbors=x[0],p=1)
model.fit(xtrain,ytrain)

In [None]:
ypredictKNN=model.predict(xtest)
accuracy_score(ytest,ypredictKNN)

In [None]:
confusion_matrix(ytest,ypredictKNN)

In [None]:
print(classification_report(ytest,ypredictKNN))

# Bagging technique with KNN algorithm

In [None]:
model=KNeighborsClassifier()

In [None]:
bagging=BaggingClassifier(model,max_samples=.5,max_features=2,oob_score=True)

In [None]:
bagging.fit(X,Y)

In [None]:
bagging.oob_score_

In [None]:
# Accuracy on the same X, Y the bagging ensemble was fitted on
# (a training-set score, not a held-out estimate).
Bagging_score = bagging.score(X, Y)

# ADA boosting

In [None]:
# AdaBoost with 100 boosting rounds and the library's default weak learner.
# The explicit `base_estimator=None` was dropped: it only restated the default,
# and recent scikit-learn releases no longer accept that keyword.
ada_boost = AdaBoostClassifier(n_estimators=100)

In [None]:
ada_boost.fit(X,Y)

In [None]:
# Accuracy on the same X, Y that ada_boost was fitted on (training-set score).
ada_boost.score(X, Y)

# Gradient boosting

In [None]:
gradient_boost = GradientBoostingClassifier(n_estimators=20)

In [None]:
gradient_boost.fit(X, Y)

In [None]:
# Accuracy on the same X, Y that gradient_boost was fitted on (training-set score).
gradient_boost.score(X, Y)

# Voting classifier

In [None]:
dt=DecisionTreeClassifier()
lr=LogisticRegression()
knn=KNeighborsClassifier()

In [None]:
voting = VotingClassifier(estimators=[('dt',dt),('lr',lr),('knn',knn)],voting='hard')

In [None]:
voting.fit(xtrain,ytrain)

In [None]:
ypredictVoting = voting.predict(xtest)

In [None]:
accuracy_score(ytest, ypredictVoting)

# Logistic regression
@@@ Check the attachment file @@@

In [None]:
classifier = LogisticRegression()
classifier.fit(xtrain, ytrain)

In [None]:
ypredictLogisticRegression = classifier.predict(xtest)

In [None]:
classifier.score(xtest,ytest)

In [None]:
classifier.score(xtrain, ytrain)

# Feature selection
**Stratified K fold**

In [None]:
rfecv = RFECV(estimator=classifier, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(xtrain, ytrain)

In [None]:
print("Optimal number of features : %d" % rfecv.n_features_)

In [None]:
# Cross-validated score for each number of features tried by RFECV.
# `grid_scores_` was removed from scikit-learn (1.2); `cv_results_` is its
# replacement, so use it when present and fall back only on old releases.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
# RFECV was built with step=1, so entry i corresponds to i + 1 features.
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

In [None]:
# Re-run recursive feature elimination with the feature count RFECV chose and
# print the names of the surviving columns.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects it
# as a second positional argument.
rfe = RFE(classifier, n_features_to_select=rfecv.n_features_)
rfe = rfe.fit(xtrain, ytrain)
# support_ is a boolean mask over the columns the selector was fitted on.
print(list(xtrain.columns[rfe.support_]))

**Lasso regression**
@@@ Check the attachment file @@@

In [None]:
# Lasso regression with alpha=0.1, fitted on the full X, Y.
l=Lasso(alpha=0.1)
# `co` is whatever fit() returns; the following cells read its coef_ attribute.
co=l.fit(X, Y)

In [None]:
co.coef_   # Slope(M) value

In [None]:
names=X.columns
names

In [None]:
# Line plot of the Lasso coefficients with one rotated x tick per feature name.
tick_positions = range(len(names))
plt.plot(tick_positions, co.coef_)
plt.xticks(tick_positions, names, rotation=60)
plt.ylabel('coefficints')     # original note: a higher value indicates a stronger relationship

# Decision tree
@@@ Check the attachment file @@@

In [None]:
model=DecisionTreeClassifier()
model

In [None]:
model.fit(xtrain, ytrain)
ypredictDecisionTree=model.predict(xtest)

In [None]:
model.score(xtrain, ytrain)

In [None]:
# Test accuracy of the decision tree.
# ypredictDecisionTree was predicted from xtest, so it must be compared with
# ytest; `y_test` is a leftover from the last stratified-shuffle split and
# holds labels for a different set of rows.
accuracy_score(ytest, ypredictDecisionTree)

# Random forest

In [None]:
model=RandomForestClassifier()
model

In [None]:
model.fit(xtrain, ytrain)
ypredictRandomForest = model.predict(xtest)

In [None]:
accuracy_score(ytest, ypredictRandomForest)

# XGBoost

In [None]:
model = XGBClassifier()
model.fit(xtrain, ytrain)

In [None]:
# Predict the held-out rows and round each prediction into a plain Python list.
y_pred = model.predict(xtest)
ypredictXGBoost = list(map(round, y_pred))

In [None]:
accuracy = accuracy_score(ytest, ypredictXGBoost)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
model.score(xtrain,ytrain)

# Support Vector Machine

In [None]:
svclassifier = SVC()
svclassifier.fit(xtrain, ytrain)

In [None]:
ypredictSVM = svclassifier.predict(xtest)

In [None]:
svclassifier.score(xtrain, ytrain)

In [None]:
accuracy_score(ytest, ypredictSVM)

# Principal Component Analysis (PCA)
https://stackabuse.com/implementing-pca-in-python-with-scikit-learn/

In [None]:
# Learn the scaling from the training split only, then apply it to both splits.
# Results go to capitalised Xtrain / Xtest, distinct from the unscaled xtrain / xtest.
sc = StandardScaler().fit(xtrain)
Xtrain = sc.transform(xtrain)
Xtest = sc.transform(xtest)

In [None]:
# PCA with no component limit, fitted on the scaled training split.
pca = PCA()
Xtrain0 = pca.fit_transform(Xtrain)
# The test split is projected with the components learned from the training split.
Xtest0 = pca.transform(Xtest)

In [None]:
# Variance value of the features
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Using one PCA component
pca1 = PCA(n_components=1)
Xtrain1 = pca1.fit_transform(Xtrain)
Xtest1 = pca1.transform(Xtest)

In [None]:
# Random forest algorithm
# Depth-2 forest trained on the one-component projection, then used to
# predict the projected test split.
classifier = RandomForestClassifier(random_state=0, max_depth=2).fit(Xtrain1, ytrain)
ypredictPCA1 = classifier.predict(Xtest1)

In [None]:
print(confusion_matrix(ytest, ypredictPCA1))
accuracy_score(ytest, ypredictPCA1)

In [None]:
# Using three PCA component
pca3 = PCA(n_components=3)
Xtrain3 = pca3.fit_transform(Xtrain)
Xtest3 = pca3.transform(Xtest)

In [None]:
# Random forest algorithm
# Depth-2 forest trained on the three-component projection, then used to
# predict the projected test split.
classifier = RandomForestClassifier(random_state=0, max_depth=2).fit(Xtrain3, ytrain)
ypredictPCA3 = classifier.predict(Xtest3)

In [None]:
print(confusion_matrix(ytest, ypredictPCA3))
accuracy_score(ytest, ypredictPCA3)

# Isolation forest (Anomaly detection)
* Using a separate, generated dataset because the banking dataset can't be used for it
* https://lambda.grofers.com/anomaly-detection-using-isolation-forest-80b3a3d1a9d8
* https://towardsdatascience.com/outlier-detection-with-isolation-forest-3d190448d45e

In [None]:
# Synthetic data for the Isolation Forest demo.
# The draws from `rng` happen in the order train, test, outliers so the
# seeded data stay reproducible.
rng = np.random.RandomState(42)
feature_names = ['x1', 'x2']

# Training data: a scaled Gaussian blob stacked on top of a copy shifted by +3.
train_blob = 0.2 * rng.randn(1000, 2)
X_train = pd.DataFrame(np.concatenate([train_blob + 3, train_blob]), columns=feature_names)

# New 'normal' observations built the same way from fresh draws.
test_blob = 0.2 * rng.randn(200, 2)
X_test = pd.DataFrame(np.concatenate([test_blob + 3, test_blob]), columns=feature_names)

# Outliers: uniform draws between -1 and 5 in both features.
outlier_points = rng.uniform(low=-1, high=5, size=(50, 2))
X_outliers = pd.DataFrame(outlier_points, columns=feature_names)

In [None]:
# Isolation Forest

# Fit on the training frame only (max_samples=100, seeded with the shared rng).
clf = IsolationForest(random_state=rng, max_samples=100).fit(X_train)

# Label the training rows, the new normal rows and the generated outliers, in that order.
y_pred_train, y_pred_test, y_pred_outliers = (
    clf.predict(frame) for frame in (X_train, X_test, X_outliers)
)

In [None]:
# new, 'normal' observations 
# print("Accuracy:", list(y_pred_test).count(1)/y_pred_test.shape[0])
# Share of the generated outliers that received the label -1.
flagged_outliers = int(np.count_nonzero(y_pred_outliers == -1))
print("Accuracy:", flagged_outliers / len(y_pred_outliers))

# Comparing accuracy between the algorithms

In [None]:
# Accuracy in percent for every model trained in this notebook.
# Caveat: gradient boosting, AdaBoost and bagging are scored on the same X, Y
# they were fitted on, while the other rows are measured against ytest; the
# stratified-sampling row reports the first of the five splits only.
accuracy_rows = [
    ("Extreme Gradient Boost       -> ", accuracy_score(ytest, ypredictXGBoost)),
    ("Random forest                -> ", accuracy_score(ytest, ypredictRandomForest)),
    ("Gradient boosting            -> ", gradient_boost.score(X, Y)),
    ("ADA Boosting                 -> ", ada_boost.score(X, Y)),
    ("Voting classifier            -> ", accuracy_score(ytest, ypredictVoting)),
    ("K Nearest Neighbour          -> ", accuracy_score(ytest, ypredictKNN)),
    ("Logistic regression          -> ", accuracy_score(ytest, ypredictLogisticRegression)),
    ("Support Vector Machine       -> ", accuracy_score(ytest, ypredictSVM)),
    ("Principle Component Analysis -> ", accuracy_score(ytest, ypredictPCA3)),
    ("Bagging with KNN             -> ", Bagging_score),
    ("Decision tree                -> ", accuracy_score(ytest, ypredictDecisionTree)),
    ("Naive baiyes                 -> ", accuracy_score(ytest, ypredictNaiveBaiyes)),
    ("Stratified Sampling          -> ", accuracy_scores_stratified_sampling[0]),
]
for row_label, fraction in accuracy_rows:
    print(row_label, float(fraction * 100))