<a href="https://colab.research.google.com/github/sadullahmath/SeaBorn/blob/master/ML_Pipeline_for_Modeling_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML Pipeline for Modeling and Prediction

In [0]:
import pandas as pd

In [0]:
#Loading data from the GitHub repository to colab notebook
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data '

In [43]:
credData = pd.read_csv(filename,sep= ",",header = None, na_values = "?")
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [44]:
# Changing the Classes to 1 & 0
credData.loc[credData[15] == '+ ' , 15] = 1
credData.loc[credData[15] == '- ' , 15] = 0
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [45]:
# Finding number of null values in the data set
credData.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [0]:
# Dropping all the rows with na values
newcred = credData.dropna(axis = 0)

In [47]:
# Printing the shape of earlier data set and new data set
print(credData.shape)
print(newcred.shape)

(690, 16)
(653, 16)


In [48]:
# Separating X and y variables
X = newcred.loc[:,0:14]
print(X.shape)

y = newcred.loc[:,15]
print(y.shape)

(653, 15)
(653,)


In [49]:
from sklearn.model_selection import train_test_split
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

print(X_train.shape)
print(X_test.shape)

(457, 15)
(196, 15)


In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [0]:
# Pipeline for transforming categorical variables
catTransformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [0]:
# Pipeline for scaling numerical variables
numTransformer = Pipeline(steps=[('scaler', StandardScaler())])

In [53]:
# Printing dtypes for X
X.dtypes

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
dtype: object

In [54]:
# Selecting numerical features
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns
numFeatures

Int64Index([1, 2, 7, 10, 13, 14], dtype='int64')

In [55]:
# Selecting Categorical features
catFeatures = X.select_dtypes(include=['object']).columns
catFeatures

Int64Index([0, 3, 4, 5, 6, 8, 9, 11, 12], dtype='int64')

In [0]:
# Creating the preprocessing engine
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numTransformer, numFeatures),
        ('categoric', catTransformer, catFeatures)])

In [57]:
# Transforming the Training data
Xtran_train = pd.DataFrame(preprocessor.fit_transform(X_train))
print(Xtran_train.shape)
Xtran_train.head()

(457, 46)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45
0,0.105658,-0.4449,1.377002,-0.553206,0.570065,-0.174241,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-1.084238,1.115032,-0.528306,-0.553206,-0.60247,-0.167337,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.416675,-0.080916,0.592889,-0.327276,-0.367963,-0.174241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,-0.795428,1.418699,-0.189778,-0.553206,-0.485217,0.024974,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-1.125497,0.439061,-0.636809,-0.553206,-0.25071,-0.174241,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [58]:
# Transforming Test data
Xtran_test = pd.DataFrame(preprocessor.transform(X_test))
print(Xtran_test.shape)
Xtran_test.head()

(196, 46)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45
0,-0.059376,-0.531217,-0.623789,-0.553206,0.687319,-0.174241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-1.063609,-0.878562,-0.600642,-0.327276,0.101051,-0.174076,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,0.64862,1.929316,1.847181,0.802371,-0.661097,-0.174241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,2.203242,3.402933,2.245025,2.383877,-1.071485,0.927028,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4,-0.451332,-0.644572,-0.612215,-0.553206,-0.485217,-0.174241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [0]:
# Importing PCA library
from sklearn.decomposition import PCA

In [0]:
# Creating an estimator with both preprocessor and dimensionality reduction
estimator = Pipeline(steps=[('preprocessor', preprocessor), ('dimred', PCA(10))])

In [64]:
# Fitting and transforming Training set
Xtran_train = pd.DataFrame(estimator.fit_transform(X_train))

print(Xtran_train.shape)
Xtran_train.head()

(457, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.456911,0.857577,-1.231989,0.902396,1.604191,-0.284921,-0.595444,0.206836,0.027712,0.742267
1,-0.758102,-1.279315,1.162158,0.397572,0.031973,1.236864,0.353098,-0.020558,0.561482,0.613476
2,0.387754,-0.022255,-0.082482,-0.524931,0.0893,0.300113,-1.25766,-0.191124,-0.376516,-0.367365
3,-0.332061,-0.636192,0.825248,0.798001,0.435375,1.377995,-0.578766,0.030524,-0.900729,0.620234
4,-1.41278,-0.707406,0.607928,0.54958,1.582078,-0.11971,0.496112,0.597986,-0.133551,0.032972


In [65]:
# Transforming test set
Xtran_test = pd.DataFrame(estimator.transform(X_test))
print(Xtran_test.shape)
Xtran_test.head()

(196, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.299051,0.187772,-0.23137,0.112879,-0.484604,0.369499,0.28216,1.09115,-0.062456,0.077569
1,-1.494398,-0.200785,0.231369,-0.60963,1.235941,-1.063417,0.259277,0.779575,0.086378,0.07871
2,2.829701,-0.298786,-0.099139,0.24561,0.638466,0.991274,-0.769735,0.040185,-0.614251,0.164817
3,5.259748,-0.456795,0.789554,1.150056,-0.033996,0.487041,1.095085,-0.113758,0.515659,0.520806
4,-1.31073,-0.695854,0.14146,0.215672,-0.506067,0.058389,-0.324188,0.963671,0.032933,0.043535


In [0]:
# Importing necessary libraries
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [0]:
# Creating the estimator pipeline for model building
estimator = Pipeline(steps=[('preprocessor', preprocessor),('dimred', PCA(10)),('clf',LogisticRegression(random_state=123))])

In [68]:
# Fitting the modelling pipeline on the training set
estimator.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  Int64Index([1, 2, 7, 10, 13, 14], dtype='int64')),
                                                 ('categoric',
                     

In [69]:
# Creating the score on the test set
estimator.score(X_test, y_test)

0.8877551020408163

In [0]:
# Generating the predictions on test set
pred = estimator.predict(X_test)

In [71]:
# Printing the classification report
from sklearn.metrics import classification_report
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           +       0.88      0.88      0.88        89
           -       0.90      0.90      0.90       107

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



In [72]:
# Generating confusion matrix
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

[[78 11]
 [11 96]]
