In [93]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer # to fill the missing the value
from sklearn.preprocessing import LabelEncoder,  MinMaxScaler # labelencoder to encode the catergorical data 

In [94]:
# Read the training and test splits from the local data/ directory.
raw_train, raw_test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [95]:
raw_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [96]:
raw_train.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [97]:
raw_train.shape

(614, 13)

In [98]:
# Work on copies so the frames loaded from disk stay untouched.
train_df, test_df = raw_train.copy(), raw_test.copy()

In [99]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [100]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            362 non-null    object 
 1   Gender             351 non-null    object 
 2   Married            362 non-null    object 
 3   Dependents         353 non-null    object 
 4   Education          362 non-null    object 
 5   Self_Employed      339 non-null    object 
 6   ApplicantIncome    362 non-null    int64  
 7   CoapplicantIncome  362 non-null    int64  
 8   LoanAmount         362 non-null    int64  
 9   Loan_Amount_Term   356 non-null    float64
 10  Credit_History     333 non-null    float64
 11  Property_Area      362 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 34.1+ KB


In [101]:
# Set the target column aside as its own Series before it is removed from the features.
train_y = train_df['Loan_Status'].copy()

In [102]:
# Remove the target from the feature frame (it was saved separately as train_y).
train_df = train_df.drop(columns=['Loan_Status'])

In [103]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [104]:
# Loan_ID is unique per row (614 distinct values in 614 rows), so it carries no
# predictive signal; remove it from both splits.
for frame in (train_df, test_df):
    frame.drop(columns=['Loan_ID'], inplace=True)

In [105]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [106]:
# Show fully duplicated feature rows in the training split (this run found none).
train_df[train_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [107]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [108]:
# Drop exact duplicate rows from the test features, keeping the first occurrence.
# NOTE(review): Loan_ID has already been removed, so two different applicants with
# identical feature values are treated as duplicates here, and the index keeps a
# gap afterwards (no reset_index) - confirm both effects are intended.
test_df.drop_duplicates(inplace=True)

In [109]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [110]:
# Missing value analysis
train_df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [111]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [112]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [113]:
train_df.nunique()

Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
dtype: int64

In [114]:
# Numeric columns will be filled with the mean
# Categorical columns will be filled with the mode (most frequent) value
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

# Credit_History is stored as a float but has only two distinct values, so it is grouped with the categoricals.
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

In [115]:
# Fill missing categorical values with each column's most frequent value.
# The imputer is fitted on the training split only and then applied to both
# splits, so the test data never influences the fill values.
# fit() is used instead of fit_transform() because the transformed result of the
# fitting call was never used.
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(train_df[cat_cols])

train_df[cat_cols] = cat_imputer.transform(train_df[cat_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [116]:
# Fill missing numeric values with the column mean learned from the training
# split; the same training means are reused for the test split.
# fit() is used instead of fit_transform() because the transformed result of the
# fitting call was never used.
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(train_df[num_cols])

train_df[num_cols] = num_imputer.transform(train_df[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

In [117]:
# check for missing values again
train_df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [118]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban


In [119]:
# Add the co-applicant's income into ApplicantIncome so that column holds the
# combined income, then discard the now-redundant CoapplicantIncome column.
# Note: this cell is not re-runnable - a second run fails because
# CoapplicantIncome no longer exists.
for frame in (train_df, test_df):
    frame['ApplicantIncome'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    frame.drop(columns='CoapplicantIncome', inplace=True)

In [120]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,146.412162,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,6091.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,4941.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,141.0,360.0,1.0,Urban


In [121]:
# Application of the label Encoder
train_df.nunique()

Gender                2
Married               2
Dependents            4
Education             2
Self_Employed         2
ApplicantIncome     554
LoanAmount          204
Loan_Amount_Term     11
Credit_History        2
Property_Area         3
dtype: int64

In [122]:
train_df.Dependents.unique() # ordinal data, so applying a label encoder is fine
# for nominal data a one-hot encoder is used instead

array(['0', '1', '2', '3+'], dtype=object)

In [123]:
train_df.Property_Area.unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [124]:
# Encode each categorical column as integers.
# The encoder is fitted on the training column only and reused for the test
# column, so a given category always maps to the same integer in both splits.
# (Re-fitting on the test column could assign different codes whenever the test
# split is missing a category that the training split has.)
# transform() raises ValueError if the test column contains a category that was
# never seen in training - that is a data problem worth surfacing, not hiding.
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [125]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,146.412162,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


In [126]:
train_df.Dependents.unique()

array([0, 1, 2, 3])

In [127]:
# CoapplicantIncome was merged into ApplicantIncome and dropped, so take it out of
# the numeric column list. Rebuilding the list (instead of list.remove) keeps this
# cell safe to re-run: remove() raises ValueError once the name is already gone.
num_cols = [name for name in num_cols if name != 'CoapplicantIncome']

In [128]:
# Apply a natural-log transform to the numeric columns of both splits.
# np.log needs strictly positive inputs: a 0 becomes -inf and a negative value NaN.
for frame in (train_df, test_df):
    frame[num_cols] = np.log(frame[num_cols])

In [129]:
# Scaling: map every training feature to the [0, 1] range.
# The scaler learns each column's min/max from the training split only; the test
# split is transformed with those same statistics, so both splits share one
# scale and no test information leaks into preprocessing. Test values outside
# the training range therefore land slightly outside [0, 1], which is expected.
# Note: fit_transform/transform return NumPy arrays, so train_df and test_df are
# no longer DataFrames after this cell.
minmax = MinMaxScaler()
train_df = minmax.fit_transform(train_df)
test_df = minmax.transform(test_df)

In [132]:
# Building the model
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled training data for evaluation; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(train_df, train_y, test_size=0.3, random_state=0)

In [133]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with scikit-learn's default settings on the training portion.
log = LogisticRegression()
log.fit(X_train, y_train)

In [134]:
y_pred_test = log.predict(X_test)

In [136]:
from sklearn.metrics import accuracy_score

# Accuracy = fraction of held-out rows whose predicted label equals the true label.
acc = accuracy_score(y_test, y_pred_test)
print("accuracy: ", acc)

accuracy:  0.827027027027027


# Challenges in Jupyter Notebook
1. Using the notebook's code as part of a software product.
2. Ensuring that the library versions used during experimentation/training are the same ones used in the production environment.
Solution - create a virtual environment.
3. Working on two different projects. Say project A needs Python 3.5 and NumPy 1.5,
- while project B needs Python 3.10 and NumPy 1.7.
When different projects need different versions of the same library, use a separate virtual environment for each (venvA and venvB); each environment then stays in sync with its own project's requirements.

# Dependencies to take care of
1. Other people need the same set of libraries to run our code. To make sure these dependencies are handled, create a requirements file: a plain text file listing every package the project needs to run, each with its specific version.
"pip freeze > requirements.txt"

Other teams can then install the same packages by running "pip install -r requirements.txt"

# Serialization and Deserialization of ML Models
1. Serialization is the process of converting a Python object hierarchy into a byte stream; deserialization is the reverse, rebuilding the object from the byte stream.
2. In other words, the variables I have created, or the model object, are stored as a file.
3. Storing the model as a file lets us transfer the saved model from one environment to another.
4. To perform serialization and deserialization we can use the joblib library; pickle can also be used.
5. Save the object to an external file, then load that file back into an object.

In [140]:
# Serialization: persist the fitted classifier to a file on disk
import joblib

# NOTE(review): only the classifier is saved here. The fitted imputers, label
# encoders and scaler are not saved, yet new data must go through the same
# preprocessing before predict() - consider persisting them as well.
joblib.dump(log, 'my_trained_model_v1.pkl') # the extension is a naming convention (.pkl / .joblib are common)

['my_trained_model_v1.pkl']

In [139]:
# Deserialization: rebuild the classifier object from the saved file.
# joblib.load is pickle-based and can execute arbitrary code, so only load files from a trusted source.
final_model = joblib.load('my_trained_model_v1.pkl')

In [142]:
final_model.intercept_, final_model.coef_

(array([-2.05555765]),
 array([[ 0.02418858,  0.35465295,  0.39582558, -0.44377574, -0.04824404,
          0.0332581 , -0.82294892,  0.2392391 ,  3.17979337,  0.22542767]]))

In [143]:
log.intercept_, log.coef_

(array([-2.05555765]),
 array([[ 0.02418858,  0.35465295,  0.39582558, -0.44377574, -0.04824404,
          0.0332581 , -0.82294892,  0.2392391 ,  3.17979337,  0.22542767]]))

# Sharing files for deployment, or testing the Python code (pytest)
1. After the above process, we share the model file with the deployment team.
2. The deployment team may or may not know about the preprocessing, whether the model is working, or whether the data is valid.
3. Along with the model, we have to create a test script for the model file.
4. We share the test script so that they can validate that the model is working.
5. It will be a Python script file with a .py extension.
6. The Python notebook has all the steps required, from model building through prediction and evaluation.
7. The notebook itself cannot be used in a production environment.


# Challenges in using a notebook in production
1. In a production environment we work on servers, which are usually Linux-based systems.
2. I don't have a UI to see what is happening there.
3. This makes it very difficult to debug.
4. If we face an issue in a notebook, fixing it often requires changes in multiple places.
5. Lots of dependencies.
6. There is no modularity in the code.
7. Modularity means splitting code into small, self-contained chunks.
8. Conflicts between variable and function names.
9. Duplicated code snippets.

Solution - 
1. Write Python scripts
2. Follow a modular approach
3. Create a package

# Modular Programming
Module in python
    A python file that can hold classes, functions and variables 

Packages in python
    1. One or more modules, such that they are interlinked with each other
    2. A directory (optionally with subdirectories) can be called a package if it contains an `__init__.py` file

# Deployment Questions
1. Who are the end users of this application?
2. Will our app be installed in other developers' development environments, in case they need to test it?
3. Where will it run?

## Package and Module

In [144]:
import PackageA

In [149]:
from PackageA import f1, f2

In [147]:
f1.print_something()

'output from f1'

In [150]:
f2.print_something()

'output from f2'

In [151]:
from PackageA.f1 import print_something as f1p

In [152]:
f1p()

'output from f1'

In [153]:
from PackageA.SubPackageA import f3

In [154]:
f3.print_something()

'output from f3'

In [155]:
from PackageA.SubpackageB import f5

In [156]:
f5.print_something()

'output from f5'

# Which Dir system is looking

In [157]:
import sys

In [158]:
sys.path

['/home/codespace/.python/current/lib/python310.zip',
 '/home/codespace/.python/current/lib/python3.10',
 '/home/codespace/.python/current/lib/python3.10/lib-dynload',
 '',
 '/home/codespace/.local/lib/python3.10/site-packages',
 '/home/codespace/.python/current/lib/python3.10/site-packages']

In [161]:
# to add personal dir append it
# sys.path.append('/home/codespace/.python/current/lib')

# Building the ML Model Package
1. Create the package.
2. Maintain separate modules.
3. Maintain separate files for preprocessing, data handling, manual configuration, etc.
4. Build test cases to verify integrity.
5. We will be using setuptools.
6. Install Twine. This is required to upload our project to PyPI.
7. setup.py is the most important file.
8. The README.md file is also very important.
9. MANIFEST.in is needed when we have to package additional files that are not automatically included in the source distribution.
10. The manifest file consists of commands, one per line, instructing setuptools to add or remove sets of files from the sdist (source distribution).
11. The sdist is generally an archive of the project's files, such as data files and the setup.py file, in a compressed tar format.


# Practices to follow in Modularization
1. Wherever a custom (user-supplied) input is used, put it in the configuration file.
2. Keep the configuration file as a module, i.e. a .py file.
3. Typical entries: the data folder, the trained model location, the test file name, etc.
4. 