<a href="https://colab.research.google.com/github/revanthmadasu/machine-learning/blob/nallam-project/nallam-project1/Project_1_scoring_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Scoring
Write function that will load artifacts from above, transform and score on a new dataset. Your function should return Python list of labels. For example: [0,1,0,1,1,0,0]

In [15]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.2 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


### Artifact Class
In order to create object from artifact, python needs the class definition. 
This provides definition to python to create model_formation object from artifact

In [18]:
from category_encoders import BinaryEncoder
import pandas as pd
class ModelFormation:
    def process_encode_data(self, data):
        processed_data = data.loc[:,data.columns!='index']

        def dollar_string_to_float(dollar_str):
          dollar_str = dollar_str.replace(',', '')
          dollar_float = float(dollar_str[1:])
          return dollar_float
        # converting dollar to float
        currency_cols = ['DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv']
        for colname in currency_cols:
          processed_data[colname] = processed_data[colname].apply(dollar_string_to_float)

        processed_data = processed_data.dropna()
        
        processed_data['MIS_Status'] = processed_data['MIS_Status'].replace('P I F', 0)
        processed_data['MIS_Status'] = processed_data['MIS_Status'].replace('CHGOFF', 1)

        processed_data['RevLineCr'] = processed_data['RevLineCr'].astype('category')

        processed_data['RevLineCr'] = processed_data['RevLineCr'].replace('N', 0)
        processed_data['RevLineCr'] = processed_data['RevLineCr'].replace('0', 0)
        processed_data['RevLineCr'] = processed_data['RevLineCr'].replace('Y', 1)
        for remove_cat in processed_data['RevLineCr'].cat.categories.tolist():
          if remove_cat not in [0,1]:
            processed_data = processed_data[processed_data['RevLineCr'] != remove_cat]
            processed_data['RevLineCr'] = processed_data['RevLineCr'].cat.remove_categories(remove_cat)

        processed_data['NewExist'] = processed_data['NewExist'].replace(1, 0)
        processed_data['NewExist'] = processed_data['NewExist'].replace(2, 1)
        
        processed_data['LowDoc'] = processed_data['LowDoc'].astype('category')
        processed_data['LowDoc'] = processed_data['LowDoc'].replace('Y', 1)
        processed_data['LowDoc'] = processed_data['LowDoc'].replace('N', 0)
        processed_data['LowDoc'] = processed_data['LowDoc'].replace('0', 0)
        processed_data['LowDoc'] = processed_data['LowDoc'].replace('1', 1)
        for remove_cat in processed_data['LowDoc'].cat.categories.tolist():
          if remove_cat not in [0,1]:
            processed_data = processed_data[processed_data['LowDoc'] != remove_cat]
            processed_data['LowDoc'] = processed_data['LowDoc'].cat.remove_categories(remove_cat)
        processed_data['LowDoc'].value_counts()

        s_processed_data = processed_data.sample(frac=0.5, random_state=27)

        cat_cols_bin_en = ['City', 'State', 'Bank', 'BankState', 'Zip', 'NAICS', 'UrbanRural']

        from category_encoders import BinaryEncoder
        import pandas as pd
        bin_encoder = BinaryEncoder(cols=cat_cols_bin_en)
        bin_encoded_data = bin_encoder.fit_transform(processed_data)

        return bin_encoded_data

    def test_train_split(self, encoded_data):
        from sklearn.model_selection import train_test_split

        X = encoded_data.iloc[:, :-1].values
        y = encoded_data.iloc[:, -1].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=44)

        self.x_cols_to_score = encoded_data.iloc[:, :-1].columns

        return X_train, X_test, y_train, y_test
    def get_cols_to_score(self, start=66, end=-1):
      return self.x_cols_to_score[start:end]

    def get_model(self, data):
      encoded_data = self.process_encode_data(data)
      X_train, X_test, y_train, y_test = self.test_train_split(encoded_data)
      from sklearn.linear_model import LogisticRegression

      lr_model = LogisticRegression(C= 1.0, fit_intercept= False, max_iter= 100, penalty= 'l2', solver='newton-cg')
      lr_model.fit(X_train, y_train)

      return lr_model


    def __init__(self, train_model = False, data = None):
      # remove index
      if train_model:
        self.trained_model = self.get_model(data)


In [57]:
def project_1_scoring(data):
    """
    Function to score input dataset.
    
    Input: dataset in Pandas DataFrame format
    Output: Python list of labels in the same order as input records
    
    Flow:
        - Load artifacts
        - Transform dataset
        - Score dataset
        - Return labels
    
    """
    from sklearn.preprocessing import OneHotEncoder
    from copy import deepcopy
    from sklearn.linear_model import LogisticRegression
    import pickle
    import numpy as np
    import pandas as pd
    
    X = data.copy()
    
    '''Load Artifacts'''
    artifacts_dict_file = open("./artifacts/artifacts_dict_file.pkl", "rb")
    artifacts_dict = pickle.load(file=artifacts_dict_file)

    model_formation = artifacts_dict['model']
    X_train, X_test, y_train, y_test = model_formation.test_train_split(model_formation.process_encode_data(data))
    indices = range(len(X_test))
    clf = model_formation.trained_model
    threshold = 0.5
    cols_to_score = list(range(len(model_formation.x_cols_to_score)))
    y_pred_proba = clf.predict_proba(X_test)
    y_pred = (y_pred_proba[:,0] < threshold).astype(np.int16)
    d = {"index": indices,
         "label":y_pred,
         "probability_0":y_pred_proba[:,0],
         "probability_1":y_pred_proba[:,1]}
    
    return pd.DataFrame(d)

In [59]:
data = pd.read_csv('./data/SBA_loans_project_1.zip')
project_1_scoring(data).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data[colname] = processed_data[colname].apply(dollar_string_to_float)


Unnamed: 0,index,label,probability_0,probability_1
0,0,0,0.700336,0.299664
1,1,0,0.556826,0.443174
2,2,0,0.856169,0.143831
3,3,0,0.774973,0.225027
4,4,0,0.821729,0.178271
