<a href="https://colab.research.google.com/github/skjsourabh95/Machine-Learning/blob/master/RiskRatingModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only setup: attach Google Drive so files stored there can be read below.
from google.colab import drive

# force_remount=True is passed so an existing mount at this path is replaced.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
# model building and pretesting with different classification models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
import json
%matplotlib inline

In [0]:
## Training the model

In [75]:
# Read the labelled transactions used for modelling.
data = pd.read_csv('/content/drive/My Drive/Data/modeling_inp.csv', encoding='utf-8')

# --- Preprocessing -----------------------------------------------------------
# Drop identifier / bookkeeping columns; they are not used as features.
data = data.drop(columns=['CUSTTYPE', 'STATUS', 'TXDATE_year', 'CUSTNO.1', 'CUSTNO', 'ACCTNO'])

# The expected-deposit columns hold "low-high" strings; split each into two
# numeric columns. `n` is passed by keyword because recent pandas releases no
# longer accept it positionally.
data[['expValCashDepositsLow', 'expValCashDepositsHigh']] = data['expValCashDeposits'].str.split('-', n=1, expand=True)
data[['expTotalDepositsLow', 'expTotalDepositsHigh']] = data['expTotalDeposits'].str.split('-', n=1, expand=True)
data = data.drop(columns=['expValCashDeposits', 'expTotalDeposits'])
range_cols = ['expValCashDepositsLow', 'expValCashDepositsHigh', 'expTotalDepositsLow', 'expTotalDepositsHigh']
data[range_cols] = data[range_cols].apply(pd.to_numeric)

# A missing beneficiary country is encoded as its own 'No' category.
# Assign the result back instead of using a chained in-place fillna, which
# does not reliably update the frame under pandas copy-on-write.
data['beneficiaryCountry'] = data['beneficiaryCountry'].fillna('No')

# Every non-numeric column is treated as categorical. Keeping the frame's own
# column order (rather than a set difference) makes the saved JSON stable
# between runs.
all_data = data.columns
numeric = data.select_dtypes(include=['number', 'bool']).columns
categorical = [col for col in all_data if col not in numeric]
cols_need_mapped = categorical

# Integer code per category, per column: {column: {category: code}}.
mapper = {
    col: {cat: code for code, cat in enumerate(data[col].astype('category').cat.categories)}
    for col in cols_need_mapped
}

# Persist the encoding so new data can be encoded the same way later.
with open('categoricalEncoding.json', 'w') as fp:
    json.dump(mapper, fp)

for c in cols_need_mapped:
    data[c] = data[c].map(mapper[c])

# NOTE: the features are intentionally left unscaled. The original cell built a
# StandardScaler-transformed frame here, but it was overwritten by the next
# assignment and never reached the models. If scaling is wanted, wrap each
# estimator in a Pipeline so the fitted scaler is saved together with the model.

# --- Train / test split ------------------------------------------------------
SEED = 45
X = data.loc[:, data.columns != 'riskRating']
y = data['riskRating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)

# --- Candidate models --------------------------------------------------------
# Estimators with internal randomness are seeded so the ranking is reproducible.
abc = []  # hold-out accuracy of each model, in the same order as `models`
classifiers = ["Nearest Neighbors", "Linear SVM", "Decision Tree", "Random Forest", "Naive Bayes", "QDA", "LogisticRegression"]
models = [
    KNeighborsClassifier(),                                # k-nearest-neighbours vote
    LinearSVC(random_state=SEED),                          # linear SVM, one-vs-rest for multi-class
    DecisionTreeClassifier(max_depth=3, random_state=SEED),  # shallow tree of simple decision rules
    RandomForestClassifier(max_depth=3, random_state=SEED),  # ensemble of shallow trees on sub-samples
    GaussianNB(),                                          # Gaussian naive Bayes
    QuadraticDiscriminantAnalysis(),                       # quadratic decision boundary from class densities
    LogisticRegression(random_state=SEED),                 # regularised logistic regression
]
for model in models:
    model.fit(X_train, y_train)
    # score() is the fraction of X_test rows whose prediction equals y_test.
    abc.append(model.score(X_test, y_test))

# Rank names, scores and fitted models together, best score first.
ranked = sorted(zip(classifiers, abc, models), key=lambda entry: entry[1], reverse=True)
classifiers = [name for name, _, _ in ranked]
abc = [score for _, score, _ in ranked]
models = [fitted for _, _, fitted in ranked]

models_dataframe = pd.DataFrame(abc, index=classifiers)
models_dataframe.columns = ['Score']

# Choose the best scoring model.
model = models[0]
# View the model scores.
models_dataframe


Unnamed: 0,Score
Decision Tree,0.892744
Random Forest,0.798107
LogisticRegression,0.726604
Nearest Neighbors,0.644585
QDA,0.640379
Naive Bayes,0.623554
Linear SVM,0.609884


In [76]:
# Persist the selected model next to the notebook (current working directory).
# joblib.dump returns the list of files it wrote, which is shown as the cell output.
joblib_file = "risk_model.pkl"
joblib.dump(
    model,
    joblib_file,
)

['risk_model.pkl']

In [77]:
# Read the saved {column: {category: code}} mapping back from disk and display it.
with open('categoricalEncoding.json', 'r') as fp:
    encoding = json.loads(fp.read())

encoding

{'CASHFLOWTYPE': {'Cr': 0, 'Dr': 1},
 'COMPANYCAT': {'Private': 0, 'Public': 1},
 'TYPEOFBUSINESS': {'Gems and Jewellery': 0, 'IT': 1},
 'addressBusinessCountry': {'India': 0, 'United States of America': 1},
 'beneficiaryCountry': {'India': 0, 'No': 1, 'USA': 2},
 'countryIncorporation': {'India': 0, 'United States of America': 1},
 'countryTaxResidency': {'India': 0, 'United States of America': 1},
 'prodCategory': {'Current': 0, 'Loan': 1},
 'prodType': {'Procurement of Inventory': 0,
  'Receivables': 1,
  'Working Capital': 2},
 'productRisk': {'Low': 0, 'Medium': 1, 'Medium High': 2},
 'txnChannel': {'Branch': 0, 'Internet': 1},
 'txnMode': {'Cash': 0, 'Funds Transfer': 1, 'Payments': 2}}

In [0]:
## Making a wrapper pre-processing function to preprocess the new data and mapping its categorical columns
def preprocess(data, mapper):
    """Turn raw transaction rows into the feature frame the model was trained on.

    Mirrors the training-time preprocessing: drops identifier/bookkeeping
    columns, splits the "low-high" deposit ranges into numeric columns, fills a
    missing ``beneficiaryCountry`` with ``'No'`` and replaces every non-numeric
    column with its integer code from ``mapper``.

    The features are returned unscaled. The training cell fits its models on
    unscaled values, and fitting a fresh StandardScaler on the rows being
    scored (as this function used to) turns a single-row input into all zeros.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows containing the same columns as the training CSV (without the
        target). The frame is not modified.
    mapper : dict
        ``{column: {category: code}}`` as saved in ``categoricalEncoding.json``.

    Returns
    -------
    pandas.DataFrame
        Encoded features. Categories absent from ``mapper[column]`` become NaN.

    Raises
    ------
    KeyError
        If an expected column is missing from ``data`` or a non-numeric column
        has no entry in ``mapper``.
    """
    # drop() returns a new frame, so the caller's data is left untouched and
    # the calling cell can be re-run safely.
    frame = data.drop(columns=['CUSTTYPE', 'STATUS', 'TXDATE_year',
                               'CUSTNO.1', 'CUSTNO', 'ACCTNO'])

    # "low-high" strings -> <name>Low / <name>High numeric columns, appended in
    # the same order as during training. `n` must be passed by keyword on
    # recent pandas releases.
    for source in ('expValCashDeposits', 'expTotalDeposits'):
        bounds = frame[source].str.split('-', n=1, expand=True)
        frame[source + 'Low'] = pd.to_numeric(bounds[0])
        frame[source + 'High'] = pd.to_numeric(bounds[1])
    frame = frame.drop(columns=['expValCashDeposits', 'expTotalDeposits'])

    # Training encodes a missing beneficiary country as the 'No' category.
    frame['beneficiaryCountry'] = frame['beneficiaryCountry'].fillna('No')

    # Every remaining non-numeric column is categorical and gets its saved code.
    categorical = frame.select_dtypes(exclude=['number', 'bool']).columns
    for column in categorical:
        frame[column] = frame[column].map(mapper[column])

    return frame

In [0]:
# A single new transaction to score; it could equally be loaded from a JSON file.
# Key order is kept as-is because it becomes the column order of the input frame.
data = {
    # customer identifier
    'CUSTNO': 46678,
    # aggregated transfer / payment totals and counts
    'Cr.Funds.Transfer.total': 0,
    'Cr.Payments.total': 22222,
    'Dr.Funds.Transfer.total': 0,
    'Cr.Funds.Transfer.count': 0,
    'Cr.Payments.count': 1,
    'Dr.Funds.Transfer.count': 0,
    'Dr.Payments.count': 2,
    # customer / account details
    'CUSTNO.1': 45879,
    'CUSTTYPE': 1,
    'ACCTNO': 123456789,
    'TYPEOFBUSINESS': 'IT',
    'COMPANYCAT': 'Private',
    'countryIncorporation': 'India',
    'countryTaxResidency': 'India',
    'addressBusinessCountry': 'India',
    # expected deposit ranges, given as "low-high" strings
    'expValCashDeposits': '0-9000',
    'expTotalDeposits': '0-33000',
    'STATUS': 'Active',
    'AVAILBALANCE': 460281.8224,
    # product details
    'prodCategory': 'Loan',
    'prodType': 'Working Capital',
    'productRisk': 'Low',
    # the transaction itself
    'TXDATE_year': 2015,
    'CASHFLOWTYPE': 'Dr',
    'TXAMT': 3187,
    'beneficiaryCountry': 'India',
    'txnChannel': 'Internet',
    'txnMode': 'Cash',
}

In [83]:
# Load the persisted model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files that you created yourself or otherwise trust.
risk_model = joblib.load(joblib_file)

# Build a one-row frame from the new transaction and encode it with the
# category codes read back from categoricalEncoding.json (`encoding`), so
# scoring relies on the saved artefacts rather than on variables left over
# from the training cell.
new_data = pd.DataFrame(data, index=[0])
processed_data = preprocess(new_data, encoding)

# When the model recorded its training column names, put the features in that
# exact order; selecting by name raises a KeyError if a feature is missing.
feature_names = getattr(risk_model, 'feature_names_in_', None)
if feature_names is not None:
    processed_data = processed_data[list(feature_names)]

# Predict the risk rating for the single transaction.
riskRating = risk_model.predict(processed_data)[0]
print("RiskRating for this Transaction is:", riskRating)

RiskRating for this Transaction is: 1
