In [0]:
%pip install suds
%pip install googletrans==3.1.0a0


Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
from suds.client import Client
from pyspark.sql.types import *
from pyspark.sql.functions import col, expr,when,concat,lit,sum
from googletrans import Translator, constants
from pprint import pprint

<h3> Turkish Identification Number Check

In [0]:
def verifyTCID(customer):
    wsdl = "https://tckimlik.nvi.gov.tr/Service/KPSPublic.asmx?op=TCKimlikNoDogrula&wsdl"
    client = Client(wsdl)
    result = client.service.TCKimlikNoDogrula(int(customer.id), customer.name, customer.surname, customer.birthdate[4:8])
    return result

In [0]:
def checkSimilarity(text, test):
    match = 0
    wordCount = 0   
    def calculateSimilarity(word1, word2):
        temp = 0
        wordCount = 0
  
        if (len(word1) > len(word2)):
            for i in range(0,len(word2)):              
                if word1[i] == word2[i]: temp += 1
        else:
            for i in range(0,len(word1)):
                if word1[i] == word2[i]: temp += 1
        return temp
            
    for i in range(0,len(text)):
        note = calculateSimilarity(text[i],test[i])
        match += note
        wordCount += ((len(text) + len(test))/2)
    
    idCheck = calculateSimilarity(text[0],test[0])
    if(idCheck/len(text[0]) == 1):
        return 100
    else:
        riskNote = '{:.0f}'.format((match/wordCount)*100)  
        return int(riskNote)
     
    

In [0]:
def isValidTCID(value):
    value = str(value)
    
    if not len(value) == 11:
        return False
    
    if not value.isdigit():
        return False
    
    if int(value[0]) == 0:
        return False
    
    digits = [int(d) for d in str(value)]
    
    digitsSum = 0 
    for i in digits[:10]:
        digitsSum+=i
    
    if not digitsSum % 10 == digits[10]:
        return False
    
    digitsSum2 = (digits[0]+digits[2]+digits[4]+digits[6]+digits[8])*7
    digitsSum3 = digits[1]+digits[3]+digits[5]+digits[7]
    
    if not (((digitsSum2 - digitsSum3)) % 10) == digits[9]:
        return False
    
    return True

In [0]:
schema = (StructType([
            StructField("ID",StringType(),False),
            StructField("Name",StringType(), False),
            StructField("Surname",StringType() , False),
            StructField("City" ,StringType(), False),
            StructField("Phone" ,StringType(),  False),
            StructField("Birthdate" ,StringType() , False),
            StructField("Origin", StringType(), False)
]))

In [0]:
data = spark.read.format("csv").option("schema",schema).option("header","true").load("dbfs:/FileStore/customer__1_.csv")
data.show(truncate = False , n = 5)

+-----------+------+-------+-------+----------+---------+------+
|ID         |Name  |Surname|City   |Phone     |Birthdate|Origin|
+-----------+------+-------+-------+----------+---------+------+
|66138141171|Murat |AK     |Ardesen|1235481257|15011995 |TR    |
|25727912644|Keriz |L      |Bilecik|1235012248|15011995 |TR    |
|53783768355|Onur  |D      |Bilecik|7412541596|28101999 |TR    |
|73665332727|Sadik |N      |Hatay  |1235481257|20101999 |TR    |
|16209955171|Berfin|N      |Usak   |5052424540|12032005 |TR    |
+-----------+------+-------+-------+----------+---------+------+
only showing top 5 rows



In [0]:
data.createOrReplaceTempView("customer")

In [0]:
df = data.rdd.collect()

<h3>ID Verification

In [0]:
class Customer:
      def __init__(self, id, name,surname,country,phoneNumber,birthdate,origin):
        self.id = id
        self.name = name
        self.surname = surname
        self.country= country
        self.phoneNumber = phoneNumber
        self.birthdate = birthdate
        self.origin = origin


In [0]:
def verifyCustomer(customer):
    
    if(customer.origin == "TR"):
        if(isValidTCID(testCustomer.id) == False):
            raise Exception("Identification Number is NOT VALID")
        else:
            if(verifyTCID(testCustomer) == False):
                raise Exception("Customer Information is NOT VALID")
            
    elif(customer.origin == "AR"):
        origin_name = customer.name
        translationName = translator.translate(origin_name)
        translated_name = translationName.text       
        customer.name = translated_name
        
        origin_surname = customer.surname
        translationSurname = translator.translate(origin_surname)
        translated_surname = translationSurname.text       
        customer.surname = translated_surname
        
    elif(customer.origin == "RU"):
        origin_name = customer.name
        translationName = translator.translate(origin_name)
        translated_name = translationName.text       
        customer.name = translated_name
        
        origin_surname = customer.surname
        translationSurname = translator.translate(origin_surname)
        translated_surname = translationSurname.text       
        customer.surname = translated_surname
    
    return customer
        
              
        

In [0]:
verified_customer = verifyCustomer(testCustomer)

<h3> Checking Similarities with Customers

In [0]:
similarityList = list()
testData = [verified_customer.id,verified_customer.name,verified_customer.surname,verified_customer.country,verified_customer.phoneNumber,verified_customer.birthdate]
for row in df:
    temp = [verified_customer.id,\
            row['ID'],\
            checkSimilarity(testData,row)]
    similarityList.append(temp)

    

In [0]:
similarity_result_schema = (StructType([
            StructField("NewCustomer_ID",StringType(),False),
            StructField("OldCustomer_ID",StringType(), False),
            StructField("Similarity", IntegerType() , False),

]))

In [0]:
rdd = spark.sparkContext.parallelize(similarityList)
similarity_result_df = spark.createDataFrame(rdd,similarity_result_schema)
similarity_result_df.createOrReplaceTempView("SimilarityResult")

In [0]:
spark.sql("SELECT * FROM SimilarityResult \
           WHERE Similarity >\
           (SELECT AVG(Similarity) FROM SimilarityResult)*2 \
           ORDER BY Similarity DESC\
           LIMIT 10").show()

+--------------+--------------+----------+
|NewCustomer_ID|OldCustomer_ID|Similarity|
+--------------+--------------+----------+
|   26536894968|   61244638969|        62|
|   26536894968|   49829631862|        56|
|   26536894968|   18576971572|        56|
|   26536894968|   48554848938|        56|
|   26536894968|   35696328729|        56|
|   26536894968|   45961703532|        54|
|   26536894968|   12533265561|        54|
|   26536894968|   85020621679|        54|
|   26536894968|   87364754480|        54|
|   26536894968|   50065932124|        54|
+--------------+--------------+----------+



In [0]:
blacklist = spark.sql("SELECT OldCustomer_ID, Similarity FROM SimilarityResult  WHERE Similarity > 75")

In [0]:
blacklist.show()

+--------------+----------+
|OldCustomer_ID|Similarity|
+--------------+----------+
+--------------+----------+



In [0]:
customer_df =spark.sql("SELECT * FROM SimilarityResult ORDER BY Similarity DESC LIMIT 1")
topCustomer = customer_df.select(col("Similarity")).collect()
if(topCustomer[0][0] > 75):
    raise Exception('Suspicious customer')

<h3>Checking Similarities with Criminals

In [0]:
criminal_data = spark.read.format("csv").option("schema",schema).option("header","true").load("dbfs:/FileStore/criminals.csv")
criminal_data.createOrReplaceTempView("CriminalData")
criminal_df = criminal_data.rdd.collect()

In [0]:
criminal_similarityList = list()
criminalTestData = [verified_customer.id,verified_customer.name,verified_customer.surname,verified_customer.country,verified_customer.phoneNumber,verified_customer.birthdate]
for row in criminal_df:
    temp = [verified_customer.id,\
            row['ID'],\
            checkSimilarity(criminalTestData,row)] #Check İşlemi
    criminal_similarityList.append(temp)

In [0]:
criminal_rdd = spark.sparkContext.parallelize(criminal_similarityList)
criminal_similarity_result_df = spark.createDataFrame(criminal_rdd,similarity_result_schema)
criminal_similarity_result_df.createOrReplaceTempView("CriminalSimilarityResult")

In [0]:
spark.sql("SELECT * FROM CriminalSimilarityResult ORDER BY Similarity DESC LIMIT 10").show()

+--------------+--------------+----------+
|NewCustomer_ID|OldCustomer_ID|Similarity|
+--------------+--------------+----------+
|   26536894968|   39729672676|        46|
|   26536894968|   26559638058|        44|
|   26536894968|   22771831421|        41|
|   26536894968|   98955489188|        41|
|   26536894968|   84561321789|        38|
|   26536894968|   43017781760|        36|
|   26536894968|   13977399327|        36|
|   26536894968|   76528650897|        36|
|   26536894968|   47789457596|        33|
|   26536894968|   21806302923|        31|
+--------------+--------------+----------+



In [0]:
crime_df =spark.sql("SELECT * FROM CriminalSimilarityResult ORDER BY Similarity DESC LIMIT 1")
topCriminal = crime_df.select(col("Similarity")).collect()
if(topCriminal[0][0] > 75):
    raise Exception('It is not appropriate to open an account with the customer.')

<h6> Output Tests

In [0]:
testCustomerAR = Customer("26536894968","سامي","توجال","Rize","5350644305","28101999","AR")
origin_name_ar = testCustomerAR.name
origin_surname_ar = testCustomerAR.surname
translated_test_customer_ar = verifyCustomer(testCustomerAR)
print("Origin =>"+origin_name_ar + " -- Translated Version =>  "  +translated_test_customer_ar.name)
print("Origin =>"+origin_surname_ar + " -- Translated Version =>  "  +translated_test_customer_ar.surname)

Origin =>سامي -- Translated Version =>  Sami
Origin =>توجال -- Translated Version =>  tugal


In [0]:
testNotValidID = Customer("00000000000","Sami","Tuğal","Rize","5053265896","30101999","TR")
verifyTest = verifyCustomer(testNotValidID)

[0;31m---------------------------------------------------------------------------[0m
[0;31mException[0m                                 Traceback (most recent call last)
[0;32m<command-966537111543392>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0mtestNotValidID[0m [0;34m=[0m [0mCustomer[0m[0;34m([0m[0;34m"00000000000"[0m[0;34m,[0m[0;34m"Sami"[0m[0;34m,[0m[0;34m"Tuğal"[0m[0;34m,[0m[0;34m"Rize"[0m[0;34m,[0m[0;34m"5053265896"[0m[0;34m,[0m[0;34m"30101999"[0m[0;34m,[0m[0;34m"TR"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mverifyTest[0m [0;34m=[0m [0mverifyCustomer[0m[0;34m([0m[0mtestNotValidID[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m<command-966537111543370>[0m in [0;36mverifyCustomer[0;34m(customer)[0m
[1;32m      6[0m         [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m      7[0m             [0;32mif[0m[0;34m([0m[0mverifyTCID[0m[0;34m([0m[0mtestCustomer[0m[0;34