In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import pandas as pd

# Data Load

In [7]:
# Read the claims CSV once into df_raw and leave it untouched; all exploration
# below runs on df, an independent copy, so the raw frame stays recoverable.
df_raw = pd.read_csv("../data/fraud_oracle.csv")
df = df_raw.copy()

# Basic EDA

## Each Column View

In [13]:
# Dimensions of the working frame as (rows, columns).
df.shape

(15420, 33)

In [17]:
# Inspect the row index of the working frame.
df.index

RangeIndex(start=0, stop=15420, step=1)

In [14]:
# List every column label of the working frame.
df.columns

Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber',
       'Deductible', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year',
       'BasePolicy'],
      dtype='object')

In [15]:
# Print a concise summary: non-null count and dtype for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [22]:
# Walk through every column and print, in order: its name, its distinct values,
# the number of distinct values, the count of nulls, and its dtype, followed by
# an empty line separating one column's report from the next.
for col_name in df.columns:
    column = df[col_name]  # look the column up once and reuse it below
    print(col_name)
    print(column.unique())
    print(f'nunique : {column.nunique()}')
    print(column.isnull().sum())
    print(column.dtype)
    print("")

Month
['Dec' 'Jan' 'Oct' 'Jun' 'Feb' 'Nov' 'Apr' 'Mar' 'Aug' 'Jul' 'May' 'Sep']
nunique : 12
0
object

WeekOfMonth
[5 3 2 4 1]
nunique : 5
0
int64

DayOfWeek
['Wednesday' 'Friday' 'Saturday' 'Monday' 'Tuesday' 'Sunday' 'Thursday']
nunique : 7
0
object

Make
['Honda' 'Toyota' 'Ford' 'Mazda' 'Chevrolet' 'Pontiac' 'Accura' 'Dodge'
 'Mercury' 'Jaguar' 'Nisson' 'VW' 'Saab' 'Saturn' 'Porche' 'BMW' 'Mecedes'
 'Ferrari' 'Lexus']
nunique : 19
0
object

AccidentArea
['Urban' 'Rural']
nunique : 2
0
object

DayOfWeekClaimed
['Tuesday' 'Monday' 'Thursday' 'Friday' 'Wednesday' 'Saturday' 'Sunday'
 '0']
nunique : 8
0
object

MonthClaimed
['Jan' 'Nov' 'Jul' 'Feb' 'Mar' 'Dec' 'Apr' 'Aug' 'May' 'Jun' 'Sep' 'Oct'
 '0']
nunique : 13
0
object

WeekOfMonthClaimed
[1 4 2 3 5]
nunique : 5
0
int64

Sex
['Female' 'Male']
nunique : 2
0
object

MaritalStatus
['Single' 'Married' 'Widow' 'Divorced']
nunique : 4
0
object

Age
[21 34 47 65 27 20 36  0 30 42 71 52 28 61 38 41 32 40 63 31 45 60 39 55
 35 44 72 29 37 59

자료형은 다음 다섯 가지로 나눈다.

* conti : 연속형 (continuous)
* count : 이산 개수형 (count)
* ord : 범주형-순서형 (ordinal)
* mul : 범주형-다항 (multinomial)
* bin : 범주형-이항 (binary)

이외에 id type, FK type 등도 고려하자. 상황에 따라 다르다.


그 외에 데이터의 저장 형식(dtype)도 int64, object 등 다양하다. 
통계적 자료형과 저장 형식(dtype)은 구분해서 기록하자. 

* Month/WeekOfMonth/DayOfWeek : Jan~Dec, 1~5, Sunday~Saturday
* Make : Maker. categorical-Multinomial
* AccidentArea : 도시/시골. categorical-binary
* MonthClaimed/DayOfWeekClaimed/WeekOfMonthClaimed : 청구한 월/요일/주차. 특이하게 MonthClaimed와 DayOfWeekClaimed에는 '0' 값이 존재. WeekOfMonthClaimed에는 없음. 
* Sex : 생물학적 성. categorical-binary
* MaritalStatus : 결혼 상태 (Single/Married/Widow/Divorced 네 가지). cat-mul
* Fault : 사고의 과실 주체로 보임. Third Party면 제3자에게 과실이 있다는 의미. categorical-bin
* Age : 나이. conti.
* PolicyType : 잘 모르겠음. 의미 공부해 볼 필요 있음. cat-multi. 'Sedan - Collision'처럼 VehicleCategory와 BasePolicy가 결합된 값으로 보이므로, 두 가지 분류로 잘라서 정규화할 필요도 있을 듯?
* VehiclePrice : 가격. 연속형(conti) 값이 구간(예: 20000 to 29000)으로 묶여 순서형(ord) 데이터로 정리되어 있음. 
* FraudFound : 1이면 사기, 0이면 사기 아님. 
* PolicyNumber : id
* RepNumber : 잘 모르겠음. describe() 기준 1~16 범위의 정수.
* Deductible : 잘 모름. 자기부담금(공제액)으로 추정. describe() 기준 300~700.
* DriverRating : 운전자 평가. cat-mul
* Days_Policy_Accident : 보험 가입 후 사고 발생까지의 일수(구간)로 추정. 확인 필요. cat-ord
* Days_Policy_Claim : 보험 가입 후 청구까지의 일수(구간)로 추정. 확인 필요. cat-ord
* PastNumberOfClaims : 보험자의 과거 청구 건수. cat-ord
* AgeOfVehicle : 차 나이. cat-ord
* AgeOfPolicyHolder : 연령. cat-ord
* PoliceReportFiled : 경찰 연락 여부. cat-bin
* WitnessPresent : 증인 존재 여부. bin.
* AgentType: bin.
* NumberOfSuppliments : ord.
* AddressChange_Claim : 주소 변경 후 청구까지의 기간으로 추정 (값 예: no change, 1 year). 확인 필요. cat.
* NumberOfCars : 가진 차의 수. ord.
* Year : 발생연도. 1994-1996.
* BasePolicy : 계약 유형. 모두 보장? 등등. cat. 

## Basic Skim

In [10]:
# Preview the first five rows of the working frame.
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [18]:
# Summary statistics; with default arguments only the numeric columns are included.
df.describe()

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,FraudFound_P,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0
mean,2.788586,2.693969,39.855707,0.059857,7710.5,8.483268,407.70428,2.487808,1994.866472
std,1.287585,1.259115,13.492377,0.23723,4451.514911,4.599948,43.950998,1.119453,0.803313
min,1.0,1.0,0.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,0.0,3855.75,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,0.0,7710.5,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,0.0,11565.25,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,1.0,15420.0,16.0,700.0,4.0,1996.0


In [19]:
# Count how often each distinct full-row combination of values occurs.
# NOTE(review): PolicyNumber looks unique per row (describe() shows 1..15420 over
# 15420 rows), so every count here is presumably 1 and the output is one line per
# row — consider per-column counts, e.g. df["Make"].value_counts(), instead.
df.value_counts()

Month  WeekOfMonth  DayOfWeek  Make     AccidentArea  DayOfWeekClaimed  MonthClaimed  WeekOfMonthClaimed  Sex     MaritalStatus  Age  Fault          PolicyType            VehicleCategory  VehiclePrice     FraudFound_P  PolicyNumber  RepNumber  Deductible  DriverRating  Days_Policy_Accident  Days_Policy_Claim  PastNumberOfClaims  AgeOfVehicle  AgeOfPolicyHolder  PoliceReportFiled  WitnessPresent  AgentType  NumberOfSuppliments  AddressChange_Claim  NumberOfCars  Year  BasePolicy
Apr    1            Friday     Honda    Rural         Monday            Apr           1                   Female  Married        33   Third Party    Sedan - Collision     Sedan            20000 to 29000   0             12276         16         400         3             more than 30          more than 30       1                   5 years       31 to 35           No                 No              External   none                 no change            1 vehicle     1996  Collision     1
May    1            Friday   