## 1.Import Necessary Libraires

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score,roc_curve,confusion_matrix
import warnings
warnings.filterwarnings('ignore')

## 2.Import Data

In [2]:
claimants = pd.read_csv('claimants.csv')
claimants.head()

Unnamed: 0,CASENUM,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,5,0,0.0,1.0,0.0,50.0,34.94
1,3,1,1.0,0.0,0.0,18.0,0.891
2,66,1,0.0,1.0,0.0,5.0,0.33
3,70,0,0.0,1.0,1.0,31.0,0.037
4,96,1,0.0,1.0,0.0,30.0,0.038


## 3.Data Understanding

In [3]:
claimants.shape

(1340, 7)

In [4]:
claimants.isna().sum()

CASENUM       0
ATTORNEY      0
CLMSEX       12
CLMINSUR     41
SEATBELT     48
CLMAGE      189
LOSS          0
dtype: int64

In [5]:
claimants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CASENUM   1340 non-null   int64  
 1   ATTORNEY  1340 non-null   int64  
 2   CLMSEX    1328 non-null   float64
 3   CLMINSUR  1299 non-null   float64
 4   SEATBELT  1292 non-null   float64
 5   CLMAGE    1151 non-null   float64
 6   LOSS      1340 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 73.4 KB


In [6]:
claimants.dtypes

CASENUM       int64
ATTORNEY      int64
CLMSEX      float64
CLMINSUR    float64
SEATBELT    float64
CLMAGE      float64
LOSS        float64
dtype: object

In [7]:
claimants.describe()

Unnamed: 0,CASENUM,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
count,1340.0,1340.0,1328.0,1299.0,1292.0,1151.0,1340.0
mean,11202.001493,0.488806,0.558735,0.907621,0.017028,28.414422,3.806307
std,9512.750796,0.500061,0.496725,0.289671,0.129425,20.304451,10.636903
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4177.0,0.0,0.0,1.0,0.0,9.0,0.4
50%,8756.5,0.0,1.0,1.0,0.0,30.0,1.0695
75%,15702.5,1.0,1.0,1.0,0.0,43.0,3.7815
max,34153.0,1.0,1.0,1.0,1.0,95.0,173.604


## 4.Data Preparation

In [8]:
del claimants['CASENUM']

In [9]:
claimants.head()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.94
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.33
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038


In [10]:
claimants.dropna(axis=0,inplace=True)

In [11]:
claimants.isna().sum()

ATTORNEY    0
CLMSEX      0
CLMINSUR    0
SEATBELT    0
CLMAGE      0
LOSS        0
dtype: int64

In [12]:
claimants.shape

(1096, 6)

In [13]:
claimants.head()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.94
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.33
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038


## 5.Model Building
- Two steps to approach
- 1.Separate Input and output
- 2.Model validation Techniques:
     - train Test Validation
     - KFOLD - CV
     - LOOCV

In [18]:
X = claimants.drop('ATTORNEY',axis=1)
y = claimants[['ATTORNEY']]

In [19]:
X_train,x_test,y_train,y_test =train_test_split(X,y,train_size=0.20,random_state=123)

In [20]:
X_train.shape,y_train.shape

((219, 5), (219, 1))

In [21]:
x_test.shape,y_test.shape

((877, 5), (877, 1))

## 6.Model Training

- If Attorney = 1, claimant is going to come with an Attorney
- If Attorney = 0,claminant is not going to come with an Attorney

In [22]:
logistic_model = LogisticRegression()

In [23]:
logistic_model=logistic_model.fit(X_train,y_train)

## 7.Model Testing | 8.Model Evaluation
### Train Data

In [27]:
y_pred_train=logistic_model.predict(X_train)

In [29]:
print(confusion_matrix(y_train,y_pred_train))

[[92 34]
 [29 64]]


In [30]:
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.76      0.73      0.74       126
           1       0.65      0.69      0.67        93

    accuracy                           0.71       219
   macro avg       0.71      0.71      0.71       219
weighted avg       0.71      0.71      0.71       219



### Test Data

In [31]:
y_pred_test =logistic_model.predict(x_test)

In [32]:
print(confusion_matrix(y_test,y_pred_test))

[[337 115]
 [152 273]]


In [33]:
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.69      0.75      0.72       452
           1       0.70      0.64      0.67       425

    accuracy                           0.70       877
   macro avg       0.70      0.69      0.69       877
weighted avg       0.70      0.70      0.69       877



## 9.Model Deployment

In [36]:
 from pickle import dump

In [37]:
dump(logistic_model,open('file','wb'))

In [38]:
from pickle import load

In [40]:
loader=load(open('file','rb'))

In [42]:
y_pred =loader.predict(x_test)

## The End.