In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
path = '/Users/raisaurabh04/OneDrive/GreyAtom/Practice Dataset/credit_card_dataset.zip'

data = pd.read_csv(path)

X = data.iloc[ : , : -1]

y = data.iloc[ : , -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0, stratify=y)

model = LogisticRegression(random_state=0)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [4]:
def metric_data(y_true, y_pred):
    '''Method takes expected and actual output as parameters and provides all the relevant classification metrices'''
    #accuracy_score,classification_report,f1_score,confusion_matrix
    acc_score = accuracy_score(y_true, y_pred)
    confusion_mat = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = confusion_mat.ravel()
    f1 = f1_score(y_true, y_pred)
    prec_score = precision_score(y_true, y_pred)
    rec_score = recall_score(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_pred)
    classifn_report = classification_report(y_true, y_pred)
    return acc_score, prec_score, rec_score, f1, confusion_mat

In [5]:
accuracy, precision, recall, f1, confusion_mat = metric_data(y_test, y_pred)

accuracy, precision, recall, f1, confusion_mat

(0.9993270327998361,
 0.8913043478260869,
 0.6949152542372882,
 0.780952380952381,
 array([[34113,     5],
        [   18,    41]]))

In [6]:
rus = RandomUnderSampler(random_state=0)

X_sample2, y_sample2 = rus.fit_sample(X_train, y_train)

rus_model = LogisticRegression(random_state=0)

rus_model.fit(X_sample2, y_sample2)

y_pred = rus_model.predict(X_test)

accuracy, precision, recall, f1, confusion_mat = metric_data(y_test, y_pred)

accuracy, precision, recall, f1, confusion_mat



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

(0.9639816250694911,
 0.0421875,
 0.9152542372881356,
 0.08065720687079911,
 array([[32892,  1226],
        [    5,    54]]))

In [7]:
cc = ClusterCentroids(random_state=0)

X_sample3, y_sample3 = cc.fit_sample(X_train, y_train)

model_cc = LogisticRegression(random_state=0)

model_cc.fit(X_sample3, y_sample3)

y_pred = model_cc.predict(X_test)

accuracy, precision, recall, f1, confusion_mat = metric_data(y_test, y_pred)

accuracy, precision, recall, f1, confusion_mat



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

(0.894870819557012,
 0.015625,
 0.9661016949152542,
 0.03075263015915835,
 array([[30527,  3591],
        [    2,    57]]))

In [8]:
tl = TomekLinks(random_state=0)

X_sample4, y_sample4 = tl.fit_sample(X_train, y_train)

model_tl = LogisticRegression(random_state=0)

model_tl.fit(X_sample4, y_sample4)

y_pred = model_tl.predict(X_test)

accuracy_tl, precision_tl, recall_tl, f1_tl, confusion_mat_tl = metric_data(y_test, y_pred)

accuracy_tl, precision_tl, recall_tl, f1_tl, confusion_mat_tl



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

(0.9994440705737777,
 0.8846153846153846,
 0.7796610169491526,
 0.8288288288288288,
 array([[34112,     6],
        [   13,    46]]))

In [9]:
ros = RandomOverSampler(random_state=0)

X_sample5, y_sample5 = ros.fit_sample(X_train, y_train)

model_ros = LogisticRegression(random_state=0)

model_ros.fit(X_sample5, y_sample5)

y_pred = model_ros.predict(X_test)

accuracy_ros, precision_ros, recall_ros, f1_ros, confusion_mat_ros = metric_data(y_test, y_pred)

accuracy_ros, precision_ros, recall_ros, f1_ros, confusion_mat_ros



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

(0.9688094332445797,
 0.0484304932735426,
 0.9152542372881356,
 0.0919931856899489,
 array([[33057,  1061],
        [    5,    54]]))

In [10]:
smote = SMOTE(random_state=0)

X_sample6, y_sample6 = smote.fit_sample(X_train, y_train)

model_smote = LogisticRegression(random_state=0)

model_smote.fit(X_sample6, y_sample6)

y_pred = model_smote.predict(X_test)

accuracy_smote, precision_smote, recall_smote, f1_smote, confusion_mat_smote = metric_data(y_test, y_pred)

accuracy_smote, precision_smote, recall_smote, f1_smote, confusion_mat_smote



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

(0.9888814114755538,
 0.12412177985948478,
 0.8983050847457628,
 0.21810699588477367,
 array([[33744,   374],
        [    6,    53]]))



# Assessment

- Load dataset using pandas read_csv api in variable df and give file path as path.

- Display first 5 columns of dataframe df.

- Print df.info

- Remove the $ and , from columns 'INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT' which are there in the dataframe

- Store all the features(independent values) in a variable called X

- Store the target variable CLAIM_FLAG (dependent value) in a variable called y

- Calculate the value counts of target variable and store it in variable count

- Split the dataframe into X_train,X_test,y_train,y_test using train_test_split() function. Use test_size = 0.3 and random_state = 6

In [11]:
path = '/Users/raisaurabh04/OneDrive/GreyAtom/Practice Dataset/car_insurance_dataset_smote.csv'

In [12]:
df = pd.read_csv(path)

df.head()

df.info()

df[['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT']] = df[['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT']].applymap(lambda x : str(x).replace('$', ''))

df[['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT']] = df[['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT']].applymap(lambda x : str(x).replace(',', ''))

df.head()

X = df.iloc[ : , : -1]

y = df.iloc[ : , -1]

count = y.value_counts()
count

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6) 

Unnamed: 0,ID,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,...,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG
0,63581743,0,60.0,0,11.0,"$67,349",No,$0,z_No,M,...,11,Minivan,yes,"$4,461",2,No,3,$0,18.0,0
1,132761049,0,43.0,0,11.0,"$91,449",No,"$257,252",z_No,M,...,1,Minivan,yes,$0,0,No,0,$0,1.0,0
2,921317019,0,48.0,0,11.0,"$52,881",No,$0,z_No,M,...,1,Van,yes,$0,0,No,2,$0,10.0,0
3,727598473,0,35.0,1,10.0,"$16,039",No,"$124,191",Yes,z_F,...,4,z_SUV,no,"$38,690",2,No,3,$0,10.0,0
4,450221861,0,51.0,0,14.0,,No,"$306,251",Yes,M,...,7,Minivan,yes,$0,0,No,0,$0,6.0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10302 entries, 0 to 10301
Data columns (total 25 columns):
ID            10302 non-null int64
KIDSDRIV      10302 non-null int64
AGE           10295 non-null float64
HOMEKIDS      10302 non-null int64
YOJ           9754 non-null float64
INCOME        9732 non-null object
PARENT1       10302 non-null object
HOME_VAL      9727 non-null object
MSTATUS       10302 non-null object
GENDER        10302 non-null object
EDUCATION     10302 non-null object
OCCUPATION    9637 non-null object
TRAVTIME      10302 non-null int64
CAR_USE       10302 non-null object
BLUEBOOK      10302 non-null object
TIF           10302 non-null int64
CAR_TYPE      10302 non-null object
RED_CAR       10302 non-null object
OLDCLAIM      10302 non-null object
CLM_FREQ      10302 non-null int64
REVOKED       10302 non-null object
MVR_PTS       10302 non-null int64
CLM_AMT       10302 non-null object
CAR_AGE       9663 non-null float64
CLAIM_FLAG    10302 non-null int64
d

Unnamed: 0,ID,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,...,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG
0,63581743,0,60.0,0,11.0,67349.0,No,0,z_No,M,...,11,Minivan,yes,4461,2,No,3,0,18.0,0
1,132761049,0,43.0,0,11.0,91449.0,No,257252,z_No,M,...,1,Minivan,yes,0,0,No,0,0,1.0,0
2,921317019,0,48.0,0,11.0,52881.0,No,0,z_No,M,...,1,Van,yes,0,0,No,2,0,10.0,0
3,727598473,0,35.0,1,10.0,16039.0,No,124191,Yes,z_F,...,4,z_SUV,no,38690,2,No,3,0,10.0,0
4,450221861,0,51.0,0,14.0,,No,306251,Yes,M,...,7,Minivan,yes,0,0,No,0,0,6.0,0


0    7556
1    2746
Name: CLAIM_FLAG, dtype: int64





- Convert the 'INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT' to floating type in X_train.
- Convert the 'INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT' to floating type in X_test.
- Check the null value for X_train
- Check the null value for X_test







In [13]:
num_obj_columns = ['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT']

for col in num_obj_columns:
    X_train[col] = X_train[col].astype('float')
    X_test[col] = X_test[col].astype('float')

X_train.info()

X_test.info()

X_train.isna().sum()

X_test.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7211 entries, 7660 to 2761
Data columns (total 24 columns):
ID            7211 non-null int64
KIDSDRIV      7211 non-null int64
AGE           7207 non-null float64
HOMEKIDS      7211 non-null int64
YOJ           6820 non-null float64
INCOME        6819 non-null float64
PARENT1       7211 non-null object
HOME_VAL      6812 non-null float64
MSTATUS       7211 non-null object
GENDER        7211 non-null object
EDUCATION     7211 non-null object
OCCUPATION    6746 non-null object
TRAVTIME      7211 non-null int64
CAR_USE       7211 non-null object
BLUEBOOK      7211 non-null float64
TIF           7211 non-null int64
CAR_TYPE      7211 non-null object
RED_CAR       7211 non-null object
OLDCLAIM      7211 non-null float64
CLM_FREQ      7211 non-null int64
REVOKED       7211 non-null object
MVR_PTS       7211 non-null int64
CLM_AMT       7211 non-null float64
CAR_AGE       6773 non-null float64
dtypes: float64(8), int64(7), object(9)
memory us

ID              0
KIDSDRIV        0
AGE             4
HOMEKIDS        0
YOJ           391
INCOME        392
PARENT1         0
HOME_VAL      399
MSTATUS         0
GENDER          0
EDUCATION       0
OCCUPATION    465
TRAVTIME        0
CAR_USE         0
BLUEBOOK        0
TIF             0
CAR_TYPE        0
RED_CAR         0
OLDCLAIM        0
CLM_FREQ        0
REVOKED         0
MVR_PTS         0
CLM_AMT         0
CAR_AGE       438
dtype: int64

ID              0
KIDSDRIV        0
AGE             3
HOMEKIDS        0
YOJ           157
INCOME        178
PARENT1         0
HOME_VAL      176
MSTATUS         0
GENDER          0
EDUCATION       0
OCCUPATION    200
TRAVTIME        0
CAR_USE         0
BLUEBOOK        0
TIF             0
CAR_TYPE        0
RED_CAR         0
OLDCLAIM        0
CLM_FREQ        0
REVOKED         0
MVR_PTS         0
CLM_AMT         0
CAR_AGE       201
dtype: int64

- Drop the rows from columns ['YOJ','OCCUPATION'] which contains the NaN values from X_train
- Drop the rows from columns ['YOJ','OCCUPATION'] which contains the NaN values from X_test
- Update the index of y_train with y_train[X_train.index] and store it in variable y_train
- Update the index of y_test with y_test[X_test.index] and store it in variable y_test
- For X_train , fill the missing values for columns AGE,CAR_AGE,INCOME and HOME_VAL with mean on X_train and use inplace = True.
- For X_test fill the missing values for columns AGE,CAR_AGE,INCOME and HOME_VAL with mean on X_train and use inplace = True



In [14]:
X_train.dropna(subset=['YOJ','OCCUPATION'], inplace=True)

X_test.dropna(subset=['YOJ','OCCUPATION'], inplace=True)

y_train = y_train[X_train.index]

y_test = y_test[X_test.index]

X_train.fillna(X_train.mean(), inplace=True)

X_test.fillna(X_test.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [15]:
columns = X_train.select_dtypes(exclude=np.number).columns

le = LabelEncoder()

for col in columns:
    X_train[col] = le.fit_transform(X_train[[col]])
    X_test[col] = le.transform(X_test[[col]])

X_train.head()
X_test.head()

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,ID,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,...,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE
7660,915078478,0,46.0,0,12.0,49657.0,0,139287.0,0,1,...,13420.0,1,5,0,12775.0,1,0,4,0.0,7.961262
3581,144269185,0,56.0,0,14.0,58247.0,0,190172.0,0,0,...,18150.0,1,4,0,1380.0,3,0,1,4281.0,10.0
5174,631762912,0,50.0,0,12.0,89157.0,0,257091.0,0,1,...,19700.0,1,5,0,3175.0,1,0,0,0.0,7.0
1012,509427828,1,42.0,2,9.0,44661.0,0,149448.641934,0,1,...,25750.0,1,5,0,7637.0,2,0,3,3753.0,9.0
7546,243463686,0,53.0,0,0.0,0.0,0,149448.641934,0,0,...,21470.0,6,0,0,4372.0,3,0,2,1441.0,5.0


Unnamed: 0,ID,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,...,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE
8307,340661905,0,46.0,0,7.0,4034.0,0,149987.712355,0,1,...,6500.0,13,5,0,0.0,0,0,0,0.0,7.0
683,318742437,0,56.0,0,15.0,134166.0,0,324905.0,0,1,...,20520.0,6,0,0,0.0,0,0,0,0.0,12.0
5086,51278489,0,58.0,0,13.0,88490.0,0,0.0,1,1,...,10350.0,11,5,0,2737.0,2,0,0,1036.0,10.0
1885,335071025,1,44.0,1,7.0,7113.0,0,0.0,0,1,...,9850.0,10,5,0,0.0,0,1,0,0.0,5.0
376,727995019,0,40.0,1,12.0,39600.0,1,193495.0,1,1,...,5460.0,1,3,0,4188.0,2,0,5,3259.0,1.0





- Instantiate a logistic regression model with LogisticRegression(), use Random_state = 6 and save it to a variable called 'model'.

- Fit the model on the training data X_train and y_train.

- Make predictions on the X_test features and save the results in a variable called 'y_pred'.

- Calculate the accuracy_score and store it in variable score

- Calculate the precision_score and store it in variable precision

In [16]:
model = LogisticRegression(random_state=6)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

score = accuracy_score(y_test, y_pred)
score
precision = precision_score(y_test, y_pred)
precision



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=6, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

0.7427113702623906

0.8571428571428571



- Instantiate a SMOTE with SMOTE(random state = 9) and save it to a variable called 'smote'.
- fit the sample on X_train , y_train and store it in variable X_train and y_train.
- Instantiate a StandardScaler with StandardScaler() and save it to a variable called 'scaler'.
- Fit and transform it on X_trainand store it in variable X_train.
- Transform it on X_test and store it in variable X_test

In [17]:
smote = SMOTE(random_state=9)

X_train, y_train = smote.fit_sample(X_train, y_train)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [18]:
model = LogisticRegression(random_state=6)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

score = accuracy_score(y_test, y_pred)
score
precision = precision_score(y_test, y_pred)
precision



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=6, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

0.9897959183673469

1.0