# Logistic Regression


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Sigmoid function

In [None]:
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s). Lists and tuples are accepted and converted to a
        float array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1]: a scalar for scalar input, otherwise an array
        with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) is always <= 1, so it cannot overflow the way exp(-x) does
    # for large negative x.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x)).  x < 0: the algebraically equal form
    # exp(x) / (1 + exp(x)), which only needs exp of a non-positive number.
    v = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and returns arrays
    # unchanged.
    return v[()]

In [None]:
# Integer inputs -10..10 inclusive (arange excludes the stop value 11).
x = np.arange(-10,11)
# Evaluate the sigmoid at every point of x.
y = sigmoid(x)

In [None]:
x

In [None]:
y

In [None]:
# Plot the sigmoid samples; label the axes so the figure is readable on its own.
plt.plot(x,y)
plt.xlabel('x')
plt.ylabel('sigmoid(x)')
plt.show()

In [None]:
df = pd.read_csv("insurance.csv")

In [None]:
df.head()

## Check data balance

In [None]:
df["insuranceclaim"].value_counts()

## Train test split

In [None]:
# Select the target by name (the same column inspected in the balance check)
# instead of by position, so the split does not depend on column order.
# NOTE(review): this matches the previous positional split only if
# "insuranceclaim" is the last column of the CSV — confirm.
# drop() returns a new DataFrame, so X is not a slice of df.
X = df.drop(columns="insuranceclaim")
y = df["insuranceclaim"]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified — consider stratify=y if the
# class counts checked above turn out to be imbalanced.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

## Baseline model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log = LogisticRegression()

In [None]:
log.fit(X_train,y_train)

In [None]:
log.score(X_test,y_test)

In [None]:
log.coef_

In [None]:
y_pred = log.predict(X_test)
y_pred

## Evaluation metrics

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# For binary labels the 2x2 matrix (rows = actual, columns = predicted,
# labels in sorted order) flattens to tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()

In [None]:
# Print the counts with the positive class first:
# line 1 = actual positives (tp, fn), line 2 = actual negatives (fp, tn).
print(tp, fn)
print(fp, tn)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Accuracy is the fraction of all predictions that are correct.
# The formula is
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy_score(y_test,y_pred)

In [None]:
# Precision is the fraction of predicted positive events
# that are actually positive.
# The formula is
# Precision = TP / (TP + FP)
precision_score(y_test,y_pred)

In [None]:
# Recall (sensitivity) is the fraction of actual positive
# events that are predicted correctly.
# The formula is
# Recall (Sensitivity) = TP / (TP + FN)
recall_score(y_test,y_pred)

In [None]:
# The F1 score is the harmonic mean of recall and precision;
# a higher score indicates a better model.
# The formula is
# F1 = 2 * (precision * recall) / (precision + recall)
f1_score(y_test,y_pred)

In [None]:
# ROC curves are very helpful for understanding the balance
# between the true-positive rate and the false-positive rate.
from sklearn.metrics import roc_curve

# roc_curve sweeps a decision threshold over continuous scores. Passing the
# hard 0/1 labels from predict() leaves only one real operating point, so
# use the predicted probability of the positive class instead.
# Column 1 of predict_proba is the second entry of log.classes_
# (assumes labels are 0/1, so that entry is class 1 — confirm).
y_score = log.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_score)

In [None]:
# fpr = the false positive rate (FP / (FP + TN)) for each threshold
fpr

In [None]:
# tpr = the true positive rate (TP / (TP + FN)) for each threshold
tpr

In [None]:
# threshold = decreasing cut-offs on the score passed to roc_curve;
# fpr[i] and tpr[i] are the rates for predictions with score >= threshold[i]
threshold

In [None]:
# Plot the ROC curve together with the random and perfect reference curves.
plt.plot(fpr, tpr, 'r-', label='logistic regression')
plt.plot([0,1],[0,1],'k-',label='random')
plt.plot([0,0,1,1],[0,1,1,1],'g-',label='perfect')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Without legend() the label= arguments above are never displayed.
plt.legend()
plt.show()

In [None]:
# Now we analyze the performance using AUC
# (area under the ROC curve).
from sklearn.metrics import roc_auc_score

# Score with the predicted probability of the positive class: AUC measures
# how well the model ranks positives above negatives, which the hard 0/1
# labels from predict() cannot express.
# Column 1 of predict_proba is the second entry of log.classes_
# (assumes labels are 0/1, so that entry is class 1 — confirm).
auc = roc_auc_score(y_test, log.predict_proba(X_test)[:, 1])
print(auc)

In [None]:
print(classification_report(y_test,y_pred))

## Reducing skewness

In [None]:
# Take the square root from the untouched df column and build a new X with
# assign(): re-running this cell then gives the same result instead of
# applying sqrt a second time, and no value is written in place into a frame
# that may be a slice of df.
X = X.assign(charges=np.sqrt(df["charges"]))

In [None]:
# Same test_size and random_state as the baseline split, so both models are
# evaluated on the same held-out rows; only the "charges" values differ.
# NOTE: this rebinds X_train/X_test/y_train/y_test from the baseline split.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [None]:
log2 = LogisticRegression()

In [None]:
log2.fit(X_train,y_train)

In [None]:
log2.score(X_test,y_test)

In [None]:
# NOTE: rebinds y_pred — from here on it holds log2's predictions, not the
# baseline model's.
y_pred = log2.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# For binary labels the 2x2 matrix (rows = actual, columns = predicted,
# labels in sorted order) flattens to tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()

In [None]:
# Print the counts with the positive class first:
# line 1 = actual positives (tp, fn), line 2 = actual negatives (fp, tn).
print(tp, fn)
print(fp, tn)