# Regression Assumptions & Logistic Regression Notebook

1. Outlier Detection and Removal
2. Multicollinearity 
3. Scaling 
4. Logistic Regression

### Import the libraries you need!


In [None]:
# Core numerical / data-frame / plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Lift pandas' display limits so frames are shown without column/row truncation.
pd.set_option("display.max_columns",None) 
pd.set_option("display.max_rows",None) 

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting/ML
# libraries — consider narrowing the filter.
warnings.filterwarnings("ignore")

from IPython.display import Image
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import LocalOutlierFactor
# Global seaborn theme and font scaling applied to the figures below.
sns.set(style="darkgrid", palette="pastel", color_codes=True)
sns.set_context('talk')

from pathlib import Path
# Presumably a folder of images (judging by the path); no cell shown here reads it.
data_dir = Path('../input/images')

### Read the dataset

In [None]:
# Load the heart-disease table from disk and preview its first rows.
heart_csv = "../input/heart-disease-uci/heart.csv"
df_1 = pd.read_csv(heart_csv)
df_1.head()

#### 1. Show the scatter plot between chol and target

In [None]:
# Raw relationship between cholesterol and the target column.
# Axis labels make the figure readable on its own; the trailing `;`
# suppresses the PathCollection repr.
plt.scatter(df_1.chol, df_1.target)
plt.xlabel("chol")
plt.ylabel("target");

<h1>I. Outlier Removal</h1>

This section is adapted from Suraj RP's excellent notebook on various outlier detection techniques. Visit his full notebook here: https://www.kaggle.com/rpsuraj/outlier-detection-techniques-simplified

## Exploratory Data Analysis

### Painting a picture of the data!

#### 0. What columns do we have in this dataset?

In [None]:
# Column labels of the loaded frame.
df_1.columns

#### 1. How many null values do we have in each column?

In [None]:
# Per-column count of missing entries (`isna` is the pandas alias of `isnull`).
df_1.isna().sum()


#### 2. What is the distribution in each column?

In [None]:
# One histogram per column on a 20x20 inch canvas; `;` hides the axes-array repr.
df_1.hist(figsize=(20,20));

#### 3. Check out the statistical summary of the dataframe.

In [None]:
# Summary statistics for each column of the frame.
df_1.describe()

#### Let's consider the serum cholesterol (mg/dl) column, i.e. "chol", for our analysis.

#### 4. Plot a simple box plot to visualize the outliers.

In [None]:
# Vertical box plot of cholesterol; points beyond the whiskers are the outlier candidates.
sns.boxplot(y=df_1["chol"]);

#### From the above box plot, we can surely observe that there are outliers in it!

## Back to Preprocessing...

#### 1. Find the value at the 25th percentile. 

In [None]:
# First quartile (25th percentile) of cholesterol.
twenty_five = np.percentile(df_1["chol"], q=25)
print(twenty_five)

#### 2. Find the value at the 75th percentile.

In [None]:
# Third quartile (75th percentile) of cholesterol.
seventy_five = np.percentile(df_1["chol"], q=75)
print(seventy_five)

#### 3. Find the IQR. 

In [None]:
# Inter-quartile range: distance between the 75th and 25th percentiles.
q3, q1 = np.percentile(df_1["chol"], [75, 25])
iqr = q3 - q1
print(iqr)

#### 4. Calculate the cut-off range.

In [None]:
# Fence distance: 1.5 times the IQR, measured outward from each quartile.
cut_off = iqr * 1.5
print(cut_off)

#### 5. Calculate the upper bound value using the cut-off range and the 75th percentile value.

In [None]:
# Upper fence: values above this are treated as outliers.
upper=seventy_five+cut_off
print(upper)

#### 6. Calculate the lower bound value using the cut-off range and the 25th percentile value.

In [None]:
# Lower fence: values below this are treated as outliers.
lower=twenty_five-cut_off
print(lower)

#### 7. What is the total number of outliers?

In [None]:
# A record is an outlier when it falls outside the fences on EITHER side.
# Summing the boolean mask yields one total instead of per-column non-null counts.
is_outlier = (df_1["chol"] < lower) | (df_1["chol"] > upper)
int(is_outlier.sum())

### Visual representation:

In [None]:
# Histogram of cholesterol with the outlier zones shaded in red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.distplot(df_1.chol, kde=False, ax=ax)

# Remember the view limits chosen for the histogram so that shading does not
# rescale the x-axis.
left, right = ax.get_xlim()

# The outlier zones are everything LEFT of the lower fence and RIGHT of the
# upper fence (not the gap between a fence and the data extreme, which lies
# inside the accepted range when the fence is beyond the data).
ax.axvspan(xmin=min(left, lower), xmax=lower, alpha=0.2, color='red')
ax.axvspan(xmin=upper, xmax=max(right, upper), alpha=0.2, color='red')
ax.set_xlim(left, right);

#### Here the red zone represents the outlier zone! The records present in that zone are considered as outliers

### Remedial Measure:
#### 8. Remove the records which are above the upper bound value and records below the lower bound value!

In [None]:
# Keep rows whose cholesterol lies within the fences. Bounds are inclusive:
# only values strictly above `upper` or strictly below `lower` are outliers,
# so a value sitting exactly on a fence must be retained.
# `.copy()` detaches the result from df_1 so later column assignments on
# df_new neither warn nor write through to the original frame.
df_new = df_1[df_1["chol"].between(lower, upper)].copy()
df_new.head()


# II. Multicollinearity 

### Back to Exploratory Data Analysis

#### 1. Find the correlation coefficients matrix.

In [None]:
# Pairwise Pearson correlation between COLUMNS (features).
# np.corrcoef(df_new) uses rowvar=True by default, i.e. it treats every row
# (patient) as a variable and returns an n_rows x n_rows matrix — not the
# feature-correlation matrix needed to inspect multicollinearity.
# DataFrame.corr() correlates columns and keeps the column labels.
corrCoe = df_new.corr()
assert corrCoe.shape == (df_new.shape[1], df_new.shape[1]), "expected one row/column per feature"
corrCoe

#### 2. Plot the correlation matrix as a heatmap.

In [None]:
# Draw the correlation matrix as a colour-coded grid on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(corrCoe, ax=ax);

### How to deal with multicollinearity?

1. Using Principal Component Analysis to select the most important variables
2. Remove one of the columns that contribute to multicollinearity
3. Create interaction variables from the correlated variables

# III. Scaling

#### 1. Display the minimum and maximum of each column.

In [None]:
# Smallest value in each column of the outlier-filtered frame.
df_new.min()

In [None]:
# Largest value in each column of the outlier-filtered frame.
df_new.max()

#### 2. Find the max of 'chol'

In [None]:
# Upper end of the cholesterol range after outlier removal.
df_new["chol"].max()

#### 3. Find the min of 'chol'

In [None]:
# Lower end of the cholesterol range after outlier removal.
df_new["chol"].min()

#### 4. Perform min-max scaling on 'chol'

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale cholesterol linearly using its min and max. scikit-learn expects
# 2-D input, so the column is passed as a one-column frame.
chol_frame = df_new[["chol"]]
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(chol_frame)
minmax_scaled


#### 5. Find the mean of 'age'

In [None]:
# Arithmetic mean of age in the outlier-filtered frame.
df_new["age"].mean()

#### 6. Find the standard deviation of 'age'

In [None]:
 # Standard deviation of age (pandas defaults to the sample estimate, ddof=1).
 df_new["age"].std()

#### 7. Verify the mean and standard deviation by showing the distribution of 'age'.

In [None]:
# Histogram of age, for eyeballing the centre and spread computed above.
df_new["age"].hist();

#### 8. Perform Z-standardization on 'age'.

In [None]:
from sklearn.preprocessing import StandardScaler

# Z-standardise age: subtract the mean, divide by the standard deviation.
# StandardScaler divides by the population std (ddof=0), so values differ
# marginally from a manual computation with pandas' default .std().
age_frame = df_new[["age"]]
standard_scaler = StandardScaler()
standard_scaled = standard_scaler.fit_transform(age_frame)
standard_scaled

# IV. Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
import scipy

#### 1. Create a logistic regression model

In [None]:
# Unfitted logistic-regression estimator; random_state is pinned to 0.
model= LogisticRegression(random_state=0)

#### 2. Set X to be the cholesterol column and Y to be the target column.

In [None]:
# Single predictor (cholesterol) and the label column, both taken from df_1.
x, y = df_1["chol"], df_1["target"]

#### 3. Perform a train-test split, holding out 20% of the data for testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split

#### 4. Fit the model to the train data.

In [None]:
# scikit-learn needs a 2-D feature matrix, so promote the Series to a one-column frame.
X_train = x_train.to_frame()
model.fit(X_train, y_train)

In [None]:
# Fitted intercept and coefficient(s) of the logistic model.
print(model.intercept_, model.coef_)

In [None]:
def plot_log_reg(x, y, data, clf, xmin=None, xmax=None, alpha=1, ax=None):
    """Scatter ``data[y]`` against ``data[x]`` and overlay the fitted logistic curve.

    Parameters
    ----------
    x, y : str
        Labels used to index ``data`` for the horizontal and vertical axes;
        also used as the axis labels.
    data : mapping-like, indexable by ``x`` and ``y`` (e.g. a DataFrame)
        Source of the scattered points.
    clf : fitted single-feature linear classifier
        Must expose ``coef_`` and ``intercept_``.
    xmin, xmax : float, optional
        Range over which the curve is drawn. Each defaults to the
        corresponding extreme of ``data[x]``.
    alpha : float, default 1
        Opacity of the scattered points.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new 10x10 inch figure is created.

    Returns
    -------
    (fig, ax)
        The figure and axes that were drawn on.
    """
    # Local import: a bare `import scipy` does not guarantee that the
    # `scipy.special` submodule has been loaded.
    from scipy.special import expit

    if ax is None:
        # Size only the figure we create instead of mutating global rcParams.
        fig, ax = plt.subplots(figsize=(10, 10))
    else:
        fig = ax.figure

    ax.scatter(data[x], data[y], color='black', zorder=20, alpha=alpha)

    # `x` is a column label, so the default limits come from the column itself.
    if xmin is None:
        xmin = data[x].min()
    if xmax is None:
        xmax = data[x].max()
    grid = np.linspace(xmin, xmax, 300)

    # Sigmoid of the linear predictor = modelled probability of the positive class.
    prob = expit(grid * clf.coef_ + clf.intercept_).ravel()
    ax.plot(grid, prob, linewidth=3)

    ax.set_xlabel(x)
    ax.set_ylabel(y)
    fig.tight_layout()
    # Despine the figure we drew on, not whichever figure happens to be current.
    sns.despine(fig=fig)
    return fig, ax

In [None]:
# Fitted curve over the points, drawn for cholesterol values between 100 and 350.
plot_log_reg(
    x='chol',
    y='target',
    data=df_1[['chol','target']],
    clf=model,
    xmin=100,
    xmax=350,
);
