#  Machine Learning Assignment 1

## Question 1

#### Group Name : ML_GROUP113 Ban_2

#### RAJABOINA SIVARAJA   2018AB04505
#### POLEPEDDI L V SAINADHA RAKESH  2018AB04592
#### MANU JOY  2018AB04568
#### NIPUN PATHAK  2018AB04627

In [None]:
from  sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd
import numpy as np
import seaborn as sns

### Loading the data

In [None]:
# Read the dataset from the relative path 'diabetes.csv' into a DataFrame.
df = pd.read_csv('diabetes.csv')

In [None]:
# Show the first rows of the frame as a quick sanity check.
df.head()

In [None]:
# Summary statistics for the columns of the frame.
df.describe()

In [None]:
# Number of missing (null) entries in each column.
df.isnull().sum()

### There are no null values in the dataset. However, let's check whether any columns contain zeros.

In [None]:
# Count, column by column, how many entries are exactly zero.
zero_mask = df == 0
zero_mask.astype(int).sum(axis=0)

### Extracting the X and Y  values

In [None]:
# All columns except the last one are used as predictors; the target is the
# 'Outcome' column.
features = df.columns[:-1].tolist()
y = df['Outcome']
X = df[features]

### Visualizing the dataset

#### Plotting histograms grouped by Outcome

In [None]:
# Draw one set of per-column histograms for each distinct value of 'Outcome'.
df.groupby('Outcome').hist(figsize=(12, 12))

### Correlation Matrix

In [None]:
# Pairwise correlation between the columns; the bare name on the last line
# makes the notebook display the resulting table.
corr = df.corr()
corr

In [None]:
# Heatmap of the correlation matrix computed in the previous cell.
# The raw DataFrame was being plotted before, which produced a per-row colour
# grid of the data instead of the column-to-column correlations.
sns.heatmap(corr, annot=True)

In [None]:
# Bar chart of the number of rows in each 'Outcome' class.
sns.countplot(x='Outcome',data=df)

##### Columns other than Pregnancy and Outcome cannot legitimately be zero, so their zero values are treated as missing and replaced with the mean of the respective column.

In [None]:
# Treat zeros in every feature except the first one as missing values, then
# fill them with the column mean.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df[features[1:]] = df[features[1:]].replace(0, np.nan)
X = df[features]
y = df['Outcome']
# Work on a copy so that X keeps the NaN markers and only transform_X is filled.
transform_X = X.copy()
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test
# split, so the column means also see the rows later used for testing.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
transform_X[features[1:]] = imputer.fit_transform(transform_X[features[1:]].values)

In [None]:
# Count the missing values left in each column of the imputed feature frame.
transform_X.isnull().sum()

### Splitting the data into training and test sets

In [None]:
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(transform_X, y, test_size=0.2, random_state=0)

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled anywhere in this notebook; the
# default solver may stop at its iteration limit — check for ConvergenceWarning.
lr_model = LogisticRegression()

### Training and testing the data

In [None]:
# Fit the classifier on the training split.
lr_model.fit(X_train, y_train)
# Class probabilities for the training rows (plotted further down).
probs = lr_model.predict_proba(X_train)
# Hard class labels for the held-out test rows.
predict = lr_model.predict(X_test)
print('The result for prediction is: ', predict)

In [None]:
# Mean accuracy on the held-out test split.
# The third positional parameter of `score` is `sample_weight`, so passing the
# predictions there weighted every test row by its own predicted label (rows
# predicted as 0 were ignored). Pass only X and y to obtain plain accuracy.
score = lr_model.score(X_test, y_test)

In [None]:
# Report the score as a percentage.
print('The accuracy is ', score * 100)

### Invoking the 10 fold cross validation

In [None]:
# 10-fold cross-validation of the classifier on the training split only,
# scored by accuracy; `result` holds one score per fold.
kfold = KFold(n_splits=10)
result = cross_val_score(lr_model, X_train, y_train, cv=kfold, scoring='accuracy')

In [None]:
# Per-fold accuracies followed by their mean.
print('The result of 10 fold cross validation is: ',result)
print('The mean result of 10 fold cross validation is: ',result.mean())

### In logistic regression, the dependent variable is binary (dichotomous), i.e. it only contains data coded as 1 or 0.

#### The goal of logistic regression is to find the best fitting (yet biologically reasonable) model to describe the relationship between the dichotomous characteristic of interest (dependent variable = response or outcome variable) and a set of independent (predictor or explanatory) variables. Logistic regression generates the coefficients (and their standard errors and significance levels) of a formula to predict a logit transformation of the probability of presence of the characteristic of interest:

####  logit (p) = b0 + b1x1 + b2x2 + b3x3 + ....... + bkxk

#### where p is the probability of presence of the characteristic of interest. The logit transformation is defined as the logged odds:

#### Odds=p/(1-p) and Logit(p)=ln(p/(1-p))


#### Confusion matrix and accuracy

In [None]:
# Report test accuracy and draw the confusion matrix of the test predictions.
print('The accuracy is ', score * 100)
# The matrix below is built from y_test, so it is labelled as a test matrix
# (it was previously announced as "Train").
print("Test confusion matrix")
cf = confusion_matrix(y_test, predict)
# Rows are the true classes, columns the predicted classes.
con_df = pd.DataFrame(cf, index=['Actual:NO','Actual:YES'], columns=['Predicted:NO','Predicted:YES'])
# Cells are integer counts, so annotate them as integers rather than floats.
sns.heatmap(con_df, annot=True, fmt="d")

In [None]:
# Text report of per-class metrics for the test predictions.
print(classification_report(y_test,predict))

#### Coefficients for the features:

In [None]:
# Fitted coefficients, transposed for display.
lr_model.coef_.T

In [None]:
# One colour per training label: red for label 1, green for anything else.
color_for_label = {1: 'red'}
colors = [color_for_label.get(label, 'green') for label in y_train]

### Decision Boundary

In [None]:
# Scatter the second column of the predicted probabilities against the row
# position, coloured via `colors`, with a horizontal line at the 0.5 threshold.
plt.rcParams["figure.figsize"] = (10,10)
point_index = range(0, probs.shape[0])
positive_prob = probs[:, 1]
plt.scatter(x=point_index, y=positive_prob, color=colors)
plt.axhline(.5, color='black')
plt.title('# Decision boundary')
plt.xlabel('data point')
plt.ylabel('Probability')
# Manual legend entries, because the colours are passed as a plain list.
legend_handles = [
    mpatches.Patch(color='red', label='Diabetic'),
    mpatches.Patch(color='green', label='Not Diabetic'),
]
plt.legend(handles=legend_handles, loc='upper center', bbox_to_anchor=(0.5, -0.05), shadow=True, ncol=2)
plt.show()

#### Feature Selection using RFECV (Recursive Feature Elimination with Cross-Validation)

In [None]:
# Recursive feature elimination with cross-validation, removing one feature
# per step and scoring each subset by accuracy.
rfecv = RFECV(estimator=lr_model, step=1, cv=kfold, scoring='accuracy')
# Fit on the training split only: selecting features on the full dataset would
# let the test rows influence the selection and inflate the accuracy that is
# later measured on those same rows.
rfecv.fit(X_train, y_train)

In [None]:
# Plot the mean cross-validated score for each number of selected features,
# then pair every feature name with its RFECV keep/drop flag.
# `grid_scores_` was removed from RFECV in scikit-learn 1.2; the per-step mean
# scores are in `cv_results_['mean_test_score']`. Fall back to the old
# attribute so the cell still runs on releases that predate `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_
plt.figure()
plt.title('Logistic Regression CV score vs No of Features')
plt.xlabel("Number of features selected")
# The scorer is 'accuracy' (a fraction), not a count of correct classifications.
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()
feature_importance = list(zip(features, rfecv.support_))
print('Important features are:',feature_importance)

In [None]:
# Keep the names of the features whose RFECV flag is truthy.
imp_features = [name for name, selected in feature_importance if selected]
print('Features selected are', imp_features)

In [None]:
# Restrict both splits to the selected feature columns.
X_fe_train, X_fe_test = X_train[imp_features], X_test[imp_features]
# Refit the same estimator on the reduced training set.
lr_model.fit(X_fe_train, y_train)
# Training-row probabilities (plotted further down) and test-row labels.
probs = lr_model.predict_proba(X_fe_train)
predict = lr_model.predict(X_fe_test)

In [None]:
# Show the predicted labels for the test rows.
print('The result after prediction is: ',predict)

In [None]:
# Mean accuracy of the refitted model on the reduced test split.
# The third positional parameter of `score` is `sample_weight`; passing the
# predictions there weighted each test row by its predicted label, so rows
# predicted as 0 were ignored. Pass only X and y to obtain plain accuracy.
score = lr_model.score(X_fe_test, y_test)
print('The accuracy after feature engineering is ', score * 100)

In [None]:
# Repeat the cross-validation on the training split restricted to the selected features.
result = cross_val_score(lr_model, X_fe_train, y_train, cv=kfold, scoring='accuracy')

In [None]:
# Per-fold accuracies followed by their mean.
print('The results after feature engineering: ',result)
print('The result mean after feature engineering: ',result.mean())

#### Confusion Matrix and accuracy

In [None]:
# Report test accuracy and draw the confusion matrix of the test predictions.
print('The accuracy is ', score * 100)
# The matrix below is built from y_test, so it is labelled as a test matrix
# (it was previously announced as "Train").
print("Test confusion matrix")
cf = confusion_matrix(y_test, predict)
# Rows are the true classes, columns the predicted classes.
con_df = pd.DataFrame(cf, index=['Actual:NO','Actual:YES'], columns=['Predicted:NO','Predicted:YES'])
# Cells are integer counts, so annotate them as integers rather than floats.
sns.heatmap(con_df, annot=True, fmt="d")

In [None]:
# Text report of per-class metrics for the test predictions.
print(classification_report(y_test,predict))

### Decision Boundary

In [None]:
# Scatter the second column of the predicted probabilities against the row
# position, coloured via `colors`, with a horizontal line at the 0.5 threshold.
plt.rcParams["figure.figsize"] = (10,10)
row_positions = range(0, probs.shape[0])
prob_class_one = probs[:, 1]
plt.scatter(x=row_positions, y=prob_class_one, color=colors)
plt.axhline(.5, color='black')
plt.title('# Decision boundary')
plt.xlabel('data point')
plt.ylabel('Probability')
# Manual legend entries, because the colours are passed as a plain list.
legend_entries = [
    mpatches.Patch(color='red', label='Diabetic'),
    mpatches.Patch(color='green', label='Not Diabetic'),
]
plt.legend(handles=legend_entries, loc='upper center', bbox_to_anchor=(0.5, -0.05), shadow=True, ncol=2)
plt.show()