#Important Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# Read Data

In [None]:
# Load the dataset from 'diabetes.csv', resolved relative to the notebook's
# working directory.
df = pd.read_csv('diabetes.csv')

In [None]:
# Show the loaded frame as the cell output.
df

#Explore Data in Depth

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Non-null count per column.
df.count()

In [None]:
# NaN count per column.
df.isnull().sum()

In [None]:
# First rows (pandas default of 5).
df.head()

In [None]:
# First 10 rows.
df.head(10)

In [None]:
# First values of the BMI column only.
df['BMI'].head()

In [None]:
# Last rows (pandas default of 5).
df.tail()

In [None]:
# Last 10 rows.
df.tail(10)

In [None]:
# One randomly drawn row; no seed is set, so the row differs between runs.
df.sample()

In [None]:
# Five randomly drawn rows (also unseeded).
df.sample(5)

In [None]:
# Positional slice: rows 10 through 19.
df[10:20]

In [None]:
# Index, dtypes, non-null counts and memory usage in one summary.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Same statistics transposed: one row per column.
df.describe().T

In [None]:
# Frequency of each distinct Pregnancies value.
df['Pregnancies'].value_counts()

In [None]:
# Number of rows per Outcome value.
df.groupby('Outcome').size()

#Visualization of Data

In [None]:
# One histogram per column, drawn on a 20x20-inch figure.
df.hist(figsize = (20,20))

In [None]:
# Pairwise plots of all columns, coloured by Outcome.
sn.pairplot(df, hue = 'Outcome')

In [None]:
# Pairwise correlation matrix (pandas default method).
df.corr()

In [None]:
# Heatmap of the same correlation matrix.
sn.heatmap(df.corr())

In [None]:
# Row count per Outcome value.
sn.countplot(x='Outcome',data=df)
plt.show()

In [None]:
# Box plot of BMI on the raw (uncleaned) data.
sn.boxplot(x='BMI',data=df)
plt.show()

In [None]:
# Age plotted against BMI.
sn.scatterplot(x='BMI', y='Age',data=df)
plt.show()

In [None]:
# Distribution of Pregnancies: histogram with a KDE overlay.
# sn.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot(kde=True) is its documented replacement. stat='density' keeps the
# density-normalised y-axis that distplot produced.
sn.histplot(df['Pregnancies'], kde=True, stat='density')
plt.show()

In [None]:
# Joint plot of BMI (x) against Pregnancies (y).
sn.jointplot(x='BMI',y='Pregnancies',data=df)

In [None]:
# Kernel density estimate of the BMI column.
sn.kdeplot(df['BMI'])

#Data Cleaning

In [None]:
# Summary statistics again; the `min` row is what the text below examines
# for implausible zeros.
df.describe()

Can the minimum value of the columns listed below be zero (0)?

In these columns a value of zero does not make sense and therefore indicates a missing value.

The following columns (variables) contain invalid zero values:

#Glucose
#BloodPressure
#SkinThickness
#Insulin
#BMI

#It is better to replace these zeros with NaN: missing values are then easy to count, and they can later be filled with suitable values

In [None]:
# Work on an independent deep copy so the raw `df` stays untouched.
# The previous `df(deep = True)` tried to *call* the DataFrame, which raises
# "TypeError: 'DataFrame' object is not callable"; `.copy()` is the method.
df_copy = df.copy(deep=True)

# Columns in which a 0 is treated as "not measured" rather than a real value.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Mark those zeros as NaN so they can be counted and imputed.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df_copy[zero_as_missing] = df_copy[zero_as_missing].replace(0, np.nan)

In [None]:
# Display the copy after the zero -> NaN replacement.
df_copy


In [None]:
# NaN count per column of the copy.
df_copy.isnull().sum()

In [None]:
# Summary statistics of the copy.
df_copy.describe()

In [None]:
# Non-null counts and dtypes of the copy.
df_copy.info()

In [None]:
df_copy['Glucose'].fillna(df_copy['Glucose'].mean(), inplace = True)

In [None]:
df_copy['BloodPressure'].fillna(df_copy['BloodPressure'].mean(), inplace = True)

In [None]:
df_copy['SkinThickness'].fillna(df_copy['SkinThickness'].median(), inplace = True)

In [None]:
df_copy['Insulin'].fillna(df_copy['Insulin'].median(), inplace = True)

In [None]:
df_copy['BMI'].fillna(df_copy['BMI'].median(), inplace = True)

In [None]:
df_copy.info()

If a dataset has large outliers,
I prefer the median.
E.g.: 99% of household incomes are below 100, and 1% are above 500.

On the other hand,
if we work with the wear of clothes that customers give to a dry cleaner
(assuming that the dry cleaner's operators fill in this field intuitively),
I'll fill missing values with the mean value of wear.

In [None]:
# Box plot of BMI on the copy (compare with the earlier plot on raw `df`).
sn.boxplot(x='BMI',data=df_copy)
plt.show()


In [None]:
# Summary statistics of the copy at this point.
df_copy.describe()

#Scaling the data
#Outlier!!! The Silent Killer

In [None]:
# Column labels of the copy.
df_copy.columns

In [None]:
# BMI distribution per Outcome value, before scaling.
sn.boxplot(x='Outcome', y='BMI',data=df_copy)
plt.show()

In [None]:
# Insulin distribution per Outcome value, before scaling.
sn.boxplot(x='Outcome', y='Insulin',data=df_copy)
plt.show()

In [None]:
# Scaler used to standardise the feature columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler on every column of the cleaned copy except the target.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling. Consider fitting
# on the training rows only - confirm whether this matters for the analysis.
scaler.fit(df_copy.drop('Outcome',axis=1))

In [None]:
# Scale the *cleaned* frame the scaler was fitted on.
# The previous code transformed the raw `df`, which reintroduced the
# placeholder zeros (Glucose, BloodPressure, SkinThickness, Insulin, BMI) and
# discarded the imputation done on `df_copy`.
scaled_features = scaler.transform(df_copy.drop('Outcome',axis=1))

In [None]:
scaled_features

In [None]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()

In [None]:
df_feat['Outcome']= df['Outcome']

In [None]:
df_feat.head()

In [None]:
sn.boxplot(x='Outcome',y='Insulin',data=df_feat)
plt.show()

In [None]:
sn.boxplot(x='Outcome', y='BMI',data=df_feat)
plt.show()

# Decide Dependent and Independent Elements in Data Set

In [None]:
x=df_feat.drop('Outcome',axis=1)

In [None]:
x

In [None]:
y=df_feat['Outcome']

In [None]:
y

#Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test =train_test_split(x,y,test_size=0.3)

In [None]:
X_train.info()

In [None]:
X_test.info()

#Machine Learning Models

1. Linear
2. Logistic
3. SVC
4. KNN
5. K-Means
6. Decision Tree
7. Random Forest
8. Naive Bayes

#Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logmodel = LogisticRegression()

In [None]:
logmodel.fit(X_train,y_train)

In [None]:
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))

#SVM

In [None]:
from sklearn.svm import SVC

In [None]:
SVM = SVC()

In [None]:
SVM.fit(X_train,y_train)

In [None]:
predictions = SVM.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))

#KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
knn.fit(X_train,y_train)

In [None]:
predictions = knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))

#K-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
Kmean = KMeans(n_clusters=2)

In [None]:
Kmean.fit(X_train,y_train)

In [None]:
predictions = Kmean.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))

#Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier()

In [None]:
dtree.fit(X_train,y_train)

In [None]:
predictions = dtree.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))

#Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=100)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
predictions = rfc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))

#Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
 NB= GaussianNB()

In [None]:
NB.fit(X_train,y_train)

In [None]:
predictions = NB.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))

#Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Instantiated only; this notebook never fits or uses `lm`.
lm = LinearRegression()

# Linear Regression cannot be applied to this data: the target (Outcome) is a class label, not a continuous value

#Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
LDA = LinearDiscriminantAnalysis()

In [None]:
LDA.fit(X_train,y_train)

In [None]:
predictions = LDA.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,predictions))