In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [None]:
import chardet

# Guess the CSV's character encoding from its first 2000 bytes only.
with open("/kaggle/input/diabetes-dataset/diabetes2.csv", 'rb') as raw_file:
    head_bytes = raw_file.read(2000)
result = chardet.detect(head_bytes)

# Show the detector's guess
print(result)

In [None]:
# Load the diabetes CSV as ASCII text and preview its first 11 rows.
df = pd.read_csv(
    '/kaggle/input/diabetes-dataset/diabetes2.csv',
    encoding='ascii',
)
df.head(11)

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# (rows, columns) before any cleaning, for comparison after de-duplication.
df.shape

Remove any duplicate rows from the dataset

In [None]:
# Keep the first occurrence of each fully duplicated row and discard the rest.
df = df.drop_duplicates(keep='first')

In [None]:
# Re-check the shape after de-duplication; an unchanged row count means nothing was dropped.
df.shape # No duplicates

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
# Bar chart of how many rows fall under each Outcome value.
outcome_counts = df['Outcome'].value_counts()
outcome_counts.plot.bar()
plt.show()

In [None]:
df.groupby('Outcome').mean() # mean of every column, computed separately for each Outcome value (diabetic vs. non-diabetic)

**Check out the data in Glucose Column**

**Check the diabetic patients**

In [None]:
# Rows labelled diabetic (Outcome == 1).
df.loc[df['Outcome'] == 1]

**We have to remove the outliers, otherwise they would distort the results. Diabetic women in this dataset have a mean glucose of 140+, which indicates that glucose is high for diabetic patients; a glucose value of 0 is therefore not plausible, and these rows should be removed.**

In [None]:
# Glucose distribution for the diabetic rows, followed by the diabetic rows
# whose glucose reading is exactly 0.
is_diabetic = df['Outcome'] == 1
plt.boxplot(df.loc[is_diabetic, 'Glucose'])
plt.show()
df.loc[is_diabetic & (df['Glucose'] == 0)]

In [None]:
# Discard diabetic rows with a glucose reading of 0, then re-plot the
# diabetic glucose distribution and report the new frame size.
zero_glucose_diabetic = (df['Outcome'] == 1) & (df['Glucose'] == 0)
df = df.loc[~zero_glucose_diabetic]

plt.boxplot(df.loc[df['Outcome'] == 1, 'Glucose'])
plt.show()
df.shape

**Check the non-diabetic patients**

In [None]:
# Rows labelled non-diabetic (Outcome == 0).
df.loc[df['Outcome'] == 0]

In [None]:
# Glucose distribution for the non-diabetic rows.
plt.boxplot(df.loc[df['Outcome'] == 0, 'Glucose'])
plt.show()

In [None]:
# Summary statistics of glucose among the non-diabetic rows.
df.loc[df['Outcome'] == 0, 'Glucose'].describe()

In [None]:
# 1.5 * IQR fences computed from the glucose values of non-diabetic rows.
# Q3 is the 75th percentile (the upper quartile).
glucose_neg = df.loc[df['Outcome'] == 0, 'Glucose']
Q1, Q3 = glucose_neg.quantile(0.25), glucose_neg.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop only non-diabetic rows lying outside the fences; diabetic rows are kept as they are.
outside_fences = (df['Glucose'] < lower_fence) | (df['Glucose'] > upper_fence)
df = df[~(outside_fences & (df['Outcome'] == 0))]



In [None]:
# Non-diabetic glucose after filtering: boxplot, then summary statistics.
glucose_neg_after = df.loc[df['Outcome'] == 0, 'Glucose']
plt.boxplot(glucose_neg_after)
plt.show()

glucose_neg_after.describe()

**Check out the data in BloodPressure Column**


**Check the diabetic women**

In [None]:
# Summary statistics of blood pressure among the diabetic rows.
df.loc[df['Outcome'] == 1, 'BloodPressure'].describe()

In [None]:
# Blood-pressure distribution for the diabetic rows.
plt.boxplot(df.loc[df['Outcome'] == 1, 'BloodPressure'])
plt.show()

In [None]:
# 1.5 * IQR fences computed from the blood-pressure values of diabetic rows.
# Q3 is the 75th percentile (the upper quartile).
bp_pos = df.loc[df['Outcome'] == 1, 'BloodPressure']
Q1, Q3 = bp_pos.quantile(0.25), bp_pos.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop only diabetic rows lying outside the fences; non-diabetic rows are kept as they are.
outside_fences = (df['BloodPressure'] < lower_fence) | (df['BloodPressure'] > upper_fence)
df = df[~(outside_fences & (df['Outcome'] == 1))]



In [None]:
# Blood-pressure distribution for the diabetic rows after filtering.
plt.boxplot(df.loc[df['Outcome'] == 1, 'BloodPressure'])
plt.show()

In [None]:
# Diabetic blood-pressure statistics and the current frame size, one per line.
bp_pos_stats = df.loc[df['Outcome'] == 1, 'BloodPressure'].describe()
print(bp_pos_stats, df.shape, sep='\n')

**Check the non-diabetic women**

In [None]:
# Summary statistics of blood pressure among the non-diabetic rows.
df.loc[df['Outcome'] == 0, 'BloodPressure'].describe()

In [None]:
# Blood-pressure distribution for the non-diabetic rows.
plt.boxplot(df.loc[df['Outcome'] == 0, 'BloodPressure'])
plt.show()

In [None]:
# 1.5 * IQR fences computed from the blood-pressure values of non-diabetic rows.
# Q3 is the 75th percentile (the upper quartile).
bp_neg = df.loc[df['Outcome'] == 0, 'BloodPressure']
Q1, Q3 = bp_neg.quantile(0.25), bp_neg.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop only non-diabetic rows lying outside the fences; diabetic rows are kept as they are.
outside_fences = (df['BloodPressure'] < lower_fence) | (df['BloodPressure'] > upper_fence)
df = df[~(outside_fences & (df['Outcome'] == 0))]



In [None]:
# Blood-pressure distribution for the non-diabetic rows after filtering.
plt.boxplot(df.loc[df['Outcome'] == 0, 'BloodPressure'])
plt.show()

In [None]:
# Summary statistics of non-diabetic blood pressure after filtering.
df.loc[df['Outcome'] == 0, 'BloodPressure'].describe()

In [None]:
# Preview the first 10 rows of the filtered frame.
df.head(10)

**Separating the dataset**

In [None]:
# Target is the Outcome column; features are every remaining column.
Y = df['Outcome']
X = df.drop(columns='Outcome')

print(X, Y)


**Data Standardization**

In [None]:
# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into preprocessing. train_test_split chooses rows from the number of
# samples, the stratify labels and random_state alone, so splitting row
# positions here with the same settings as the train/test split cell picks
# exactly the rows that become X_train there. Keep the two argument sets in sync.
train_idx, _test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, stratify=Y, random_state=1
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])

# Every row (train and test) is transformed with the training-set mean and std.
Standard_data = scaler.transform(X)

In [None]:
# From here on X refers to the standardized feature array; show it with the target.
X = Standard_data
print(X, Y, sep='\n')

***Train Test Split***

In [None]:
# Stratified 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=1,
)


In [None]:
# Shapes of the full, training and test feature arrays, in that order.
print(*(part.shape for part in (X, X_train, X_test)))

**Train the Model**

**(1) SVM Classifier**

In [None]:
# Support vector classifier with a linear kernel, fitted on the training split.
classifier1 = svm.SVC(kernel='linear')
classifier1.fit(X_train, Y_train)

**Model Evaluation**

**Determining Accuracy Score**

In [None]:
# Accuracy of the SVM on the training split.
# accuracy_score takes the true labels first and the predictions second.
X_train_predict = classifier1.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_predict)
training_data_accuracy

In [None]:
# Accuracy of the SVM on the held-out test split.
# accuracy_score takes the true labels first and the predictions second.
X_test_predict = classifier1.predict(X_test)

test_data_accuracy = accuracy_score(Y_test, X_test_predict)

test_data_accuracy

**Predicting Model**

In [None]:
# Two hand-entered sample records; values are assumed to follow the order of
# the feature columns (every df column except Outcome) -- confirm against df.
input_data1 = (6,148,72,35,0,33.6,0.627,50)
input_data2 = (1,85,66,29,0,26.6,0.351,31)

# Wrap the chosen record in a one-row DataFrame carrying the feature column
# names. The scaler was fitted on a DataFrame, and handing it a bare array
# makes scikit-learn (>= 1.0) warn about missing feature names.
feature_columns = df.columns.drop('Outcome')
input_frame = pd.DataFrame([input_data1], columns=feature_columns)

# Standardize the data with the already-fitted scaler
std_data = scaler.transform(input_frame)

# Prediction
prediction = classifier1.predict(std_data)

# predict returns a one-element array; test its single value explicitly
if prediction[0] == 0:
    print('Person is Not Diabetic')
else:
    print('Person is Diabetic')

**Train the Model**

**(2) Logistic Regression**

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
classifier2 = LogisticRegression()


classifier2.fit(X_train, Y_train)

**Model Evaluation**

**Determining Accuracy Score**

In [None]:
# Accuracy of the logistic regression on the training split.
# accuracy_score takes the true labels first and the predictions second.
X_train_predict = classifier2.predict(X_train)

training_data_accuracy = accuracy_score(Y_train, X_train_predict)

print(training_data_accuracy)

**Test Model**

In [None]:
# Accuracy of the logistic regression on the held-out test split.
# accuracy_score takes the true labels first and the predictions second.
X_test_predict = classifier2.predict(X_test)

test_data_accuracy = accuracy_score(Y_test, X_test_predict)

print(test_data_accuracy)

**Predicting Model**

In [None]:
# Two hand-entered sample records; values are assumed to follow the order of
# the feature columns (every df column except Outcome) -- confirm against df.
input_data1 = (6,148,72,35,0,33.6,0.627,50)
input_data2 = (1,85,66,29,0,26.6,0.351,31)

# Wrap the chosen record in a one-row DataFrame carrying the feature column
# names. The scaler was fitted on a DataFrame, and handing it a bare array
# makes scikit-learn (>= 1.0) warn about missing feature names.
feature_columns = df.columns.drop('Outcome')
input_frame = pd.DataFrame([input_data2], columns=feature_columns)

# Standardize the data with the already-fitted scaler
std_data = scaler.transform(input_frame)

# Prediction
prediction = classifier2.predict(std_data)

# predict returns a one-element array; test its single value explicitly
if prediction[0] == 0:
    print('Person is Not Diabetic')
else:
    print('Person is Diabetic')