## Model Quality and Improvements -project-Simeon Omeda

# Problem Statement
As a data professional working for a pharmaceutical company, you need to develop a
model that predicts whether a patient will be diagnosed with diabetes. The model needs
to have an accuracy score greater than 0.85.


In [None]:
# Data Importation
import pandas as pd
import numpy as np
df = pd.read_csv('https://bit.ly/DiabetesDS')
df.head()



*   Data Exploration



In [3]:
# Checking the first 10 rows

df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [4]:
# Checking the last 10 rows

df.tail(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
758,1,106,76,0,0,37.5,0.197,26,0
759,6,190,92,0,0,35.5,0.278,66,1
760,2,88,58,26,16,28.4,0.766,22,0
761,9,170,74,31,0,44.0,0.403,43,1
762,9,89,62,0,0,22.5,0.142,33,0
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [5]:
# Checking random samples 10 rows 

df.sample(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
403,9,72,78,25,0,31.6,0.28,38,0
295,6,151,62,31,120,35.5,0.692,28,0
735,4,95,60,32,0,35.4,0.284,28,0
143,10,108,66,0,0,32.4,0.272,42,1
540,8,100,74,40,215,39.4,0.661,43,1
703,2,129,0,0,0,38.5,0.304,41,0
425,4,184,78,39,277,37.0,0.264,31,1
196,1,105,58,0,0,24.3,0.187,21,0
28,13,145,82,19,110,22.2,0.245,57,0
264,4,123,62,0,0,32.0,0.226,35,1


In [6]:

# Checking total number of rows and columns of our dataset

df.shape

(768, 9)

In [7]:
# Checking datatypes of our dataset

df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

*   Data Cleaning




In [9]:

# strip leading and trailing spaces and setting all columns to lower
df.columns = df.columns.str.strip().str.lower()
df.head()
     

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:

# Check for null values
df.isna().sum()

pregnancies                 0
glucose                     0
bloodpressure               0
skinthickness               0
insulin                     0
bmi                         0
diabetespedigreefunction    0
age                         0
outcome                     0
dtype: int64

In [11]:

# Check for duplicates
df.duplicated().sum()

0

● Data Preparation


In [14]:
# features and target
features = df.drop(['outcome'], axis = 1)
target = df.loc[:,"outcome"].values



* Data Modeling (Using Decision Trees, Random Forest and Logistic Regression)





In [22]:
## Decision Trees

import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 

features = df.drop(['outcome'], axis = 1)
target = df.loc[:,"outcome"].values

features_valid = df.drop(['outcome'], axis = 1)
target_valid = df.loc[:,"outcome"].values

model = DecisionTreeClassifier(random_state=42,max_depth=5)

model.fit(features, target)


train_predictions = model.predict(features)
test_predictions = model.predict(features_valid)


print('Decision Trees Accuracy Score=', accuracy_score(target_valid, train_predictions))

Decision Trees Accuracy Score= 0.8372395833333334


In [23]:
## Random Forest

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42, n_estimators=3)

model.fit(features, target)
print ('Random Forest Accuracy Score=', model.score(features, target))


Random Forest Accuracy Score= 0.9466145833333334


In [20]:
## Logistic Regression

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

model = LogisticRegression( random_state=42, solver='liblinear')
model.fit(features, target)
print ('Logistic Regression Accuracy Score=', model.score(features, target))

Logistic Regression Accuracy Score= 0.7747395833333334


● Findings and Recommendations


```
Logistic Regression Accuracy Score= 0.7747395833333334
Decision Trees Accuracy Score= 0.8372395833333334
Random Forest Accuracy Score= 0.9466145833333334

Random Forest is the most suitable model because it has an accuracy of more than 0.85
```

