In [1]:
import pandas as pd

Read training data into Python

In [2]:
# Load the labelled training set; the path is relative to the notebook's working directory.
data = pd.read_csv("pima-indians-diabetes.csv")

Check types of the variables

In [3]:
# Show the dtype pandas inferred for each column.
data.dtypes

index         int64
preg          int64
plasma        int64
pressure      int64
skin          int64
insulin       int64
bmi         float64
pedigree    float64
age           int64
class         int64
dtype: object

In [6]:
# Preview the first rows (head() returns five rows by default).
data.head()

Unnamed: 0,index,preg,plasma,pressure,skin,insulin,bmi,pedigree,age,class
0,67,2,109,92,0,0,42.7,0.845,54,0
1,165,6,104,74,18,156,29.9,0.722,41,1
2,577,2,118,80,0,0,42.9,0.693,21,1
3,545,8,186,90,35,225,34.5,0.423,37,1
4,42,7,106,92,18,0,22.7,0.235,48,0


### Data info
`preg`: Number of times pregnant

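`plasma`: Plasma glucose concentration 2 hours into an oral glucose tolerance test

`pressure`: Diastolic blood pressure (mm Hg)

`skin`: Triceps skin-fold thickness (mm)

`insulin`: 2-hour serum insulin (mu U/ml)

`bmi`: Body mass index (weight in kg / (height in m)^2)

`pedigree`: Diabetes pedigree function

`age`: Age (years)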

`class`: Label (1 for diabetes)

In [8]:
# Feature columns used for training; edit this list to add or drop features.
cols_to_include = [
    "preg",
    "plasma",
    "pressure",
    "skin",
    "insulin",
    "bmi",
    "pedigree",
    "age",
]

# X: feature matrix as a NumPy array; Y: the `class` label column as a Series.
Y = data["class"]
X = data[cols_to_include].values

In [26]:
# (number of 0 labels, number of 1 labels); summing Y counts the 1s because labels are 0/1.
(len(Y) - Y.sum(), Y.sum())

(347, 190)

In [9]:
# (number of rows, number of columns selected in cols_to_include)
X.shape

(537, 8)

In [13]:
# One label per row; the length should match the first dimension of X.
Y.shape

(537,)

In [27]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Min-max scale every feature using the ranges seen in the training data.
#
# The scaler is fitted on the raw columns taken from `data` rather than on `X`.
# This cell overwrites `X` with the scaled matrix, so fitting on `X` would make a
# second execution fit on already-scaled values and silently turn `scaler` into
# an identity transform. Reading from `data` keeps the cell idempotent.
scaler = MinMaxScaler()

# Fit and transform in one step; X now holds the scaled training features.
X = scaler.fit_transform(data[cols_to_include].values)

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Candidate tree depths: 1, 11, 21, ..., 91.
max_depth_list = np.arange(1,100,10)

# Mean 10-fold cross-validated F1 for each candidate depth, in the same order
# as max_depth_list.
cv_scores = []
for depth in max_depth_list:
    # A fixed random_state makes every forest use the same seed, so score
    # differences between depths reflect the depth itself rather than
    # run-to-run randomness, and the search is reproducible on re-run.
    RF = RandomForestClassifier(max_depth=depth, random_state=42)
    cv_scores.append(np.mean(cross_val_score(RF, X, Y, cv=10, scoring="f1")))

In [35]:
# Select the depth with the highest mean cross-validated F1 score.
best_max_depth = max_depth_list[np.argmax(cv_scores)]
# Bare expression in the middle of the cell: the recorded output shows only the
# final expression (cv_scores), not this value.
best_max_depth
# One mean F1 score per entry of max_depth_list, in the same order.
cv_scores

[0.2428788602701646,
 0.6525635033529771,
 0.6375219429089398,
 0.6432117650500003,
 0.6402635599694424,
 0.6529701181520068,
 0.6511275941924849,
 0.6476072301036433,
 0.652939615528908,
 0.6407011211731325]

In [37]:
# Implement here model evaluation
# Refit a forest with the selected depth on the whole training set (X, Y).
# random_state is fixed so that re-running the notebook rebuilds the same
# model and therefore produces the same predictions.
RF = RandomForestClassifier(max_depth=best_max_depth, random_state=42)
RF.fit(X, Y)

In [42]:
# In-sample predictions: X is the same data RF was fitted on, not a held-out set.
Y_train_pred = RF.predict(X)

In [47]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Training-set (in-sample) scores: Y_train_pred was produced from the same rows
# the model was fitted on, so these values describe fit to the training data
# rather than performance on unseen data.
Y_train_true = np.array(Y)
precision, recall, f1 = (
    metric(Y_train_true, Y_train_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 1.0
Recall: 1.0
F1 Score: 1.0


Read eval data into Python and save the predictions

In [50]:
# Load the unlabelled evaluation set (same columns as the training data, without `class`).
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the
# kernel session; later cells refer to this DataFrame by this name.
eval = pd.read_csv("pima-indians-diabetes_eval_no_class.csv")
# Show the first row as a quick sanity check of the columns.
eval.head(1)

Unnamed: 0,index,preg,plasma,pressure,skin,insulin,bmi,pedigree,age
0,171,6,134,70,23,130,35.4,0.542,29


In [53]:
# Add here the predictions for the model
# Scale the evaluation features with the scaler that was fitted on the training
# data. Fitting a fresh MinMaxScaler on the evaluation set would stretch that
# set's own min/max to [0, 1], putting its features on a different scale from
# the one RF was trained on (and would also overwrite the training scaler).
# `.values` passes a plain array, matching how the scaler was fitted.
X_test = scaler.transform(eval[cols_to_include].values)

# Store the predicted label next to the input rows.
eval['class'] = RF.predict(X_test)

In [54]:
# Write the evaluation rows together with the predicted `class` column;
# index=False leaves the DataFrame's row index out of the file.
eval.to_csv("predictions.csv", index=False)