# Random Forest Project

### Predicting diabetes

In [40]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

#### Step 1: Loading the dataset

In [42]:
data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [43]:
# Obtain dimensions
data.shape

(768, 9)

In [44]:
print(f"The DataFrame contains {len(data)} records (rows) and {len(data.columns)} variables (columns).\n")

The DataFrame contains 768 records (rows) and 9 variables (columns).



In [45]:
# Obtain information about data types and non-null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [46]:
data_c=data.copy()
data_c.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Eliminate duplicates

In [47]:
data_c = data.drop_duplicates().reset_index(drop = True)
data_c.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [48]:
data_c.shape

(768, 9)

In [49]:
numerical_columns=[data_c.columns]
numerical_columns

[Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
       dtype='object')]

#### Step 2: Build a random forest

In [50]:
X = data_c.drop(['Outcome'], axis=1)
Y = data_c[['Outcome']]

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15)

In [63]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [64]:
predictions = rf.predict(X_test)

In [65]:
confusion_m = pd.DataFrame(confusion_matrix(y_test, predictions),columns=['No Diabetes', 'Diabetes'], index=['No', 'Yes'])
confusion_m

Unnamed: 0,No Diabetes,Diabetes
No,66,10
Yes,15,25


In [66]:
accuracy_score(y_test, predictions)

0.7844827586206896

In [39]:
print("The base random forest is better than the optimized decision tree.")

The base random forest is better than the optimized decision tree.


#### Step 3: Save the model

In [68]:
from pickle import dump

dump(rf, open("../models/ranfor_classifier_nestimators-200.sav", "wb"))