In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./input\heart-attack-analysis-prediction-dataset\heart.csv
./input\red-wine-quality-cortez-et-al-2009\winequality-red.csv
./input\restaurant-scores-san-francisco\Restaurant_Scores_-_LIVES_Standard.csv
./input\stroke-prediction-dataset\healthcare-dataset-stroke-data.csv


# Red Wine Quality Prediction using Random Forest
### Loading Dataset and Preparing Dataset
#### 1. Reading File

In [2]:
winequality = pd.read_csv("./input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
winequality.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


#### 2. Splitting into X and y

In [3]:
X=winequality.drop("quality",axis=1)
y=winequality["quality"]

#### 3. Splitting into Train and Test Data

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1279, 11), (320, 11), (1279,), (320,))

### Fitting into Model

In [5]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train,y_train)

RandomForestClassifier()

In [6]:
model.score(X_test,y_test)

0.7

### Prediction

In [7]:
y_preds = model.predict(X_test)
y_preds[:20]

array([5, 6, 5, 5, 5, 6, 5, 6, 7, 5, 6, 6, 5, 6, 7, 6, 5, 5, 5, 6],
      dtype=int64)

### Classification Report and Confusion Matrix

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        13
           5       0.80      0.79      0.80       156
           6       0.59      0.74      0.66       111
           7       0.66      0.53      0.58        36
           8       0.00      0.00      0.00         3

    accuracy                           0.70       320
   macro avg       0.34      0.34      0.34       320
weighted avg       0.67      0.70      0.68       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
conf_matrix = confusion_matrix(y_test,y_preds)
print(conf_matrix)

[[  0   0   1   0   0   0]
 [  0   0   8   5   0   0]
 [  0   0 123  33   0   0]
 [  0   0  20  82   9   0]
 [  0   0   1  16  19   0]
 [  0   0   0   2   1   0]]


In [10]:
accuracy_score(y_test,y_preds)

0.7

### Prediction Probability

In [11]:
model.predict_proba(X_test[:5])

array([[0.01, 0.03, 0.84, 0.11, 0.01, 0.  ],
       [0.  , 0.15, 0.39, 0.46, 0.  , 0.  ],
       [0.  , 0.  , 0.65, 0.35, 0.  , 0.  ],
       [0.04, 0.1 , 0.45, 0.37, 0.04, 0.  ],
       [0.05, 0.23, 0.51, 0.19, 0.02, 0.  ]])

In [12]:
np.random.seed(42)
from sklearn.model_selection import cross_val_score
for est in range(10,200,10):
    print(f"Trying with {est} estimators:")
    model = RandomForestClassifier(n_estimators= est).fit(X_train,y_train)
    score = model.score(X_test,y_test)*100
    print(f"Model Accuracy Score: {score}%")
    crossvalscore = np.mean(cross_val_score(model,X,y,cv=5)) *100
    print(f"Cross-Validation Score: {crossvalscore}%\n")

Trying with 10 estimators:
Model Accuracy Score: 65.3125%
Cross-Validation Score: 55.84561128526646%

Trying with 20 estimators:
Model Accuracy Score: 68.75%
Cross-Validation Score: 56.348354231974916%

Trying with 30 estimators:
Model Accuracy Score: 71.5625%
Cross-Validation Score: 56.28546238244514%

Trying with 40 estimators:
Model Accuracy Score: 67.5%
Cross-Validation Score: 55.34874608150471%

Trying with 50 estimators:
Model Accuracy Score: 70.625%
Cross-Validation Score: 57.848942006269596%

Trying with 60 estimators:
Model Accuracy Score: 67.1875%
Cross-Validation Score: 58.78683385579937%

Trying with 70 estimators:
Model Accuracy Score: 70.3125%
Cross-Validation Score: 56.160658307210035%

Trying with 80 estimators:
Model Accuracy Score: 69.375%
Cross-Validation Score: 58.09972570532915%

Trying with 90 estimators:
Model Accuracy Score: 70.0%
Cross-Validation Score: 56.78605015673981%

Trying with 100 estimators:
Model Accuracy Score: 68.75%
Cross-Validation Score: 55.28330

### Finding Best Estimator

In [13]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [i for i in range(10,200,10)]}
grid = GridSearchCV(RandomForestClassifier(),
                   param_grid,
                   cv=5)
grid.fit(X,y)

grid.best_params_

{'n_estimators': 150}

In [14]:
model = grid.best_estimator_
model = model.fit(X_train,y_train)

model.score(X_test,y_test)

0.696875

### Save Model and Reload Model

In [15]:
import pickle
pickle.dump(model,open("RedWineModel.pkl","wb"))

In [16]:
loadmodel = pickle.load(open("RedWineModel.pkl","rb"))
loadmodel.score(X_test,y_test)

0.696875