In [None]:
#Importing required packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline
import os

In [None]:
# Read the red-wine quality dataset from the notebook's input directory.
wine_raw = pd.read_csv(
    filepath_or_buffer="../input/winequality-red.csv",
    low_memory=False,
)

In [None]:
# Summary statistics for the columns of the raw frame.
wine_raw.describe()

In [None]:
#!pip install fastai==0.7.0

In [None]:
from fastai.imports import *
#from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [None]:
# Bucket the numeric quality score into three ordered categories:
# (0, 4] -> poor, (4, 6] -> normal, (6, 10] -> excellent.
bins = [0, 4, 6, 10]
labels = ["poor", "normal", "excellent"]
wine_raw['quality_cat'] = pd.cut(x=wine_raw['quality'], bins=bins, labels=labels)
# The numeric score is replaced by its category, so drop the original column.
wine_raw = wine_raw.drop(columns='quality')

In [None]:
# Show the last 50 rows, which now carry the quality_cat column.
wine_raw.tail(50)

In [None]:
# Separate the target (quality category) from the feature columns.
y = wine_raw['quality_cat']
df = wine_raw.drop(columns='quality_cat')

In [None]:
# Peek at seven randomly chosen rows of the feature frame.
df.sample(7)
#df, y, nas = proc_df(wine_raw, 'quality_cat')

We're going to reshuffle the data so that we avoid any effect from the original row ordering (just in case).

Note: This is not time-series data, so it's okay to reshuffle it.

In [None]:
# Shuffle once with a fixed seed so the train/validation split is reproducible.
wine_raw = wine_raw.sample(frac=1, axis=0, random_state=42).reset_index(drop=True)
# Re-derive the target and the features from the shuffled frame. Without this,
# the split below would still slice the unshuffled `y`/`df` built earlier, so
# the reshuffle would never reach the data the models are trained on and
# raw_train/raw_valid would not line up with X_train/X_valid.
y = wine_raw['quality_cat']
df = wine_raw.drop('quality_cat', axis=1)

In [None]:
def split_vals(a, n):
    """Split ``a`` at position ``n`` and return independent copies of both parts.

    Returns a ``(first_n, rest)`` tuple: ``first_n`` holds ``a[:n]`` and
    ``rest`` holds ``a[n:]``, each copied via ``.copy()``.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail
# Hold out the last 399 rows for validation; everything before them is training.
n_valid = 399
n_trn = df.shape[0] - n_valid

# Apply the same cut point to the full frame, the features and the target.
raw_train, raw_valid = split_vals(wine_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

**Trying out RandomForest Classifier**

In [None]:
m = RandomForestClassifier(n_jobs=-1)
%time m.fit(X_train, y_train)

In [None]:
import math
#The aim is to reduce the rmse error and increase the score
def rmse(x, y):
    """Return the root-mean-squared error between ``x`` and ``y``.

    Both arguments are numeric sequences or arrays of the same shape (or
    broadcastable against each other). The result is a Python float.
    """
    # np.subtract needs the two operands as separate arguments; the previous
    # single-argument call `np.subtract(x-y)` raised a TypeError.
    return math.sqrt((np.subtract(x, y) ** 2).mean())

def print_score(m):
    """Print the scores of the fitted estimator ``m`` as a single list.

    The list holds ``m.score`` on the training split, then ``m.score`` on the
    validation split, and finally ``m.oob_score_`` when the estimator exposes
    that attribute. The rmse entries are deliberately left out.
    """
    scores = [m.score(X_train, y_train)]
    scores.append(m.score(X_valid, y_valid))
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [None]:
# Print the train and validation scores for the forest fitted above.
print_score(m)

It's interesting to see that we get 99.5% accuracy on the training set and 84.2% accuracy on the validation set. This gap suggests that the model may be over-fitting the training data.

Let's try to plot the predictions and see how they improve as we combine more and more trees.

In [None]:
# Stack the validation predictions of every individual tree: one row per tree.
# NOTE(review): the individual trees appear to return encoded class indices
# rather than the string labels - confirm against m.classes_.
preds = np.stack([t.predict(X_valid) for t in m.estimators_])
# Use .iloc for the first validation row: y_valid keeps the row labels it had
# before the split, so the label 0 is not present and y_valid[0] raises KeyError.
preds[:,0], np.mean(preds[:,0]), y_valid.iloc[0]

In [None]:
# Display the stacked per-tree predictions (one row per tree).
preds

When we run RandomForestClassifier, it creates 10 decision trees and the output is the mean of all 10 results. Here, we take the trees one at a time and plot the prediction. The first point shows the accuracy for a single sample (one row) using just one tree, the second point is the average of the first and second trees for the same sample, the third is the mean of the first three trees, and so on.



In [None]:
# Integer category codes of the validation labels, as a plain Python list.
y_plot = y_valid.cat.codes.tolist()

In [None]:
# Convert the list of codes back into a NumPy array.
y_plot = np.array(y_plot)

In [None]:
# Show the codes as floats. The builtin `float` is used because the `np.float`
# alias no longer exists in current NumPy releases and raises AttributeError.
y_plot.astype(float)

In [None]:
# Float (64-bit) copy of the per-tree prediction matrix.
preds1 = preds.astype(np.float64)

In [None]:
# Accuracy of the full forest on the validation split.
metrics.accuracy_score(y_true=y_valid, y_pred=m.predict(X_valid))

Let's try running the same with 20 trees (n_estimators=20)

In [None]:
# Same forest as before, but with 20 trees.
m = RandomForestClassifier(n_estimators=20, n_jobs=-1).fit(X_train, y_train)
print_score(m)
# Disabled per-tree exploration, kept for reference:
#preds = np.stack([t.predict(X_valid) for t in m.estimators_])
#preds[:,0], np.mean(preds[:,0]), y_valid[0]
#plt.plot([metrics.r2_score(y_valid, np.mean(preds[:i+1], axis = 0)) for i in range(20)])

**Using Extra Trees Classifier**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble, fitted on a single worker (n_jobs=1).
e = ExtraTreesClassifier(n_jobs=1).fit(X_train, y_train)
print_score(e)

Handling over-fitting

Out-of-Bag (OOB) technique: In this technique, each tree is trained on a subsample of the training rows and then tested on the rows that were left out of that subsample. For instance, if we have 100 training rows, we might use 50 of them to build a tree and then test the tree on the remaining rows. Then we start over again with a different subset for the next tree.

In [None]:
# Ten-tree forest that also records the out-of-bag score.
m = RandomForestClassifier(
    n_estimators=10,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

MaxFeatures
max_features: This parameter tells the random forest to consider only a subset of the features (columns) when deciding each split. This helps ensure that we do not miss combined feature effects; in other words, two or more features together can be a more powerful indicator than a single feature.

In [None]:
# 40 trees; max_features=0.6.
m = RandomForestClassifier(
    n_estimators=40,
    max_features=0.6,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

In [None]:
# 40 trees; max_features=0.7.
m = RandomForestClassifier(
    n_estimators=40,
    max_features=0.7,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

In [None]:
# 40 trees; max_features=0.5.
m = RandomForestClassifier(
    n_estimators=40,
    max_features=0.5,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

In [None]:
# 40 trees; max_features=0.4.
m = RandomForestClassifier(
    n_estimators=40,
    max_features=0.4,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

In [None]:
# 40 trees; max_features=0.3.
m = RandomForestClassifier(
    n_estimators=40,
    max_features=0.3,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

At this point, let's pause and take a look at how a tree looks and where the corresponding split points are:



In [None]:
# Shallow forest (depth capped at 3) so that a single tree stays readable.
m = RandomForestClassifier(
    n_estimators=40,
    max_depth=3,
    max_features=0.3,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)
# Tree drawing is disabled:
#draw_tree(m.estimators_[0], df, precision=3)

In [None]:
#set_rf_samples(300)
m = RandomForestClassifier(n_jobs=-1, min_samples_leaf=3, max_features=0.5,  n_estimators =40, oob_score = True)
%time m.fit(X_train, y_train)
print_score(m)

Feature Importance:

In [None]:
#reset_rf_samples()

In [None]:
# Forest whose feature importances are inspected in the following cells.
m = RandomForestClassifier(
    n_estimators=40,
    max_features=0.4,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

In [None]:
# Inspect one random row from the training part of the full frame.
raw_train.sample(1)

In [None]:
# Column names of the training features.
X_train.columns

In [None]:
# Importance scores exposed by the fitted forest.
m.feature_importances_

In [None]:
# Pair each feature name with its importance and rank from most to least important.
fi = (
    pd.DataFrame({'cols': X_train.columns, 'imp': m.feature_importances_})
    .sort_values(by='imp', ascending=False)
)

In [None]:
#fi = rf_feat_importance(m, X_train)
# Show the first 15 rows of the importance table.
fi[:15]

In [None]:
# Plot importance against feature name.
fi.plot(x='cols', y='imp', figsize=(10, 6), legend=False)

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of the 'imp' column against the 'cols' column.

    ``fi`` is a frame with 'cols' and 'imp' columns; whatever ``fi.plot``
    returns is passed back to the caller.
    """
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False)

In [None]:
# Bar chart of (up to) the first 30 rows of the importance table.
plot_fi(fi[:30])

In [None]:
# Larger forest: 100 trees; max_features=0.6.
m = RandomForestClassifier(
    n_estimators=100,
    max_features=0.6,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)

In [None]:
# Largest forest tried: 1000 trees; max_features=0.3.
m = RandomForestClassifier(
    n_estimators=1000,
    max_features=0.3,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
print_score(m)