In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and print the full path of every
# file found, one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets
from sklearn import manifold

%matplotlib inline

# Overfitting

To explain overfitting, I think it's best if we look at a dataset. There is a red wine quality dataset (Cortez et al., 2009) which is quite famous. This dataset has 11 different attributes that
decide the quality of red wine.
These attributes include:
* fixed acidity
* volatile acidity
* citric acid
* residual sugar
* chlorides
* free sulfur dioxide
* total sulfur dioxide
* density
* pH
* sulphates
* alcohol

Based on these different attributes, we are required to predict the quality of red wine
which is a value between 0 and 10.

In [None]:
# Location of the red-wine quality CSV inside the Kaggle input mount.
wine_csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
# Read the whole file into a DataFrame.
Data = pd.read_csv(wine_csv_path)

In [None]:
# Preview the first five rows (head() defaults to n=5).
Data.head()

In [None]:
# Collect the distinct labels that occur in the target column and show them.
p = Data.quality.unique()
print(p)

The raw quality scores in this dataset, however, take only six
distinct values (3 through 8). We will thus re-map them to the consecutive labels 0 to 5.

In [None]:
# Re-label the raw quality scores (3..8) as consecutive class ids (0..5).
quality_mapping = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5}

# Series.map() turns every value that is not a key of the dict into NaN.
# Map into a temporary first and validate it, so that Data is left untouched
# if anything is unmapped -- e.g. when this cell is run a second time on
# labels that were already converted to 0..5.
mapped_quality = Data["quality"].map(quality_mapping)
unmapped_rows = mapped_quality.isna()
if unmapped_rows.any():
    raise ValueError(
        "quality values without an entry in quality_mapping: "
        f"{Data.loc[unmapped_rows, 'quality'].unique().tolist()} "
        "(was this cell already run on this DataFrame?)"
    )

# Every label was mapped, so the result can safely be stored as integers.
Data["quality"] = mapped_quality.astype(int)

In [None]:
# Shuffle the dataframe: sample(frac=1) returns every row exactly once, in
# random order. random_state pins that order so the train/test split below,
# and therefore every reported accuracy, is the same after
# "Restart Kernel -> Run All".
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old index.
# NOTE: re-running this cell without reloading Data shuffles the already
# shuffled frame again and yields a different order.
Data = Data.sample(frac=1, random_state=42).reset_index(drop=True)
Data.head()

In [None]:
# Number of leading rows (of the shuffled frame) used for training.
N_TRAIN = 1000

# The first N_TRAIN rows are selected for training.
data_train = Data.iloc[:N_TRAIN]
# Every remaining row is selected for testing/validation. For the 1599-row
# wine file this is the same 599 rows as before, but slicing from N_TRAIN
# keeps the two sets disjoint and exhaustive for any row count.
data_test = Data.iloc[N_TRAIN:]

We will now train a decision tree model on the training set. For the decision tree
model, I am going to use scikit-learn.

In [None]:
# import from scikit-learn
from sklearn import tree
from sklearn import metrics

# Initialize the decision tree classifier with a max_depth of 3.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits could otherwise be resolved
# differently from run to run.
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=42)

# Columns used as input features for the model (everything except the
# "quality" target).
cols = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Train the model on the selected features and the mapped quality labels.
clf.fit(data_train[cols], data_train.quality)
    


I have used a max_depth of 3 for the decision tree classifier.

I have left all other hyperparameters of this model at their default values.
Now, we test the accuracy of this model on the training set and the test set:


In [None]:
# Predict labels for the training split and the test split with the
# fitted tree.
train_predictions, test_predictions = (
    clf.predict(split[cols]) for split in (data_train, data_test)
)

# Accuracy on the data the model was trained on.
training_accuracy = metrics.accuracy_score(
    y_true=data_train.quality, y_pred=train_predictions
)

# Accuracy on the held-out test data.
test_accuracy = metrics.accuracy_score(
    y_true=data_test.quality, y_pred=test_predictions
)

In [None]:
# Report training accuracy first and test accuracy second, each with a
# label, so the two numbers cannot be mistaken for one another.
print(f"training accuracy: {training_accuracy:.4f}")
print(f"test accuracy:     {test_accuracy:.4f}")

The training and test accuracies are found to be 58.9% and 54.25%, respectively. Now we
increase the max_depth to 7 and repeat the process. This gives a training accuracy of
76.6% and a test accuracy of 57.3%.

Next, we calculate these accuracies for different values of max_depth and
make a plot.

In [None]:
# import scikit-learn tree and metrics
from sklearn import tree
from sklearn import metrics
# import matplotlib and seaborn
# for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# this is our global size of label text
# on the plots
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)
# This line ensures that the plot is displayed
# inside the notebook
%matplotlib inline
# initialize lists to store accuracies
# for training and test data
# we start with 50% accuracy
train_accuracies = [0.5]
test_accuracies = [0.5]
# iterate over a few depth values
for depth in range(1, 25):
 # init the model
 clf = tree.DecisionTreeClassifier(max_depth=depth)
 # columns/features for training
 # note that, this can be done outside
 # the loop
 cols = [
 'fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol'
 ]
 # fit the model on given features
 clf.fit(data_train[cols], data_train.quality)
 # create training & test predictions
 train_predictions = clf.predict(data_train[cols])
 test_predictions = clf.predict(data_test[cols])
 # calculate training & test accuracies
 train_accuracy = metrics.accuracy_score(
 data_train.quality, train_predictions
 )
 test_accuracy = metrics.accuracy_score(
 data_test.quality, test_predictions
 )

 # append accuracies
 train_accuracies.append(train_accuracy)
 test_accuracies.append(test_accuracy)
# create two plots using matplotlib
# and seaborn
plt.figure(figsize=(10, 5))
sns.set_style("whitegrid")
plt.plot(train_accuracies, label="train accuracy")
plt.plot(test_accuracies, label="test accuracy")
plt.legend(loc="upper left", prop={'size': 15})
plt.xticks(range(0, 26, 5))
plt.xlabel("max_depth", size=20)
plt.ylabel("accuracy", size=20)
plt.show()


We see that the best score for test data is obtained when max_depth has a value of 14. As we keep increasing the value of this parameter, test accuracy remains the
same or gets worse, but the training accuracy keeps increasing. It means that our
simple decision tree model keeps learning about the training data better and better
with an increase in max_depth, but the performance on test data does not improve
at all. 


This is called overfitting.