## 1. Dimensionality Reduction -> Clustering: 
#### 1. t-SNE and PCA 
Both are dimensionality reduction methods used for visualization, to find groups in high-dimensional data. 

t-SNE differs from PCA by preserving only small pairwise distances or local similarities whereas PCA is concerned with preserving large pairwise distances to maximize variance. 

- PCA: PCA is a linear dimension reduction technique that seeks to maximize variance and preserves large pairwise distances. Things that are different end up far apart.

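- t-SNE: The t-SNE algorithm calculates a similarity measure between pairs of instances in the high-dimensional space and in the low-dimensional space. It then minimizes the mismatch between these two sets of similarities using a cost function (the Kullback–Leibler divergence), so that points that are close together in the original space stay close together in the embedding. 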

LINK: https://www.kdnuggets.com/2018/08/introduction-t-sne-python.html

##### T-SNE,  Manifold - Unsupervised Cluster Visualization and Identification

In [None]:
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 
import seaborn as sns

from sklearn import datasets
from sklearn import manifold 
%matplotlib inline

In [None]:
# Fetch the 'mnist_784' dataset (version 1) from OpenML as an (X, y) pair.
# as_frame=False forces NumPy arrays: recent scikit-learn releases otherwise
# return a pandas DataFrame for this dataset, which breaks the positional
# indexing (`pixel_vals[2, :]`, `pixel_vals[:3000, :]`) used in later cells.
data = datasets.fetch_openml(
    'mnist_784', version=1, return_X_y=True, as_frame=False
)

In [None]:
# `data` is the (features, labels) pair produced by return_X_y=True.
pixel_vals = data[0]
# Labels are converted to integers so they can be used as numeric classes.
targets = data[1].astype(int)

In [None]:
# Show the dimensions of the feature array.
pixel_vals.shape

In [None]:
# Take the sample at row index 2 and reshape its flat pixel vector to 28x28.
img1 = pixel_vals[2,:].reshape(28,28)

In [None]:
# Render the reshaped sample as a grayscale image.
plt.imshow(img1, cmap = 'gray')

In [None]:
# t-SNE configured for a 2-component embedding; random_state is fixed so that
# reruns start from the same seed.
tsne = manifold.TSNE(n_components=2, random_state=101)

In [None]:
# Fit t-SNE on the first 3000 rows only and return their embedded coordinates
# (presumably subsampled to keep the runtime manageable).
transformed_data= tsne.fit_transform(pixel_vals[:3000, :])

In [None]:
# Display the embedded coordinates.
transformed_data

In [None]:
# Put the two embedding coordinates and the matching first 3000 labels side by
# side in one frame with columns x, y and targets.
tsne_df = pd.DataFrame(
    np.column_stack((transformed_data, targets[:3000])),
    columns=['x', 'y', 'targets'],
)

In [None]:
# Cast the labels back to integers. The column is replaced by plain
# assignment rather than `tsne_df.loc[:, 'targets'] = ...`, because
# pandas >= 2.0 writes `.loc[:, col]` assignments into the existing column in
# place, which keeps its float dtype and leaves the cast without effect.
tsne_df['targets'] = tsne_df['targets'].astype(int)

In [None]:
# Scatter the embedding with one colour per label and add a legend.
# `height` is used instead of `size`: seaborn renamed the argument in 0.9 and
# later removed the old name, so `size=8` raises on current releases.
grid = sns.FacetGrid(tsne_df, hue='targets', height=8)
grid.map(plt.scatter, 'x', 'y').add_legend()

## 2. Overfitting by cross validation

In [None]:
import pandas as pd 
# Load the red-wine quality CSV. The relative path looks like a Kaggle input
# directory — adjust it when running elsewhere.
df = pd.read_csv("../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# List the distinct values that occur in the 'quality' column.
df['quality'].unique()

In [None]:
# Map the raw quality scores 3..8 onto consecutive class labels 0..5.
quality_mapping = {score: score - 3 for score in range(3, 9)}

In [None]:
# Replace each raw quality score with its class label from quality_mapping.
df.loc[:, "quality"] = df.quality.map(quality_mapping)

In [None]:
# Preview the frame again after remapping the 'quality' column.
df.head()

In [None]:
# Horizontal bar chart of how many rows fall into each quality class.
df.quality.value_counts().plot.barh()
plt.legend()

### Splitting the dataframe into training and validation sets

In [None]:
# Shuffle all rows (frac=1) and rebuild a clean 0..n-1 index.
# No random_state is set, so the split differs from run to run.
df = df.sample(frac=1).reset_index(drop=True)

# The first 1000 shuffled rows are used for training and every remaining row
# for testing. Slicing from position 1000 (instead of a hard-coded tail(599))
# keeps the two sets disjoint and exhaustive for any number of rows.
df_train = df.iloc[:1000]
df_test = df.iloc[1000:]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [None]:
def accuracy_by_depth(train_df, test_df, depth_params):
    """Fit a decision tree of the given depth and report its accuracy.

    The tree is trained on ``train_df`` and scored on both ``train_df`` and
    ``test_df``. The scores are printed and also returned.

    Args:
        train_df: Data frame used to fit the tree. It must contain a
            'quality' column (the target); every other column is used as a
            feature.
        test_df: Held-out data frame with the same columns as ``train_df``.
        depth_params: Value passed to ``DecisionTreeClassifier`` as
            ``max_depth``.

    Returns:
        A ``(train_accuracy, test_accuracy)`` tuple.
    """
    # Use the frames passed in by the caller, not notebook-level globals, so
    # the function evaluates exactly the data it is given.
    feature_cols = [col for col in train_df.columns if col != 'quality']

    # Fit the classifier on the training features.
    clf = DecisionTreeClassifier(max_depth=depth_params)
    clf.fit(train_df[feature_cols], train_df['quality'])

    # Predict on both splits.
    train_preds = clf.predict(train_df[feature_cols])
    test_preds = clf.predict(test_df[feature_cols])

    # Compare predictions with the true labels.
    train_acc = metrics.accuracy_score(train_df['quality'], train_preds)
    test_acc = metrics.accuracy_score(test_df['quality'], test_preds)

    print("Decision Tree depth: ", depth_params, "\nTrain Accuracy: ", train_acc, '\nTest Accuracy: ', test_acc)

    return train_acc, test_acc

In [None]:
# Evaluate a single shallow tree (max_depth=3) on the train/test split.
acc_train, acc_test = accuracy_by_depth(train_df=df_train, test_df=df_test, depth_params=3)
# Alternative run with a deeper tree, kept for manual comparison:
# accuracy_by_depth(train_df=df_train, test_df=df_test, depth_params=7)

In [None]:
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Set the tick-label font size to 20 on both axes for all subsequent figures.
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)

In [None]:
# Index 0 holds a placeholder of 0.50 so that list position i lines up with
# max_depth i when the lists are later plotted against their index.
# NOTE(review): 0.50 is not a measured accuracy — confirm this is intended.
train_accuracies = [.50]
test_accuracies = [.50]
# Fit and score one tree for every max_depth from 1 through 24.
for depth in range(1,25):
    acc_train, acc_test= accuracy_by_depth(train_df=df_train, test_df=df_test, depth_params=depth)
    train_accuracies.append(acc_train)
    test_accuracies.append(acc_test)

In [None]:
plt.figure(figsize=(10,5))
sns.set_style("whitegrid")
# Draw both accuracy curves against list index, which serves as the
# max-depth axis.
plt.plot(train_accuracies, label = 'train accuracy')
plt.plot(test_accuracies, label = 'test accuracy')
plt.legend(loc = 'upper left', prop = {'size': 15})

# Place an x tick at every fifth depth value.
plt.xticks(range(0, 25, 5))
# Axis labels

plt.xlabel("max depth", size = 20)
plt.ylabel('accuracy', size = 20)
plt.show()