In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the real-estate dataset and preview its first five rows.
data = pd.read_csv('../input/real-estate-dataset/Real estate.csv')
data.head(n=5)

In [None]:
# List the column names so the feature/target names used below can be checked.
data.columns

In [None]:
# Predictor columns and regression target for the real-estate data.
features = [
    'house age',
    'distance to the nearest MRT station',
    'number of convenience stores',
    'latitude',
    'longitude',
]
X = data[features]
y = data['house price of unit area']

# Hold out a validation set; the fixed seed makes the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

## LINEAR REGRESSION

In [None]:
# Ordinary least-squares regressor with scikit-learn's default settings.
model1 = LinearRegression()

In [None]:
# Estimate the coefficients from the training portion only.
model1.fit(X=train_X, y=train_y)

In [None]:
# Predict prices for the held-out validation rows and display them.
val_predictions1 = model1.predict(X=val_X)
val_predictions1

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Label and value are printed on separate lines.
print(
    "Mean absolute error: ",
    mean_absolute_error(val_y, val_predictions1),
    sep="\n",
)

In [None]:
# Show the fitted parameters of the linear model, each under its own heading.
# NOTE: "%2f" is a minimum field width of 2 with the default six decimals
# (it is not the two-decimal format "%.2f").
print("Coefficients", model1.coef_, sep="\n")
print("Intercept", "%2f" % model1.intercept_, sep="\n")

INTERPRETATION -
In linear regression,


The hypothesis function h(theta)=
theta_0 + theta_1 * x1 + theta_2 * x2 +...
where theta 1 , theta 2 are coefficients and x1,x2 are features.

Here, h = -433.789415 -2.39157343e-01(house age) -4.73776424e-03 (distance to the nearest MRT station)+ 1.09332205e+00(number of convenience stores) + 2.24000967e+02(latitude) -4.20969538e+01(longitude) 

The above function will fit the data given (not perfectly since mean absolute error of 6 is involved).

## DECISION TREE

In [None]:
# Read the bank-note authentication dataset (this rebinds `data`) and preview it.
data = pd.read_csv('../input/bank-note-authentication-uci-data/BankNote_Authentication.csv')
data.head(n=5)

In [None]:
# Predictor columns and class label for the bank-note data.
features = ['variance', 'skewness', 'curtosis', 'entropy']
X = data[features]
y = data['class']

# Hold out a validation set; the fixed seed makes the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [None]:
# Specify Model
model2 = DecisionTreeClassifier(random_state=1)
# Fit Model
fit=model2.fit(train_X, train_y)

In [None]:
# Predict the class of each held-out validation row with the fitted tree.
val_predictions2 = model2.predict(val_X)

In [None]:
# Label and value are printed on separate lines.
print(
    "Mean absolute error: ",
    mean_absolute_error(val_y, val_predictions2),
    sep="\n",
)

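We use the mean absolute error as a proxy for accuracy: for 0/1 class labels it equals the fraction of misclassified notes, and it is found to be almost 0 (i.e. the accuracy is close to 100%).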

In [None]:
from sklearn import tree

# Draw the fitted tree on an explicitly created, wide axes so the nodes stay readable.
fig, ax = plt.subplots(figsize=(25, 10))
tree.plot_tree(
    fit,
    feature_names=features,
    filled=True,
    fontsize=10,
    ax=ax,
)
ax.set_title("Decision Tree", size=25)

INTERPRETATION -

The colour codes (blue and brown) are for 2 classes - authentic and non authentic notes.

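The tree is grown by choosing, at every node, the split that minimises the Gini impurity of the resulting child nodes. Gini impurity measures how mixed the classes inside a node are: it is 0 when every sample in the node belongs to the same class and grows as the classes become more evenly mixed. For example, the root node (gini 0.494) is split on variance<=0.765 because that split gives the lowest impurity among all candidate splits. Once the impurity becomes 0, the group is perfectly homogeneous and there is no further splitting (the leaf nodes).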

When we need to predict the class, we will check the constraint on every node down the tree to the left and right child till we reach the gini impurity 0 ie leaf node.

## Neural Networks

We are using the Keras Sequential API to define a neural network that will be trained on this data, with an input dimension of 4 since there are 4 features. We will then have a layer of 16 units, then 8, then 6, and finally 1 (the last layer outputs a single value per sample). The final layer is activated by a sigmoid function, which pushes its output towards 1 or 0. This neural network can then be used to predict future values.

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Fully connected network: 4 inputs -> 16 -> 8 -> 6 -> 1.
# The hidden layers use ReLU; the single output unit uses a sigmoid.
classifier = Sequential([
    Dense(units=16, activation='relu', input_dim=4),
    Dense(units=8, activation='relu'),
    Dense(units=6, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

Here, we have to specify the optimizer and loss function. On each iteration, it measures how well it did in training using the loss function. It then tries to improve on that using the optimizer.

In [None]:
# Binary cross-entropy matches the single sigmoid output; RMSprop performs the updates.
classifier.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
)

We train the model with `fit`, passing it the training data -- i.e. for this set of X, this is what the Y should look like. We pass the x's and y's and specify how many times it will loop (the number of epochs), where each loop is the network making a guess at the relationship between the x and the y. It measures how well or how badly it does using the loss function, and then it improves on its guess using the optimizer.

The NN will then spot the patterns in the data, and build a neural network that could replicate that. 

In [None]:
# Train for 50 passes over the training set, updating the weights after every single sample.
classifier.fit(
    x=train_X,
    y=train_y,
    batch_size=1,
    epochs=50,
)

To predict new values, the Neural Network uses predict. We are passing the test values for X (which the Neural Network hasn't previously seen) and it will give back a set of predictions.

In [None]:
# Sigmoid outputs of the network for the held-out validation rows.
val_predictions3 = classifier.predict(val_X)

In [None]:
pip install ann_visualizer

In [None]:
# Print the layer-by-layer overview of the network.
classifier.summary()

In [None]:
# Import `plot_model` from the public `keras.utils` namespace rather than the
# internal `keras.utils.vis_utils` module, whose location is not stable across releases.
from keras.utils import plot_model

# Render the architecture diagram with layer names and tensor shapes.
plot_model(classifier, show_shapes=True, show_layer_names=True)

We calculate the error to get an idea of accuracy

In [None]:
# Mean absolute difference between the true labels and the network's sigmoid outputs.
mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions3)
print(mae)

The error is negligible. This implies that we have successfully trained and correctly predicted.

## tSNE on text

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns

In [None]:
# Project the bank-note data (loaded above) to 2-D with t-SNE.
# Only the feature columns are embedded: the label column must not influence the
# embedding, otherwise the class separation in the plot is partly an artefact.
tsne = TSNE(n_components=2, random_state=0)
data_tsne = tsne.fit_transform(data.drop(columns=['class']))

# Wrap the embedding in a DataFrame and attach the true label for colouring.
# The label column is named 'class', as in the train/validation split above.
df_tsne = pd.DataFrame(data=data_tsne, columns=['X', 'Y'])
df_tsne['label'] = data['class'].to_numpy()

In [None]:
# lmplot creates its own figure (a FacetGrid), so no separate plt.figure() call is made;
# a preceding plt.figure() would only leave an empty extra figure behind.
# `height` is the current name of the keyword that older seaborn releases called `size`.
g = sns.lmplot(x='X', y='Y', data=df_tsne, fit_reg=False, hue='label', height=6)
plt.title("tSNE plot", size=25)

INTERPRETATION -
We have 2 labels 0 and 1 denoting non authentic and authentic banknotes.
The data was 4 dimensional which was converted to 2D by using tSNE. 

Now, the above plot uses these 2 dimensions of the tSNE dataframe. The 2 categories are colour coded using the class column of the data. Thus, it is observed that records of the same category are clustered together (except for a few outliers).


## tSNE on image data (scikit-learn handwritten digits, an MNIST-like dataset)

In [None]:
from sklearn.datasets import load_digits

Steps followed:
1. Loading data
2. Taking the first 1000 values for convenience(time consuming process)
3. Applying tSNE and fitting the model
4. Converting the fit into dataframe for plotting
5. Plotting the model

In [None]:
# `df` holds the object returned by load_digits(); keep only the first 1000 samples
# of its `data` array so that t-SNE runs quickly.
df = load_digits()
red_df = df['data'][:1000]

### Parameters of tsne
n_components - int, optional (default: 2)
Dimension of the embedded space.

random_state - int, RandomState instance, default=None
Determines the random number generator. Pass an int for reproducible results across multiple function calls. 

perplexity - float, optional (default: 30)
The perplexity is related to the number of nearest neighbors(expected density). 
Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. 
Different values can result in significantly different results.

learning_rate - float, optional (default: 200.0)
The learning rate for t-SNE is usually in the range [10.0, 1000.0]. 
If the learning rate is too high, the data may look like a ‘ball’ with any point approximately equidistant from its nearest neighbours. 
If the learning rate is too low, most points may look compressed in a dense cloud with few outliers. 

n_iter - int, optional (default: 1000)
Maximum number of iterations for the optimization. Should be at least 250.

In [None]:
# Two t-SNE configurations for the same digits subset. They get their own names so the
# fitted regression (`model1`) and decision-tree (`model2`) models from the earlier
# sections are not overwritten when this cell runs.
tsne_model1 = TSNE(n_components=2, random_state=0, perplexity=30, learning_rate=200, n_iter=1000)
tsne_model2 = TSNE(n_components=2, random_state=0, perplexity=50, learning_rate=200, n_iter=2000)

# fit_transform(self, X[, y])
# Fit X into an embedded space and return that transformed output.
tsne1 = tsne_model1.fit_transform(red_df)
tsne2 = tsne_model2.fit_transform(red_df)
print(tsne1)
print(tsne2)

In [None]:
# First embedding as a DataFrame, with the true digit of each sample attached.
df_tsne1 = pd.DataFrame(tsne1, columns=["X", "Y"]).assign(label=df.target[:1000])
df_tsne1

In [None]:
# Second embedding as a DataFrame, with the true digit of each sample attached.
df_tsne2 = pd.DataFrame(tsne2, columns=["X", "Y"]).assign(label=df.target[:1000])
df_tsne2

In [None]:
# Scatter of the first embedding, coloured by digit.
# `height` is the current name of the keyword that older seaborn releases called `size`.
g = sns.lmplot(x='X', y='Y', data=df_tsne1, fit_reg=False, hue='label', height=6)
plt.title("tSNE plot 1", size=25)

The various colours denote the various digits like 0,1,2,3,4...etc.
Points corresponding to same digit are clustered together.

We have applied model 2 below with increased perplexity and number of iterations to see whether the separation between the clusters improves.

In [None]:
# Scatter of the second embedding, coloured by digit.
# `height` is the current name of the keyword that older seaborn releases called `size`.
g = sns.lmplot(x='X', y='Y', data=df_tsne2, fit_reg=False, hue='label', height=6)
plt.title("tSNE plot 2", size=25)

We observe that the images corresponding to the different digits are separated into different clusters of points. There is very little overlap, and where it occurs it is because some digits look similar. For example, all the blue (dark) points and pink points that represent 0 and 6 respectively are well separated, whereas there are some outliers of 9 and 1.

## ERROR METRICS

When evaluating a clustering algorithm we have 2 cases -
1. when we know the actual class variables, we can use homogeneity,completeness and v_measure score
2. If we don't know ground truth labels, we need to use silhouette score.

### SILHOUETTE SCORE
Returns the mean of all Silhouette coefficients.


Coefficient of a sample s= (b - a)/max(a,b) where,


a is the average distance between s and all the other data points in the cluster to which s belongs

b is the minimum average distance from s to all clusters to which it does not belong

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score, v_measure_score

CASE 1

In [None]:
# Build the clustering input WITHOUT the label column.
# DataFrame.drop returns a new frame, so its result has to be assigned; otherwise
# k-means and the silhouette score below would still see the label.
# `data` itself keeps the label column, which the scoring cells read.
# The label column is named 'class', as in the train/validation split above.
df = data.drop(columns=['class'])
df

In [None]:
# Two clusters, one per bank-note class; fixed seed for repeatable centroids.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans = kmeans.fit(df)

In [None]:
#kmeans.labels_

In [None]:
# Homogeneity of the k-means clusters with respect to the true labels.
# The label column is named 'class', as in the train/validation split above.
h = homogeneity_score(data['class'], kmeans.labels_)
h

In [None]:
# Completeness of the k-means clusters with respect to the true labels.
# The label column is named 'class', as in the train/validation split above.
c = completeness_score(data['class'], kmeans.labels_)
c

In [None]:
# V-measure as reported by scikit-learn.
# The label column is named 'class', as in the train/validation split above.
print(v_measure_score(data['class'], kmeans.labels_))
# Cross-check: harmonic mean of completeness and homogeneity.
2/((1/h)+(1/c))

In [None]:
# Mean silhouette coefficient of the k-means assignment on `df`.
silhouette_score(df, kmeans.labels_)

The above scores indicate that the clustering is very poor, probably because k-means implicitly assumes roughly spherical clusters of similar variance and similar size, which the two bank-note classes need not satisfy.

In [None]:
# Number of observations per true class.
# The label column is named 'class', as in the train/validation split above.
data['class'].value_counts()

CASE 2

In [None]:
# Read the wholesale-customers dataset (no ground-truth labels are used here) and preview it.
df2 = pd.read_csv('../input/wholesale-customers-data/Wholesale customers data.csv')
df2.head(n=5)

In [None]:
# Two-cluster k-means on the wholesale data; fixed seed for repeatable centroids.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans = kmeans.fit(df2)

In [None]:
# Mean silhouette coefficient of the two-cluster assignment on the wholesale data.
silhouette_score(df2, kmeans.labels_)

From silhouette score, it can be said that the clustering performance is moderate.

The number of clusters is taken as 2 previously. Using the Silhouette score for different no. of clusters we can check the appropriate k to be used.

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Base estimator; the visualizer refits it for each candidate number of clusters.
model = KMeans(random_state=0)

# Call the KElbowVisualizer with the silhouette metric
viz = KElbowVisualizer(model, k=(2,6), metric='silhouette', timings=False)

# Fit the data and visualize.
# show() replaces the deprecated poof() method.
viz.fit(df2)
viz.show()

Clearly k should be 2 as it has the highest silhouette score.