# Linear Algebra

- Representing vectors and matrices in Python
- Transpose, dot product, determinant, inverse

In [1]:
import numpy as np

In [2]:
# A 1-D NumPy array used as an n-dimensional vector
vector = np.array([2, 3, 5, 7, 10])
print("Vector :\t", vector)

# Transpose of the vector. A 1-D array has a single axis, so the transposed
# result prints exactly like the original (see the output below).
vector_T = vector.T
print("Vector Transpose :\t", vector_T)

Vector :	 [ 2  3  5  7 10]
Vector Transpose :	 [ 2  3  5  7 10]


In [3]:
# 3x3 matrix, stored as a plain 2-D ndarray.
# NumPy no longer recommends the np.matrix class (regular arrays are preferred);
# a 2-D array prints and transposes exactly the same way here.
M = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("Matrix :\n",M)

# 3x3 matrix transpose: rows of M become the columns of M_T
M_T = M.transpose()
print("Matrix Transpose :\n",M_T)

Matrix :
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
Matrix Transpose :
 [[1 4 7]
 [2 5 8]
 [3 6 9]]


## Vector dot product

<img src="dot_product.png">

In [4]:
# Two vectors of equal length
x = np.array([2, 4, 5, 8])
y = np.array([1, 0, -5, 0.5])

# Dot product: the sum of the element-wise products, i.e. a single scalar
x_dot_y = x.dot(y)

print("x_dot_y: \t", x_dot_y)
print(type(x_dot_y))

x_dot_y: 	 -19.0
<class 'numpy.float64'>


## Matrix product

<img src="matrix_product.png">

In [5]:
# A has 3 columns and B has 3 rows, so the matrix product AB is defined
A = np.matrix([
    [1, 2, 3],
    [4, 5, 6],
])
B = np.matrix([
    [1, 0, -1, -1],
    [0, 0, 0, 0],
    [1, 99, 9, 3],
])

print("A :\t", A.shape)
print("B :\t", B.shape)

A :	 (2, 3)
B :	 (3, 4)


In [6]:
# Matrix product of A and B computed with np.matmul
AB = np.matmul(A, B)

# Show the product, its Python type and its shape
print(f"AB: \n {AB}")
print(f"\ntype:\t {type(AB)}")
print(f"\nAB :\t {AB.shape}")

AB: 
 [[  4 297  26   8]
 [ 10 594  50  14]]

type:	 <class 'numpy.matrixlib.defmatrix.matrix'>

AB :	 (2, 4)


In [7]:
# Same matrix product, this time computed with np.dot
AB = np.dot(A, B)

# Show the product, its Python type and its shape
print(f"AB: \n {AB}")
print(f"\ntype:\t {type(AB)}")
print(f"\nAB :\t {AB.shape}")

AB: 
 [[  4 297  26   8]
 [ 10 594  50  14]]

type:	 <class 'numpy.matrixlib.defmatrix.matrix'>

AB :	 (2, 4)


In [8]:
# Element-wise product: every entry of A is multiplied by the entry of B
# at the same position (this is not the matrix product).
A = np.matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
B = A.copy()  # B holds the same values as A

AB = np.multiply(A, B)

print(f"AB: \n {AB}")
print(f"\ntype:\t {type(AB)}")
print(f"\nAB :\t {AB.shape}")

AB: 
 [[ 1  4  9]
 [16 25 36]
 [49 64 81]]

type:	 <class 'numpy.matrixlib.defmatrix.matrix'>

AB :	 (3, 3)


## Determinant


In [9]:
# 3x3 matrix whose determinant is computed in the following cells
M = np.array([
    [6, 1, 1],
    [4, -2, 5],
    [2, 8, 7],
])
print("Matrix :\n", M)


Matrix :
 [[ 6  1  1]
 [ 4 -2  5]
 [ 2  8  7]]


In [10]:
# Determinant by hand: cofactor expansion along the first row [6, 1, 1].
# Each first-row entry multiplies the determinant of its 2x2 minor,
# with alternating signs (+, -, +).
minor_11 = (-2) * 7 - 5 * 8
minor_12 = 4 * 7 - 5 * 2
minor_13 = 4 * 8 - (-2) * 2
print(6 * minor_11 - 1 * minor_12 + 1 * minor_13)

-306


In [11]:
# The same determinant computed by NumPy (returned as a float)
det_M = np.linalg.det(M)
det_M

-306.0

## Inverse of a Matrix

In [12]:
# 3x3 matrix that is inverted in the following cells
M = np.array([
    [6, 1, 1],
    [4, -2, 5],
    [2, 8, 7],
])
print("Matrix :\n", M)

Matrix :
 [[ 6  1  1]
 [ 4 -2  5]
 [ 2  8  7]]


In [13]:
# Inverse of M, computed by NumPy
M_inverse = np.linalg.inv(M)
print(f"M_inverse :\n {M_inverse}")

M_inverse :
 [[ 0.17647059 -0.00326797 -0.02287582]
 [ 0.05882353 -0.13071895  0.08496732]
 [-0.11764706  0.1503268   0.05228758]]


### Is *M_inverse* correct?

### If '*M_dot_M_inverse*' is the identity matrix (up to floating-point rounding error), then we can conclude that '*M_inverse*' is correct.

In [14]:
# Multiply M by its computed inverse; the result should be the identity matrix.
M_dot_M_inverse = np.dot(M, M_inverse)
print("M_dot_M_inverse :\n",M_dot_M_inverse)

# Floating-point rounding leaves tiny non-zero off-diagonal entries, so the
# product is compared with the identity using a tolerance (np.allclose)
# rather than by inspecting the printed matrix by eye.
print("Is identity (within tolerance) :\t", np.allclose(M_dot_M_inverse, np.eye(M.shape[0])))

M_dot_M_inverse :
 [[1.00000000e+00 0.00000000e+00 6.93889390e-18]
 [2.77555756e-17 1.00000000e+00 4.85722573e-17]
 [8.32667268e-17 1.11022302e-16 1.00000000e+00]]


### Try it yourself

- Find the vector matrix product of A and M (where A is a vector and M is a matrix).
  where,
         A = [1,-1,0]
         M = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
         
- Find the matrix product of 3 matrices A, B and C (i.e. A_dot_B_dot_C).
  where, 
         A = [[1, 2]]
         B = [[2,3,4],[5,6,7]]
         C = [[-1, 1, -1, 1], [0, 0, 0, 0], [1, 1, 1, 1]]
         


In [54]:
# Find the vector matrix product of A and M (where A is a vector and M is a matrix)

# A is written as a 1x3 row vector (a 2-D array with one row) so the shapes
# printed below stay (1, 3) and (3, 3). Plain ndarrays are used instead of
# np.matrix, which NumPy no longer recommends.
A = np.array([[1, -1, 0]])
M = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Row vector times matrix: the result is again a 1x3 row vector
AM = np.dot(A,M)

print("A shape :\t",A.shape)
print("M shape :\t",M.shape)
print("AM shape :\t",AM.shape)
print("AM :\t",AM)

A shape :	 (1, 3)
M shape :	 (3, 3)
AM shape :	 (1, 3)
AM :	 [[-3 -3 -3]]


In [56]:
# Find the matrix product of 3 matrices A, B and C (i.e. A_dot_B_dot_C)

# Plain 2-D ndarrays are used instead of np.matrix, which NumPy no longer
# recommends. A is the 1x2 matrix [[1, 2]] given in the exercise statement.
A = np.array([[1, 2]])
B = np.array([[2,3,4],[5,6,7]])
C = np.array([[-1, 1, -1, 1], [0, 0, 0, 0], [1, 1, 1, 1]])

# Multiply left to right: first A.B, then (A.B).C
A_dot_B = np.dot(A,B)
A_dot_B_dot_C = np.dot(A_dot_B,C)

print("A shape :\t",A.shape)
print("B shape :\t",B.shape)
print("C shape :\t",C.shape)
print("A_dot_B shape :\t",A_dot_B.shape)
print("A_dot_B_dot_C shape :\t",A_dot_B_dot_C.shape)
print("A_dot_B :\t",A_dot_B)
print("A_dot_B_dot_C :\t",A_dot_B_dot_C)

A shape :	 (1, 2)
B shape :	 (2, 3)
C shape :	 (3, 4)
A_dot_B shape :	 (1, 3)
A_dot_B_dot_C shape :	 (1, 4)
A_dot_B :	 [[12 15 18]]
A_dot_B_dot_C :	 [[ 6 30  6 30]]


# Generative Models 1

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Read the wine data and separate the target column from the features
data = pd.read_csv('wine_original.csv')
labels = data.pop('class')  # removes 'class' from `data` and returns that column

# Hold out 20% of the rows as a test set; the seed is fixed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=10,
)

In [4]:
# Size of the training split, followed by a preview of its first five rows
print(X_train.shape)
X_train.head(5)

(142, 13)


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
95,12.47,1.52,2.2,19.0,162,2.5,2.27,0.32,3.28,2.6,1.16,2.63,937
91,12.0,1.51,2.42,22.0,86,1.45,1.25,0.5,1.63,3.6,1.05,2.65,450
24,13.5,1.81,2.61,20.0,96,2.53,2.61,0.28,1.66,3.52,1.12,3.82,845
109,11.61,1.35,2.7,20.0,94,2.74,2.92,0.29,2.49,2.65,0.96,3.26,680
121,11.56,2.05,3.23,28.5,119,3.18,5.08,0.47,1.87,6.0,0.93,3.69,465


In [5]:
# Number of training samples per class
class_counts = y_train.value_counts()
class_counts

2    53
1    49
3    40
Name: class, dtype: int64

In [6]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Create a Gaussian Naive Bayes classifier and fit it on the training split
gnb = GaussianNB().fit(X_train, y_train)

# Predict on the held-out data, and on the training data to see how well we fit
y_pred = gnb.predict(X_test)
y_train_pred = gnb.predict(X_train)

# Accuracy = number of correct predictions / number of samples
train_accuracy = np.sum(y_train_pred == y_train) / len(y_train)
test_accuracy = np.sum(y_pred == y_test) / len(y_test)
print('Training accuracy = ' + str(train_accuracy))
print('Test accuracy = ' + str(test_accuracy))

Training accuracy = 0.9859154929577465
Test accuracy = 0.8888888888888888


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate values for MultinomialNB's additive smoothing parameter `alpha`
parameters = { 'alpha' : [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 4, 5, 10] }
mnb = MultinomialNB()

# Cross-validated grid search over `alpha`, fitted on the training split only;
# the test split is used solely for the final accuracy below.
clf = GridSearchCV(mnb, parameters, verbose=True, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# true labels first. The accuracy value itself does not depend on the order.
accuracy = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, clf.predict(X_train))
print ('Test accuracy = ' + str(accuracy))
print ('Train accuracy = ' + str(train_acc))
# Smoothing value selected by the grid search
print (clf.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Test accuracy = 0.8333333333333334
Train accuracy = 0.852112676056338
{'alpha': 0.001}


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.2s finished


# How to create a classification problem dataset

In [51]:
from sklearn.datasets import make_classification

# Synthetic 3-class problem: 100 samples and 20 features, 3 of them informative.
# random_state fixes the generated data so that re-running the notebook
# reproduces the same X, y and therefore the same downstream outputs.
X, y = make_classification(n_samples=100, n_features=20, n_classes=3, n_informative=3, random_state=0)

In [52]:
# (number of samples, number of features) of the generated feature matrix
np.shape(X)

(100, 20)

In [53]:
# First five rows of the feature matrix, wrapped in a DataFrame for display
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.651974,0.85283,2.217686,0.396525,-0.446188,0.013384,-1.136327,0.319694,-0.914755,0.311523,-0.046454,-0.170417,0.767824,-0.00593,-0.014942,-1.35543,-0.812687,-0.732592,-0.112316,0.245824
1,-1.369365,-0.830684,-1.176038,-0.629892,0.192238,2.165497,1.591953,-0.118838,-0.742129,0.501616,1.13834,0.975204,-0.132918,1.170067,-0.644785,0.751794,0.299732,1.200675,-2.353202,-0.191087
2,-0.335313,0.988168,-0.056683,-0.146777,-1.115458,-0.41546,-1.008233,-1.415787,-0.491484,-0.115318,-1.735396,-0.722929,0.310783,-0.052385,0.739341,1.137149,2.457265,1.97711,-0.6349,-0.704078
3,-0.870118,1.916596,-0.065091,0.697129,1.684563,-0.130707,1.982868,1.182492,-2.678001,1.057621,-0.52989,1.227657,-1.760857,-0.821769,0.354259,-0.352379,0.386517,0.764791,-1.23055,0.164876
4,-1.185143,1.626507,-1.416213,0.722189,-0.179357,-1.022353,0.621295,0.958578,-1.180366,1.189275,-0.145039,0.559647,1.667523,-1.188217,-0.813369,0.65461,-0.139804,0.38278,1.671296,-0.038772


In [54]:
# First five class labels, wrapped in a DataFrame for display
pd.DataFrame(y).head()

Unnamed: 0,0
0,0
1,2
2,2
3,1
4,2


# Try it yourself!

1. Create a classification problem with 3 classes, 15 features and 5000 rows
2. Take the last 1000 rows to be the test set
3. Run Gaussian Naive Bayes on this problem and report the test accuracy
4. Calculate the class prior probabilities for each class in the training data (first 4k rows)
5. Calculate the predicted probability of each class for every sample in the test set

References:
    http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
    http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

In [116]:
# 1. Create a classification problem with 3 classes, 15 features and 5000 rows

from sklearn.datasets import make_classification

# random_state fixes the generated data so the train/test split, the accuracy
# and the probabilities computed below are reproducible between runs.
X, y = make_classification(n_samples=5000, n_features=15, n_classes=3, n_informative=3, random_state=0)

In [117]:
# Preview the first five rows of X as a DataFrame
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-0.479888,-0.600105,0.303918,1.535755,0.34014,-1.988212,-0.027653,-0.894987,-1.587801,2.467105,-1.136715,-2.083581,-0.3804,1.974672,-0.697588
1,0.633348,0.96229,0.814473,-0.003243,0.929666,-0.744475,-1.337174,-0.366015,1.386391,-0.227363,0.760677,-0.936752,-0.685653,-0.756152,-1.657604
2,0.933183,-0.612267,-1.034197,0.36937,-1.411523,-0.324664,-0.064488,-0.172099,-2.458331,-1.098262,1.258992,0.61109,-0.705192,-1.617141,-0.264329
3,-3.441939,-0.951017,-1.04875,1.443221,-1.250332,0.164343,1.893598,-0.28017,-1.193462,-0.933628,-3.030818,0.813125,4.508485,3.222326,0.472896
4,0.385174,-0.315077,-1.458283,-0.506321,0.473403,-0.748647,-2.977722,0.326998,1.128224,0.548446,-0.833508,-1.124286,-0.551719,-0.902384,-1.05502


In [118]:
# Display the label vector y; as the last expression of the cell it is shown
# as the cell's output (the recorded output is an abbreviated array repr).

y

array([2, 2, 2, ..., 1, 2, 2])

In [119]:
# Shapes of the feature matrix X and the label vector y, one per line
print(X.shape, y.shape, sep="\n")

(5000, 15)
(5000,)


In [120]:
# 2. Hold out the last 1000 rows as the test set; everything before them
#    (the first 4k rows) is the train set. Features and labels are sliced
#    identically so they stay aligned.
X_train, X_test = X[:-1000], X[-1000:]
y_train, y_test = y[:-1000], y[-1000:]

In [121]:
# 3. Run Gaussian naive bayes on this problem and report test accuracy

from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Build the classifier and fit it on the training rows
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predictions for the training rows (quality of fit) and for the held-out rows
y_train_pred = gnb.predict(X_train)
y_pred = gnb.predict(X_test)

# Accuracy: correct predictions divided by the number of samples
n_train_correct = np.sum(y_train_pred == y_train)
n_test_correct = np.sum(y_pred == y_test)
print('Training accuracy = ' + str(n_train_correct / len(y_train)))
print('Test accuracy = ' + str(n_test_correct / len(y_test)))

Training accuracy = 0.845
Test accuracy = 0.833


In [126]:
# 4. Calculate class prior probabilities for each class in training data (first 4k rows)

# Read the per-class prior stored on the fitted classifier `gnb`
# (one value per class; the recorded output has three entries).
prob_prior_train = gnb.class_prior_
# Last expression of the cell: displayed as the cell output
prob_prior_train

array([0.3305 , 0.33175, 0.33775])

In [127]:
# 5. Calculate the probability of the samples for each class in the test set

# Class-membership probabilities predicted by `gnb` for every test row
# (the recorded output shows one row per sample and one column per class).
prob_test = gnb.predict_proba(X_test)
# Last expression of the cell: displayed as the cell output
prob_test

array([[8.72482662e-01, 9.57308927e-02, 3.17864455e-02],
       [2.55717046e-01, 7.14314246e-01, 2.99687078e-02],
       [5.09309066e-08, 9.99996257e-01, 3.69179916e-06],
       ...,
       [9.67700599e-03, 9.89714274e-01, 6.08719607e-04],
       [3.01023719e-02, 8.31934215e-03, 9.61578286e-01],
       [1.96811484e-02, 5.01668944e-02, 9.30151957e-01]])

In [128]:
# Dimensions of the probability array for the test set
np.shape(prob_test)

(1000, 3)