### Algebra

In [62]:
# Algebra - reunion of broken parts

# Scalar & Vector
# A vector quantity has a direction and a magnitude, while a scalar has only a magnitude.

# A row vector is a 1xn matrix, as it has 1 row and some number of columns. 
# A column vector is simply a vector whose components are listed vertically in a single column.

In [1]:
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one
# (this is why the cells below show one output per expression).
InteractiveShell.ast_node_interactivity='all'

In [65]:
# Ten random 3-component integer vectors with values 0-9 (the upper bound is exclusive).
# A set of vectors stacked row by row is a matrix. No seed is set, so the values
# differ on every run.
data = np.random.randint(low=0, high=10, size=(10, 3))
data

array([[2, 1, 9],
       [5, 1, 5],
       [2, 4, 3],
       [4, 0, 4],
       [0, 7, 5],
       [0, 4, 1],
       [5, 4, 6],
       [7, 2, 0],
       [1, 9, 8],
       [5, 1, 5]])

In [66]:
# Mathematical operations work on a matrix and on a single vector alike: the scalar
# is applied to every element. `+` is the operator form of np.add.
data + 2
data[0] + 2

array([[ 4,  3, 11],
       [ 7,  3,  7],
       [ 4,  6,  5],
       [ 6,  2,  6],
       [ 2,  9,  7],
       [ 2,  6,  3],
       [ 7,  6,  8],
       [ 9,  4,  2],
       [ 3, 11, 10],
       [ 7,  3,  7]])

array([ 4,  3, 11])

In [67]:
# Element-wise scalar subtraction on the matrix and on its first row
# (`-` is the operator form of np.subtract).
data - 2
data[0] - 2

array([[ 0, -1,  7],
       [ 3, -1,  3],
       [ 0,  2,  1],
       [ 2, -2,  2],
       [-2,  5,  3],
       [-2,  2, -1],
       [ 3,  2,  4],
       [ 5,  0, -2],
       [-1,  7,  6],
       [ 3, -1,  3]])

array([ 0, -1,  7])

In [68]:
# Element-wise scalar multiplication on the matrix and on its first row
# (`*` is the operator form of np.multiply).
data * 2
data[0] * 2

array([[ 4,  2, 18],
       [10,  2, 10],
       [ 4,  8,  6],
       [ 8,  0,  8],
       [ 0, 14, 10],
       [ 0,  8,  2],
       [10,  8, 12],
       [14,  4,  0],
       [ 2, 18, 16],
       [10,  2, 10]])

array([ 4,  2, 18])

In [69]:
# Element-wise squaring of the matrix and of its first row
# (`**` is the operator form of np.power).
data ** 2
data[0] ** 2

array([[ 4,  1, 81],
       [25,  1, 25],
       [ 4, 16,  9],
       [16,  0, 16],
       [ 0, 49, 25],
       [ 0, 16,  1],
       [25, 16, 36],
       [49,  4,  0],
       [ 1, 81, 64],
       [25,  1, 25]], dtype=int32)

array([ 4,  1, 81], dtype=int32)

### Linear Regression

In [70]:
# A 10x5 matrix (10 rows, 5 columns) of random integers in the range 0-29.
data = np.random.randint(low=0, high=30, size=(10, 5))
data

array([[ 9, 15,  2, 29, 29],
       [11, 12,  8,  0, 13],
       [19,  2,  0,  1,  8],
       [21, 15, 27, 10, 16],
       [15,  0, 13,  9, 16],
       [19,  7, 10, 23, 25],
       [10, 16,  8, 16, 25],
       [ 6, 28, 26,  2, 18],
       [ 7, 29,  1,  7, 23],
       [15,  3, 29,  0, 22]])

In [71]:
# Treat the first four columns as the inputs x1..x4 and the fifth column as y.
# The target y is therefore column index 4, taken across every row.
y = data[:, 4]
y

array([29, 13,  8, 16, 16, 25, 25, 18, 23, 22])

In [81]:
# The inputs x1..x4 are the first four columns (indices 0-3) of every row.
x = data[:, :4]
x

array([[ 9, 15,  2, 29],
       [11, 12,  8,  0],
       [19,  2,  0,  1],
       [21, 15, 27, 10],
       [15,  0, 13,  9],
       [19,  7, 10, 23],
       [10, 16,  8, 16],
       [ 6, 28, 26,  2],
       [ 7, 29,  1,  7],
       [15,  3, 29,  0]])

#### When there is only one x variable we can represent the relation b/w x and y in a two dimensional space. 

<img src='img/linreg.jpg'/>

In [None]:
# we can draw a straight line representing majority of the data set and this is the 
# basic concept of linear regression

# when we talk about 3 dimensional space instead of straight line we can visualize a plane

<img src='img/linreg2.png'/>

### Equation for line -> Y = MX + C

In [None]:
# a line can be represented using the formula y = mx + c; if there are several inputs x1, x2, x3, ...
# then y = m1*x1 + m2*x2 + m3*x3 + ... + c

# the crux of linear regression is to find the beta coefficients, i.e. m1, m2, m3 etc ...

In [83]:
# In the example above x has shape 10x4 (10 rows, 4 inputs) and y holds the 10 targets.
# NumPy reports y as a 1-D array of shape (10,); here we think of it as a 10x1 column.
x.shape
y.shape
# Considering y = xm (ignoring "+ c" for now), what should the shape of m be?
# (10, 1) = (10, 4) . (4, 1)  -->  m should have shape (4, 1): one coefficient per input

(10, 4)

(10,)

In [None]:
# Now let's see how we figure out the value of m 
# y = xm
# The identity matrix is the matrix equivalent of the number 1: just as any number multiplied by 1 gives the
# same number, any matrix multiplied by the identity matrix gives the same matrix.
# To get the identity matrix, we can multiply any (invertible) square matrix by the inverse of that matrix.
# Given any matrix, we can get a square matrix by multiplying its transpose with it (x.T * x).
# y = xm
# x.T * y = x.T * x * m # multiply both sides by the transpose of x

# inv(x.T * x) * x.T * y = inv(x.T * x) * (x.T * x) * m # multiply with inv(x.T * x) on either sides .. 
# inv is inverse

# any invertible matrix multiplied by its inverse gives the identity matrix, which is equivalent to 1, so the
# right-hand side is just m

# inv(x.T * x) * x.T * y = m

# here star (*) denotes matrix multiplication (the dot product, `.dot` in NumPy), not element-wise multiplication

# i.e. m = inv(X.T.dot(X)).dot(X.T).dot(y)

# ## Read https://online.stat.psu.edu/stat462/node/132/


### Formula for finding coefficient --> m = inv(X.T.dot(X)).dot(X.T).dot(y)

### Let's try it for titanic data set. Let's consider 'y' as fare

In [31]:
# NOTE(review): numpy and the InteractiveShell display setting are already set up in the
# first code cell; they are repeated here, presumably so this section can run on its own.
import numpy as np
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'
# Load seaborn's 'titanic' example dataset and display it.
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [32]:
# let's consider fare as y value and all numerical columns as x variables.. 
# find beta coefficients
from preprocessing.memory import MemoryUtil
from preprocessing.nv import NVUtil
from numpy.linalg import inv

In [33]:
# Run the project's null-value (NV) treatment on the Titanic frame. The five results are
# unpacked as the excluded / selected / numerical / categorical column names and the
# treated frame, which replaces df from here on.
# NOTE(review): 30 looks like a missing-value threshold (percent?) - confirm in NVUtil.
(
    excluded_columns,
    selected_columns,
    numerical_columns,
    categorical_columns,
    df,
) = NVUtil.nv_treatment(df, 30)
numerical_columns


Before NV treatment the stats are as below
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
Before NV treatment the stats are as below
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


array(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'],
      dtype=object)

In [34]:
# Inspect the treated frame: the 'deck' column is gone and 'age' has no missing values left.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


In [35]:
# Numerical columns reported by the NV treatment; this still includes the target 'fare'.
numerical_columns

array(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'],
      dtype=object)

In [36]:
# 'fare' is the target (y), so keep every numerical column except it as an input.
is_feature = numerical_columns != 'fare'
numerical_columns = numerical_columns[is_feature]
numerical_columns

array(['survived', 'pclass', 'age', 'sibsp', 'parch'], dtype=object)

In [37]:
# Inputs: the remaining numerical columns. Target: the fare column.
x, y = df[numerical_columns], df['fare']

In [38]:
# Apply the formula m = inv(X.T.dot(X)).dot(X.T).dot(y) step by step.
x_transposed = x.T
gram = x_transposed.dot(x)                # X.T . X, a square matrix
projector = inv(gram).dot(x_transposed)   # inv(X.T . X) . X.T
m = projector.dot(y)                      # one coefficient per input column
# NOTE(review): the explicit inverse mirrors the derivation above; for real work a
# least-squares solver such as np.linalg.lstsq is usually preferred - confirm if needed.
m

array([ 26.95961925, -10.6804178 ,   1.13788122,   9.25881037,
        12.23388595])

In [None]:
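# we should apply these values of m, calculate the predicted y, and find the difference b/w the actual y value and the
# calculated y value. The sum of all such differences (usually squared first, so that positive and negative
# differences do not cancel out) is the total 'error'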