#Significance of scratch

Scratch here means to create a class/function of the same function as an advanced library such as scikit-learn by combining basic libraries such as NumPy.


It is difficult to grasp just by moving a library such as scikit-learn by scratching, and we aim for a deep understanding of the algorithm. It can also improve your coding skills, but that is not the main purpose.


We are aiming for the following effects.


* Make it easier to understand theory and mathematical formulas when encountering new methods
* Reduce ambiguity in using libraries
* Make existing implementations easier to read

#[Problem 1] scratch on train_test_split

In [4]:
def scratch_train_test_split(X, y, train_size=0.8):
    arrays = [X,y]
    random_state=None
    test_size=None
    length = len(arrays[0])
    if random_state:
        np.random.seed(random_state)
    p = np.random.permutation(length)

    if type(test_size) == int:
        index = length - test_size
    elif type(test_size) == float:
        index = length - np.ceil(length * test_size)
    else:
        if type(train_size) == int:
            index = train_size
        elif type(train_size) == float:
            index = int(length * train_size)
        else:
            index = length - np.ceil(length * 0.25)

    X_train, X_test, y_train, y_test = [b for a in arrays for b in (a[p][:index], a[p][index:])]
    pass
    return X_train, X_test, y_train, y_test

In [5]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

iris = load_iris()

df = pd.DataFrame(data=iris.data,columns=iris.feature_names)
df['target'] = iris.target

# filtering the df to get two target variables we need
df_filtered = df[df['target'] > 0] 

y = df_filtered['target']
X = df_filtered.drop(['target'],axis=1)

X = X.to_numpy()
y = y.to_numpy()

X_train_1, X_test_1, y_train_1, y_test_1 = scratch_train_test_split(X, y)

In [6]:
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))
X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

X_train_2, X_test_2, y_train_2, y_test_2 = scratch_train_test_split(X, y)

In [7]:
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

X_train_3, X_test_3, y_train_3, y_test_3 = scratch_train_test_split(X, y)

# Classification problem

Classification scratches three methods.


* Logistic regression
* SVM
* Decision tree

# [Problem 2] Creating a code to solve the classification problem

In [8]:
from sklearn.metrics import accuracy_score

# import classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# creating instances
clf_SDG = make_pipeline(StandardScaler(),
                     SGDClassifier(max_iter=1000, tol=1e-3,loss="log"))
clf_SVM = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf_DT = make_pipeline(StandardScaler(),DecisionTreeClassifier(random_state=0))

# for first dataset
clf_SDG.fit(X_train_1, y_train_1)
pred_1_SDG = clf_SDG.predict(X_test_1)
clf_SVM.fit(X_train_1, y_train_1)
pred_1_SVM = clf_SVM.predict(X_test_1)
clf_DT.fit(X_train_1, y_train_1)
pred_1_DT = clf_DT.predict(X_test_1)

print("First dataset, Accuracy_SDG: ", accuracy_score(y_test_1,pred_1_SDG))
print("First dataset, Accuracy_SVM: ", accuracy_score(y_test_1,pred_1_SVM))
print("First dataset, Accuracy_DT: ", accuracy_score(y_test_1,pred_1_DT))

# for second dataset
clf_SDG.fit(X_train_2, y_train_2)
pred_2_SDG = clf_SDG.predict(X_test_2)
clf_SVM.fit(X_train_2, y_train_2)
pred_2_SVM = clf_SVM.predict(X_test_2)
clf_DT.fit(X_train_2, y_train_2)
pred_2_DT = clf_DT.predict(X_test_2)

print("-------------------")
print("Second dataset, Accuracy_SDG: ", accuracy_score(y_test_2,pred_2_SDG))
print("Second dataset, Accuracy_SVM: ", accuracy_score(y_test_2,pred_2_SVM))
print("Second dataset, Accuracy_DT: ", accuracy_score(y_test_2,pred_2_DT))

# for third dataset
clf_SDG.fit(X_train_3, y_train_3)
pred_3_SDG = clf_SDG.predict(X_test_3)
clf_SVM.fit(X_train_3, y_train_3)
pred_3_SVM = clf_SVM.predict(X_test_3)
clf_DT.fit(X_train_3, y_train_3)
pred_3_DT = clf_DT.predict(X_test_3)

print("-------------------")
print("third dataset, Accuracy_SDG: ", accuracy_score(y_test_3,pred_3_SDG))
print("third dataset, Accuracy_SVM: ", accuracy_score(y_test_3,pred_3_SVM))
print("third dataset, Accuracy_DT: ", accuracy_score(y_test_3,pred_3_DT))

First dataset, Accuracy_SDG:  0.95
First dataset, Accuracy_SVM:  0.95
First dataset, Accuracy_DT:  0.95
-------------------
Second dataset, Accuracy_SDG:  1.0
Second dataset, Accuracy_SVM:  1.0
Second dataset, Accuracy_DT:  1.0
-------------------
third dataset, Accuracy_SDG:  0.5
third dataset, Accuracy_SVM:  0.75
third dataset, Accuracy_DT:  0.5


In [10]:
# importing the dataset
df_2 = pd.read_csv('train.csv')

# select columns with numbers
altered_df2 = df_2.select_dtypes("number")

# fill empty cells
altered_df2 = altered_df2.fillna(0)

# check if there are any remaining empty data
altered_df2.isnull().sum()

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

#[Problem 3] Creating a code to solve the regression problem

In [11]:
X = altered_df2[['GrLivArea','YearBuilt']]
y = altered_df2['SalePrice']

X = X.to_numpy()
y = y.to_numpy()

X_train_4, X_test_4, y_train_4, y_test_4 = scratch_train_test_split(X, y)

# creating an instance without the loss=log parameter
clf_SDG = make_pipeline(StandardScaler(),
                     SGDClassifier(max_iter=1000, tol=1e-3))

clf_SDG.fit(X_train_4, y_train_4)
pred_4_SDG = clf_SDG.predict(X_test_4)

print(pred_4_SDG)

[165000 165000 171000 171000 205000 171000 165000 129000 207500 165000
 171000 165000 165000 127500 171000 129000 129000 171000 171000 127500
 147000 165000 171000 171000 207500 165000 171000 171000 165000 165000
 165000 171000 205000 127500 109900 171000 171000 127500 147000 127500
 171000 129000 171000 165000 171000 165000 129000 127500 165000 171000
 171000 165000 129000 171000 205000 129000 129000 127500 127500 171000
 205000 171000  85000 127500 207500 171000 147000 165000 165000 171000
 171000 207500 129000 127500 171000 147000 171000 165000 165000 205000
 171000 171000 317000 165000 171000 127500 171000 165000 205000 171000
 171000 171000 220000 171000 165000 129000 127500 127500 205000 129000
 171000 165000 171000 129000 165000 317000 127500 127500 147000 205000
 165000 129000 207500 129000 171000 127500 171000 171000 127500 129000
 171000 165000 205000 127500 171000 165000 171000 129000 127500 165000
 127500 129000 165000 171000 165000 127500 205000 171000 207500 205000
 10990