In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# import math function
from math import floor

# [Problem 1] Scratch train_test_split

In [2]:
def scratch_train_test_split(X, y, train_size=0.8):
    """Randomly split X and y into a training part and a validation part.

    Training rows are drawn without replacement from NumPy's global random
    state (``np.random``); call ``np.random.seed`` beforehand if a
    reproducible split is needed. The remaining rows form the validation
    part and keep their original relative order.

    Parameters
    ----------
    X : ndarray
      Feature data (n_samples, n_features)
    y : array-like
      Correct answer value (n_samples,)
    train_size : float
      Specify what percentage to use as a train (0 < train_size < 1)

    Returns
    -------
    X_train : ndarray
      Training data (floor(train_size * n_samples), n_features)
    X_test : ndarray
      Validation data (remaining rows, n_features)
    y_train : ndarray
      Correct answer value of training data
    y_test : ndarray
      Correct value of verification data

    Raises
    ------
    AssertionError
      If X or y is empty, if their sample counts differ, or if train_size
      is outside the open interval (0, 1).
    """
    # Convert y first so list-like labels work with `.shape` and fancy indexing.
    y = np.asarray(y)

    # Validate before any value derived from the inputs is computed.
    assert X.shape[0] > 0 and y.shape[0] > 0, 'Need at least one row in X and y'
    assert X.shape[0] == y.shape[0], 'X and y must have the same n_samples'
    assert 0 < train_size < 1, 'Train size must be in (0, 1)'

    n_samples = y.shape[0]
    n_train = floor(train_size * n_samples)

    train_idx = np.random.choice(n_samples, size=n_train, replace=False)

    # A boolean mask gives the complement in O(n) and in ascending order,
    # instead of testing every index for membership in the train indices.
    held_out = np.ones(n_samples, dtype=bool)
    held_out[train_idx] = False
    test_idx = np.flatnonzero(held_out)

    X_train = X[train_idx, :]
    X_test = X[test_idx, :]
    y_train = y[train_idx]
    y_test = y[test_idx]

    return X_train, X_test, y_train, y_test

In [3]:
# Try the splitter on a tiny 10-sample array and report the resulting shapes.
X = np.arange(50).reshape(10, 5)
y = np.arange(10)

X_train, X_test, y_train, y_test = scratch_train_test_split(X=X, y=y, train_size=0.8)
shape_report = "Shape of train set: {} -- Shape of test set{}"
print(shape_report.format(X_train.shape, X_test.shape))

Shape of train set: (8, 5) -- Shape of test set(2, 5)


# [Problem 2] Creating a code to solve the classification problem

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [5]:
def scale(X_train, X_test):
    """Standardize the training and validation features.

    A StandardScaler is fitted on the training data only and then applied
    to both sets.

    Parameters
    ----------
    X_train : ndarray
      Training data (n_samples, n_features)
    X_test : ndarray
      Validation data (n_samples, n_features)

    Returns
    -------
    x_train_scaled : ndarray
      Scaled training data
    x_test_scaled : ndarray
      Scaled validation data, transformed with the scaler fitted on X_train
    """
    standardizer = StandardScaler()
    standardizer.fit(X_train)
    return standardizer.transform(X_train), standardizer.transform(X_test)

In [6]:
def train_model(model, X_train, X_test, y_train, y_test):
    """Fit a model, print its validation score and compare predictions.

    Parameters
    ----------
    model : model
      Estimator exposing fit, predict and score
    X_train : ndarray
      Training data (n_samples, n_features)
    X_test : ndarray
      Validation data (n_samples, n_features)
    y_train : ndarray
      Correct answer value of training data (n_samples,)
    y_test : ndarray
      Correct value of verification data (n_samples,)

    Returns
    -------
    result: DataFrame
        Two-row DataFrame: 'Predict' holds the model's predictions for
        X_test and 'Actual' holds y_test
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    comparison = pd.DataFrame([predictions, y_test], index=['Predict', 'Actual'])

    # The score is printed rather than returned.
    test_score = model.score(X_test, y_test)
    print("Score: {}".format(test_score))
    return comparison

# Iris dataset

In [7]:
from sklearn.datasets import load_iris

# Load the raw iris arrays and wrap them in labelled DataFrames.
iris_data, iris_target = load_iris(return_X_y=True)

iris_X = pd.DataFrame(
    data=iris_data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
iris_y = pd.DataFrame({'Species': iris_target})

# Features and label side by side in one frame.
iris = iris_X.join(iris_y)

Keep only the samples whose species is versicolor or virginica (i.e. drop the rows with label 0, setosa).

In [8]:
# Seed NumPy's global RNG so the random split below is reproducible.
np.random.seed(0)

# Keep only the rows whose Species label is not 0. Features and labels are
# both taken from the untouched `iris` frame, so this cell can be re-run
# safely even after `iris_X` / `iris_y` have been rebound to plain arrays.
iris_binary = iris[iris['Species'] != 0]
iris_X = iris_binary.drop(columns=['Species']).values
iris_y = iris_binary['Species'].values

iris_X_train, iris_X_test, iris_y_train, iris_y_test = scratch_train_test_split(iris_X, iris_y, train_size=0.8)
iris_X_train, iris_X_test = scale(iris_X_train, iris_X_test)

## Stochastic Gradient Descent

In [9]:
# 'log_loss' is the current name of the logistic-regression loss; the old
# alias 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
train_model(SGDClassifier(loss='log_loss'), iris_X_train, iris_X_test, iris_y_train, iris_y_test)

Score: 0.85


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Predict,1,1,1,2,1,1,1,1,2,2,2,1,2,2,2,2,1,2,2,2
Actual,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2


## Support Vector Classifier

In [10]:
# Fit an SVC with default settings on the scaled iris split and print its score.
train_model(
    model=SVC(),
    X_train=iris_X_train,
    X_test=iris_X_test,
    y_train=iris_y_train,
    y_test=iris_y_test,
)

Score: 0.9


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Predict,1,1,1,2,1,1,1,1,2,2,2,1,2,2,2,2,2,2,2,2
Actual,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2


## Decision Tree Classifier

In [11]:
# Fit a default decision tree on the scaled iris split and print its score.
train_model(
    model=DecisionTreeClassifier(),
    X_train=iris_X_train,
    X_test=iris_X_test,
    y_train=iris_y_train,
    y_test=iris_y_test,
)

Score: 0.9


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Predict,1,1,1,2,1,1,1,1,2,2,2,1,2,2,2,2,2,2,2,2
Actual,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2


# Simple data set 1

In [12]:
# Simple data set 1: two Gaussian clusters sharing one covariance matrix.
# The first half of the rows is labelled 1, the second half -1.
np.random.seed(seed=0)
n_samples = 500
cov = [[1.0, 0.8], [0.8, 1.0]]

# Draw order matters for reproducibility: cluster f0 first, then f1.
f0 = np.random.multivariate_normal(mean=[-1, 2], cov=cov, size=n_samples // 2)
f1 = np.random.multivariate_normal(mean=[2, -1], cov=cov, size=n_samples // 2)

X = np.vstack((f0, f1))
y = np.repeat([1, -1], n_samples // 2)

In [13]:
# Split simple data set 1 (80% train), then standardize using train statistics.
set1_X_train, set1_X_test, set1_y_train, set1_y_test = scratch_train_test_split(
    X=X, y=y, train_size=0.8
)
set1_X_train, set1_X_test = scale(X_train=set1_X_train, X_test=set1_X_test)

## Stochastic Gradient Descent

In [14]:
# 'log_loss' is the current name of the logistic-regression loss; the old
# alias 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
train_model(SGDClassifier(loss='log_loss'), set1_X_train, set1_X_test, set1_y_train, set1_y_test)

Score: 1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
Predict,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Actual,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


## Support Vector Classifier

In [15]:
# Fit an SVC with default settings on simple data set 1 and print its score.
train_model(
    model=SVC(),
    X_train=set1_X_train,
    X_test=set1_X_test,
    y_train=set1_y_train,
    y_test=set1_y_test,
)

Score: 1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
Predict,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Actual,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


## Decision Tree Classifier

In [16]:
# Fit a default decision tree on simple data set 1 and print its score.
train_model(
    model=DecisionTreeClassifier(),
    X_train=set1_X_train,
    X_test=set1_X_test,
    y_train=set1_y_train,
    y_test=set1_y_test,
)

Score: 1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
Predict,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Actual,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


# Simple data set 2

In [17]:
# Simple data set 2: 40 hand-written samples with two features each
# (two samples per source line below).
X = np.array([
    [-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
    [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
    [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
    [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
    [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
    [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
    [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
    [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
    [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
    [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
    [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
    [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
    [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
    [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
    [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
    [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
    [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
    [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
    [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
    [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ],
])
# Labels aligned with the rows of X: the first 20 samples are class 0,
# the last 20 are class 1.
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [18]:
# Split simple data set 2 (80% train), then standardize using train statistics.
set2_X_train, set2_X_test, set2_y_train, set2_y_test = scratch_train_test_split(
    X=X, y=y, train_size=0.8
)
set2_X_train, set2_X_test = scale(X_train=set2_X_train, X_test=set2_X_test)

## Stochastic Gradient Descent

In [19]:
# 'log_loss' is the current name of the logistic-regression loss; the old
# alias 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
train_model(SGDClassifier(loss='log_loss'), set2_X_train, set2_X_test, set2_y_train, set2_y_test)

Score: 0.5


Unnamed: 0,0,1,2,3,4,5,6,7
Predict,1,1,1,1,1,1,1,1
Actual,0,0,0,0,1,1,1,1


## Support Vector Classifier

In [20]:
# Fit an SVC with default settings on simple data set 2 and print its score.
train_model(
    model=SVC(),
    X_train=set2_X_train,
    X_test=set2_X_test,
    y_train=set2_y_train,
    y_test=set2_y_test,
)

Score: 0.625


Unnamed: 0,0,1,2,3,4,5,6,7
Predict,0,1,0,1,0,1,1,1
Actual,0,0,0,0,1,1,1,1


## Decision Tree Classifier

In [21]:
# Fit a default decision tree on simple data set 2 and print its score.
train_model(
    model=DecisionTreeClassifier(),
    X_train=set2_X_train,
    X_test=set2_X_test,
    y_train=set2_y_train,
    y_test=set2_y_test,
)

Score: 0.5


Unnamed: 0,0,1,2,3,4,5,6,7
Predict,0,1,1,1,1,0,1,1
Actual,0,0,0,0,1,1,1,1


# [Problem 3] Creating a code to solve the regression problem

In [22]:
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
# Show floats with two decimals. The fully qualified option key is used
# because the bare 'precision' pattern also matches 'styler.format.precision'
# in pandas >= 1.4 and raises OptionError there.
pd.set_option('display.precision', 2)

# Read from the current working directory.
# NOTE(review): presumably the house-prices training file - confirm the source.
df = pd.read_csv('train.csv')

# Two explanatory features and the target, as plain arrays.
X = df[['GrLivArea', 'YearBuilt']].values
y = df['SalePrice'].values

X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)
X_train, X_test = scale(X_train, X_test)

## SGD Regressor

In [23]:
# Fit a default SGDRegressor on the scaled features and print its score.
train_model(
    model=SGDRegressor(),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Score: 0.6985145402110488


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291
Predict,233110.0,232681.29,292491.46,216158.05,132187.88,236803.63,156711.15,131349.85,134106.13,274046.32,120799.34,222026.66,149435.3,145920.87,133885.74,134998.65,201303.57,236178.71,193011.7,223655.52,233817.23,63264.69,141099.81,194357.16,45818.61,88747.13,125797.25,231139.61,211551.31,110322.77,126515.42,174826.92,134278.11,150611.21,195410.72,148580.33,237474.54,234080.03,127740.96,134240.57,...,244889.62,124569.28,121591.38,322780.39,261979.65,162102.65,148108.0,230891.4,203345.39,181559.02,194579.98,202916.69,72687.7,244226.01,61754.54,165850.77,246127.32,121259.57,113138.4,244373.75,236316.79,229193.54,263880.96,149706.54,216695.67,140662.6,197508.25,112307.67,250748.58,172839.59,209453.78,152729.33,134278.11,237566.57,151007.23,153792.61,140839.44,222982.18,225933.4,117822.66
Actual,208500.0,307000.0,345000.0,279500.0,149000.0,306000.0,160000.0,130250.0,141000.0,239686.0,113000.0,385000.0,130000.0,140000.0,91000.0,127000.0,136500.0,193500.0,153500.0,245000.0,204750.0,83000.0,128950.0,198900.0,100000.0,115000.0,115000.0,217000.0,163990.0,100000.0,136000.0,153900.0,128000.0,150750.0,174000.0,143000.0,260000.0,130000.0,115000.0,122000.0,...,294000.0,127500.0,128900.0,381000.0,237000.0,119500.0,177500.0,325000.0,202500.0,179200.0,203000.0,208900.0,82500.0,147000.0,55000.0,125000.0,200000.0,128500.0,134900.0,235000.0,193000.0,232000.0,274300.0,157000.0,136000.0,137450.0,193879.0,133000.0,215000.0,140000.0,223000.0,136500.0,143750.0,186500.0,160000.0,149300.0,157900.0,175000.0,266500.0,142125.0


## Linear Regression

In [24]:
# Fit an ordinary LinearRegression on the scaled features and print its score.
train_model(
    model=LinearRegression(),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Score: 0.6972898018961661


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291
Predict,233216.88,232815.72,292166.5,216474.18,132316.18,236976.95,156520.87,131605.28,133954.14,273822.31,119816.11,221693.56,149063.11,144758.65,134072.53,134574.65,200825.05,235418.28,193322.35,223935.11,233773.97,62746.53,140891.28,194921.72,44598.96,88697.0,124749.55,230058.85,211880.6,109881.59,126400.71,175164.21,134533.95,150325.8,195027.97,148503.91,237594.29,234203.15,127086.77,134472.64,...,244203.22,123820.36,121468.33,320525.57,261829.11,162037.29,147433.61,231088.44,203741.02,181793.74,195046.46,203339.85,72574.81,244315.26,61296.73,165706.97,246104.91,121524.35,112626.32,244317.38,236301.47,229332.08,263618.76,148489.65,214841.0,139639.17,197975.13,112644.81,250303.67,172157.86,208933.45,153485.96,134533.95,237565.22,151151.91,154564.72,140705.23,223074.65,224605.86,117585.65
Actual,208500.0,307000.0,345000.0,279500.0,149000.0,306000.0,160000.0,130250.0,141000.0,239686.0,113000.0,385000.0,130000.0,140000.0,91000.0,127000.0,136500.0,193500.0,153500.0,245000.0,204750.0,83000.0,128950.0,198900.0,100000.0,115000.0,115000.0,217000.0,163990.0,100000.0,136000.0,153900.0,128000.0,150750.0,174000.0,143000.0,260000.0,130000.0,115000.0,122000.0,...,294000.0,127500.0,128900.0,381000.0,237000.0,119500.0,177500.0,325000.0,202500.0,179200.0,203000.0,208900.0,82500.0,147000.0,55000.0,125000.0,200000.0,128500.0,134900.0,235000.0,193000.0,232000.0,274300.0,157000.0,136000.0,137450.0,193879.0,133000.0,215000.0,140000.0,223000.0,136500.0,143750.0,186500.0,160000.0,149300.0,157900.0,175000.0,266500.0,142125.0
