## Accuracy

In [2]:
import sklearn

# Show the installed scikit-learn version that this notebook was run with.
print(sklearn.__version__)

1.0.2


In [3]:
import numpy as np
from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    """Baseline classifier that predicts from the 'Sex' column alone.

    Nothing is learned: every row whose 'Sex' value equals 1 is predicted
    as 0, and every other row is predicted as 1.
    """

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return the estimator.

        Returning ``self`` follows the scikit-learn estimator convention, so
        the call can be chained: ``clf.fit(X, y).predict(X)``.
        """
        return self

    def predict(self, X):
        """Predict 0 where ``X['Sex'] == 1`` and 1 everywhere else.

        Parameters
        ----------
        X : table-like supporting ``X['Sex']`` (a DataFrame in this notebook).

        Returns
        -------
        numpy.ndarray of float, shape (n_rows, 1).
        """
        # Vectorised replacement for the per-row .iloc loop; it yields the same
        # values, float dtype and (n_rows, 1) shape as the loop did.
        sex = np.asarray(X['Sex'])
        return np.where(sex == 1, 0.0, 1.0).reshape(-1, 1)


In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Null processing function
def fillna(df):
    """Fill missing values in the Titanic feature columns, in place.

    'Age' gets the column mean, 'Cabin' and 'Embarked' get the placeholder
    'N', and 'Fare' gets 0.

    Parameters
    ----------
    df : pandas.DataFrame containing 'Age', 'Cabin', 'Embarked' and 'Fare'.

    Returns
    -------
    The same DataFrame object, modified in place.

    Raises
    ------
    KeyError if any of the four columns is missing.
    """
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that chained in-place form operates
    # on a temporary Series and does not update df under pandas copy-on-write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

def drop_features(df):
    """Remove the identifier-like columns from ``df`` in place and return it.

    The dropped columns are 'PassengerId', 'Name' and 'Ticket'.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

def format_features(df):
    """Shorten 'Cabin' to its first character and label-encode the categoricals.

    'Cabin', 'Sex' and 'Embarked' are each overwritten with the output of a
    LabelEncoder fitted on that same column. The frame is modified in place
    and returned.
    """
    df['Cabin'] = df['Cabin'].str[:1]
    for column in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder per column, fitted and applied on the same data.
        encoder = LabelEncoder().fit(df[column])
        df[column] = encoder.transform(df[column])
    return df

def transform_features(df):
    """Run the preprocessing steps in order and return the resulting frame.

    Order: fill missing values, drop unused columns, then encode the
    categorical columns.
    """
    return format_features(drop_features(fillna(df)))

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reload the original data, then split it into training and test sets.
titanic_df = pd.read_csv('./titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=0
)

# Fit, predict and score with the dummy classifier.
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

mypredictions = myclf.predict(X_test)
print('Accuracy of Dummy Classifier : {0:.4f}'.format(accuracy_score(y_test, mypredictions)))

Accuracy of Dummy Classifier : 0.7877


In [6]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

class MyFakeClassifier(BaseEstimator):
    """Degenerate classifier that predicts False (class 0) for every sample."""

    def fit(self, X, y):
        """Do nothing (there is nothing to learn) and return the estimator.

        Returning ``self`` follows the scikit-learn estimator convention, so
        the call can be chained: ``clf.fit(X, y).predict(X)``.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean array with one row per input sample.

        Returns
        -------
        numpy.ndarray of bool, shape (len(X), 1).
        """
        return np.zeros((len(X), 1), dtype=bool)

# load_digits(): load scikit-learn's bundled handwritten-digits dataset.
# NOTE: this is not the full MNIST set; the shapes printed below show
# 1797 samples with 64 features each.
digits = load_digits()

# Print the feature matrix and label vector, each followed by its shape.
print(digits.data)
print("### digits.data.shape:", digits.data.shape)
print(digits.target)
print("### digits.target.shape:", digits.target.shape)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
### digits.data.shape: (1797, 64)
[0 1 2 ... 8 9 8]
### digits.target.shape: (1797,)


In [7]:
# Element-wise comparison: a boolean array that is True where the label is 7.
digits.target == 7

array([False, False, False, ..., False, False, False])

In [8]:
# Build a binary target: 1 where the digit is 7 (True -> 1), 0 otherwise (False -> 0).
y = (digits.target == 7).astype(int)
# Split with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split( digits.data, y, random_state=11)

In [9]:
# Check the imbalanced label distribution in the test set.
print('label test size  :', y_test.shape)
print('distribution of label 0 and 1 from test sets')
print(pd.Series(y_test).value_counts())

# Fit, predict and score with the fake classifier, which predicts 0 for every sample.
# Its accuracy therefore equals the share of 0 labels in the test set.
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train , y_train)
fakepred = fakeclf.predict(X_test)
print('the accuracy for all prediction is 0:{:.3f}'.format(accuracy_score(y_test , fakepred)))

label test size  : (450,)
distribution of label 0 and 1 from test sets
0    405
1     45
dtype: int64
the accuracy for all prediction is 0:0.900


## Confusion Matrix

In [10]:
from sklearn.metrics import confusion_matrix

# Display the confusion matrix of the actual values (y_test) against the
# predicted values (fakepred). In the output, rows are the actual classes and
# columns the predicted classes; every sample falls in the "predicted 0" column.
confusion_matrix(y_test , fakepred)

array([[405,   0],
       [ 45,   0]])