In [None]:
import json
import os
import pickle
import random
import sys

import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image, ImageFilter
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook

# All data paths below are relative to /kaggle/input, and `padhai` is imported
# right after the chdir (presumably it is resolved from that directory -- confirm).
os.chdir("/kaggle/input")
from padhai import MPNeuron, Perceptron, PerceptronWithSigmoid

# Seed both RNGs used by the notebook: NumPy and the stdlib `random` module
# (the latter drives random.shuffle on the training images, which affects the
# order-dependent perceptron updates).
np.random.seed(100)
random.seed(100)
LEVEL = 'level_3'
sns.set()

In [None]:
class SigmoidNeuron(object):
    """Single sigmoid neuron trained with full-batch gradient descent.

    The model output is ``sigmoid(x . w^T + b)``. ``w`` is a ``(1, n_features)``
    array and ``b`` starts at 0; both stay ``None`` until ``fit`` initialises them.
    """

    def __init__(self):
        self.w = None
        self.b = None

    def perceptron(self, x):
        """Return the pre-activation ``x . w^T + b``."""
        return np.dot(x, self.w.T) + self.b

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))``."""
        return 1.0/(1.0 + np.exp(-x))

    def grad_w_mse(self, x, y):
        """Gradient of ``0.5 * (y_pred - y)**2`` w.r.t. the weights for one sample."""
        y_pred = self.sigmoid(self.perceptron(x))
        return (y_pred - y) * y_pred * (1 - y_pred) * x

    def grad_b_mse(self, x, y):
        """Gradient of ``0.5 * (y_pred - y)**2`` w.r.t. the bias for one sample."""
        y_pred = self.sigmoid(self.perceptron(x))
        return (y_pred - y) * y_pred * (1 - y_pred)

    def grad_w_ce(self, x, y):
        """Cross-entropy gradient w.r.t. the weights for one sample.

        Raises:
            ValueError: if ``y`` is neither 0 nor 1.
        """
        y_pred = self.sigmoid(self.perceptron(x))
        if y == 0:
            return y_pred * x
        elif y == 1:
            return -1 * (1 - y_pred) * x
        else:
            raise ValueError("y should be 0 or 1")

    def grad_b_ce(self, x, y):
        """Cross-entropy gradient w.r.t. the bias for one sample.

        Raises:
            ValueError: if ``y`` is neither 0 nor 1.
        """
        y_pred = self.sigmoid(self.perceptron(x))
        if y == 0:
            return y_pred
        elif y == 1:
            return -1 * (1 - y_pred)
        else:
            raise ValueError("y should be 0 or 1")

    def fit(self, X, Y, epochs=1, learning_rate=1, initialise=True, loss_fn="mse", display_loss=False):
        """Train with full-batch gradient descent.

        Args:
            X: 2-D array of samples, one row per sample.
            Y: labels aligned with the rows of ``X``.
            epochs: number of passes over the data.
            learning_rate: step size applied to the averaged gradient.
            initialise: when True, reset ``w`` to zeros and ``b`` to 0 first.
            loss_fn: ``"mse"`` or ``"ce"`` -- selects the gradient (and the
                loss that is plotted).
            display_loss: when True, record the training loss after every
                epoch and plot it at the end.

        Raises:
            ValueError: if ``loss_fn`` is not ``"mse"`` or ``"ce"``; or, for
                ``"ce"``, if a label is neither 0 nor 1.
        """
        # Fail fast: an unknown loss would otherwise accumulate no gradient at
        # all and silently leave the model untrained.
        if loss_fn not in ("mse", "ce"):
            raise ValueError("loss_fn should be 'mse' or 'ce'")

        # initialise w, b
        if initialise:
            self.w = np.zeros((1, X.shape[1]))
            self.b = 0
        if display_loss:
            loss = {}

        for i in tqdm_notebook(range(epochs), total=epochs, unit="epoch"):
            # Accumulate the per-sample gradients over the whole batch.
            dw = 0
            db = 0
            for x, y in zip(X, Y):
                if loss_fn == "mse":
                    dw += self.grad_w_mse(x, y)
                    db += self.grad_b_mse(x, y)
                elif loss_fn == "ce":
                    dw += self.grad_w_ce(x, y)
                    db += self.grad_b_ce(x, y)
            # Step along the mean gradient.
            self.w -= learning_rate * dw/X.shape[0]
            self.b -= learning_rate * db/X.shape[0]

            # Loss bookkeeping happens only when requested; `loss` and
            # `Y_pred` do not exist otherwise.
            if display_loss:
                Y_pred = self.sigmoid(self.perceptron(X))
                if loss_fn == "mse":
                    loss[i] = mean_squared_error(Y, Y_pred)
                elif loss_fn == "ce":
                    loss[i] = log_loss(Y, Y_pred)

        if display_loss:
            plt.plot(loss.values())
            plt.xlabel('Epochs')
            if loss_fn == "mse":
                plt.ylabel('Mean Squared Error')
            elif loss_fn == "ce":
                plt.ylabel('Log Loss')
            plt.show()

    def predict(self, X):
        """Return the sigmoid output for every row of ``X`` as an array."""
        Y_pred = []
        for x in X:
            y_pred = self.sigmoid(self.perceptron(x))
            Y_pred.append(y_pred)
        return np.array(Y_pred)

In [None]:
def read_all(folder_path, key_prefix=""):
    '''
    Read every file in `folder_path` as a grayscale image.

    Returns a dictionary whose keys are `key_prefix` + the file name without its
    extension, and whose values are the flattened (1-D) pixel arrays of the image
    converted to mode "L".
    '''
    print("Reading:")
    images = {}
    files = os.listdir(folder_path)
    for file_name in tqdm_notebook(files, total=len(files)):
        file_path = os.path.join(folder_path, file_name)
        # splitext handles extensions of any length (".png", ".jpeg", ...).
        image_index = key_prefix + os.path.splitext(file_name)[0]
        # The context manager closes the opened file itself; convert() returns a
        # separate in-memory image, so closing that copy would not release it.
        with Image.open(file_path) as image:
            images[image_index] = np.array(image.convert("L")).flatten()
    return images

In [None]:
languages = ['ta', 'hi', 'en']

# Background images get the 'bgr_' key prefix; each language folder is merged in
# afterwards under its own '<language>_' prefix.
train_root = "/".join(("level_3_train", LEVEL))  # change the path
images_train = read_all("/".join((train_root, "background")), key_prefix='bgr_')
for language in languages:
    language_images = read_all("/".join((train_root, language)), key_prefix=language + "_")
    images_train.update(language_images)
print(len(images_train))

# Test images keep their bare file name as key.
images_test = read_all("level_3_test/kaggle_{}".format(LEVEL), key_prefix='')  # change the path
print(len(images_test))

In [None]:
# Shuffle the training images so the samples are visited in random order during
# training, then rebuild the dict in that order.
images_train_list = list(images_train.items())
random.shuffle(images_train_list)
images_train = dict(images_train_list)

# Label 0 for keys prefixed with "bgr_", 1 for everything else.
Y_train = np.array([0 if key[:4] == "bgr_" else 1 for key in images_train])

# Test keys are converted to integer ids for the submission file.
ID_test = [int(key) for key in images_test]

X_train = np.array(list(images_train.values()))
#X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=5, stratify=Y_train)
X_test = np.array(list(images_test.values()))

print(X_train.shape, Y_train.shape)
print(X_test.shape)

In [None]:
# Apply a truncating threshold (cv.THRESH_TRUNC at `tr`) to every image.
# Each flattened row is viewed as a 64x64 image for OpenCV and flattened again.
# (No validation split is processed here; only train and test.)
tr = 5
X_train_app = np.zeros(X_train.shape)
X_test_app = np.zeros(X_test.shape)

for raw_images, thresholded in ((X_train, X_train_app), (X_test, X_test_app)):
    for row, pixels in enumerate(raw_images):
        _, truncated = cv.threshold(pixels.reshape(64, 64), tr, 255, cv.THRESH_TRUNC)
        thresholded[row, :] = np.array(truncated).flatten()

In [None]:
# Learn the scaling statistics from the training set only, then apply the same
# transformation to both train and test.
scaler = StandardScaler().fit(X_train_app)
X_scaled_train_app = scaler.transform(X_train_app)
X_scaled_test_app = scaler.transform(X_test_app)

In [None]:
# Visual sanity check of one preprocessed training image (index 36, presumably
# an arbitrary pick), viewed as 64x64.
sample_image = X_scaled_train_app[36].reshape(64, 64)
plt.imshow(sample_image, cmap="gray")

In [None]:
class Perceptron_m:
    """Perceptron with a zero-initialised weight vector and online updates.

    A sample is classified as 1 when ``sum(w * x) + b >= 0`` and as 0 otherwise.
    """

    def __init__(self):
        self.w = None
        self.b = None

    def perceptron(self, x):
        """Return the raw score ``sum(w * x) + b`` for a single sample ``x``."""
        return np.sum(self.w * x) + self.b

    def fit(self, X, Y, epochs=10, learning_rate=0.01, log=False, display_plot=False):
        """Train with the perceptron learning rule, one sample at a time.

        Weights and bias are always reset to zero before training.

        Args:
            X: 2-D array of samples, one row per sample.
            Y: 0/1 labels aligned with the rows of ``X``.
            epochs: number of passes over the data.
            learning_rate: step size for each corrective update.
            log: when True, write the per-epoch training accuracy to
                ``perceptron_accuracy.json`` in the current working directory.
            display_plot: when True, plot training accuracy against epoch.
        """
        # initialise the weights and bias
        self.w = np.zeros((1, X.shape[1]))
        self.b = 0
        if log or display_plot:
            accuracy = {}
        for i in tqdm_notebook(range(epochs), total=epochs, unit="epoch"):
            for x, y in zip(X, Y):
                result = self.perceptron(x)
                # Only misclassified samples move the decision boundary.
                if y == 1 and result < 0:
                    self.w += learning_rate*x
                    self.b += learning_rate
                elif y == 0 and result >= 0:
                    self.w -= learning_rate*x
                    self.b -= learning_rate
            if log or display_plot:
                Y_pred = self.predict(X)
                accuracy[i] = accuracy_score(Y, Y_pred)
        if log:
            # `json` comes from the notebook's import cell.
            with open('perceptron_accuracy.json', 'w') as fp:
                json.dump(accuracy, fp)
        if display_plot:
            # Plot from the dict directly so epochs=0 (empty dict) does not
            # fail on tuple unpacking.
            plt.plot(list(accuracy.keys()), list(accuracy.values()))
            plt.xlabel("Epochs")
            plt.ylabel("Train Accuracy")
            plt.show()

    def predict(self, X):
        """Return an array of 0/1 predictions, one per row of ``X``."""
        Y = []
        for x in X:
            result = self.perceptron(x)
            Y.append(int(result>=0))
        return np.array(Y)

In [None]:
# Train the notebook's own perceptron on the standardised training images and
# plot its per-epoch training accuracy.
per = Perceptron_m()
per.fit(
    X_scaled_train_app,
    Y_train,
    epochs=10,
    learning_rate=1,
    log=False,
    display_plot=True,
)

In [None]:
# Train the Perceptron imported from `padhai` on the same data for comparison.
per1 = Perceptron()
per1.fit(
    X_scaled_train_app,
    Y_train,
    epochs=6,
    learning_rate=1e-2,
    log=False,
    display_plot=True,
)

In [None]:
def print_accuracy(sn, th):
    """Print the training accuracy of model `sn` after binarising at `th`.

    Predictions are taken on the global `X_scaled_train_app`, turned into 0/1
    with `>= th`, and compared against the global `Y_train`.
    """
    scores = sn.predict(X_scaled_train_app)
    binarised = (scores >= th).astype("int").ravel()
    print("Train Accuracy : ", accuracy_score(binarised, Y_train))
    separator = "-" * 50
    print(separator)

In [None]:
# Report training accuracy for both perceptrons at the same cut-off:
# the padhai one first, then the notebook's own.
t = 0.5
for model in (per1, per):
    print_accuracy(model, t)

In [None]:
# Switch to the output directory with an absolute path so this cell can be re-run
# safely; the relative hops ('..' then 'working') only resolved correctly once,
# starting from /kaggle/input.
os.chdir("/kaggle/working")

# Predict on the standardised test images and binarise at 0.5.
Y_pred_test = per.predict(X_scaled_test_app)
Y_pred_binarised_test = (Y_pred_test >= 0.5).astype("int").ravel()

In [None]:
# Assemble the submission table (one row per test image), order it by image id
# and write it to the current working directory.
submission = pd.DataFrame({'ImageId': ID_test, 'Class': Y_pred_binarised_test})
submission = submission[['ImageId', 'Class']].sort_values(['ImageId'])
submission.to_csv("submission.csv", index=False)