In [None]:
import numpy as np 
import pandas as pd 
from scipy.sparse import csr_matrix
from tqdm import tqdm, trange
from sklearn.datasets import load_svmlight_file


import os
# Show the entries found directly under the dataset's input directory.
input_root = "../input/movie-ratings"
print(os.listdir(input_root))


In [None]:
# Load the train and test splits (svmlight format).
# The test split is read with `n_features` pinned to the training width so both
# matrices share the same column space. This replaces the manual csr_matrix
# rebuild: load_svmlight_file itself raises a ValueError if the file needs more
# columns than the training matrix provides.
training_data, training_labels = load_svmlight_file(
    '../input/movie-ratings/movie-ratings/data-splits/data.train')
testing_data, testing_labels = load_svmlight_file(
    '../input/movie-ratings/movie-ratings/data-splits/data.test',
    n_features=training_data.shape[1])

In [None]:
# Report (features shape, labels shape) for the training split, then the test split.
for split_features, split_labels in (
    (training_data, training_labels),
    (testing_data, testing_labels),
):
    print(split_features.shape, split_labels.shape)

In [None]:
class Softmax:
    """Softmax (multinomial logistic) classifier trained with mini-batch SGD.

    Exposes ``fit`` / ``predict`` / ``score`` together with ``get_params`` /
    ``set_params`` so instances can be handed to scikit-learn's
    model-selection helpers.

    Attributes:
        W: Weight matrix of shape (n_features, n_classes); ``None`` until the
            first call to ``fit``. An existing ``W`` is reused by later
            ``fit`` calls rather than re-initialised.
        learning_rate: SGD step size.
        reg_const: L2 regularisation strength.
        num_iters: Number of SGD steps performed by one ``fit`` call.
        batch_size: Number of rows sampled (with replacement) per step.
    """

    def __init__(self, learning_rate=0.1, reg_const=0.05, num_iters=10, batch_size=200):
        self.W = None
        self.learning_rate = learning_rate
        self.reg_const = reg_const
        self.num_iters = num_iters
        self.batch_size = batch_size

    def loss(self, X_batch, y_batch):
        """Compute the regularised mean cross-entropy loss and its gradient.

        Args:
            X_batch: Feature rows (dense array or scipy sparse matrix) of
                shape (n, n_features).
            y_batch: Class indices for the n rows; cast to ``int`` because
                they are used to index the columns of the score matrix.

        Returns:
            ``(loss, dW)``: the scalar loss (mean data loss plus
            ``0.5 * reg_const * ||W||^2``) and the gradient with respect to
            ``self.W``, with the same shape as ``self.W``.
        """
        num_train = X_batch.shape[0]
        rows = np.arange(num_train)
        targets = np.asarray(y_batch).astype(int)

        scores = np.asarray(X_batch.dot(self.W))
        # Subtract the row-wise maximum so the exponentials cannot overflow.
        shift_scores = scores - np.max(scores, axis=1).reshape(-1, 1)
        exp_scores = np.exp(shift_scores)
        sum_exp = np.sum(exp_scores, axis=1).reshape(-1, 1)

        # Log-probability of the true class, computed in log space so that a
        # probability which underflows to 0 does not turn the loss into inf.
        log_probs = shift_scores[rows, targets] - np.log(sum_exp).ravel()
        # Average over the batch so the loss matches the gradient below,
        # which is also divided by num_train.
        loss = -np.sum(log_probs) / num_train
        loss += 0.5 * self.reg_const * np.sum(self.W * self.W)

        # Gradient of the data loss w.r.t. the scores: softmax minus one-hot.
        dS = exp_scores / sum_exp
        dS[rows, targets] -= 1
        dW = (X_batch.T).dot(dS)
        dW = dW / num_train + self.reg_const * self.W

        return loss, dW

    def fit(self, X, y, **kwargs):
        """Run ``num_iters`` steps of mini-batch SGD on ``(X, y)``.

        Args:
            X: Feature matrix (dense or scipy sparse) of shape (n, n_features).
            y: Labels for the n rows; used as class indices after an ``int`` cast.
            **kwargs: Optional overrides for any subset of ``learning_rate``,
                ``reg_const``, ``num_iters`` and ``batch_size``. Overrides are
                stored on the instance; other keys are ignored.

        Returns:
            ``self``, so calls can be chained.
        """
        # Accept any subset of hyper-parameters instead of requiring all four.
        for name in ('learning_rate', 'reg_const', 'num_iters', 'batch_size'):
            if name in kwargs:
                setattr(self, name, kwargs[name])

        num_train, dim = X.shape
        labels = np.asarray(y).astype(int)
        # At least two classes (the previous fixed value); more if the labels need them.
        num_classes = max(2, int(np.max(labels)) + 1)
        if self.W is None:
            self.W = 0.001 * np.random.randn(int(dim), int(num_classes))

        for _ in trange(self.num_iters, mininterval=5):
            batch_idx = np.random.choice(num_train, self.batch_size, replace=True)
            _loss, grad = self.loss(X[batch_idx], labels[batch_idx])
            self.W -= self.learning_rate * grad

        return self

    def get_params(self, deep=False):
        """Return the hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {'learning_rate': self.learning_rate, 'reg_const': self.reg_const,
                'num_iters': self.num_iters, 'batch_size': self.batch_size}

    def set_params(self, **parameters):
        """Set each given keyword as an attribute on the instance and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def predict(self, X):
        """Return, for every row of ``X``, the index of the highest-scoring class."""
        scores = X.dot(self.W)
        y_pred = np.argmax(scores, axis=1)
        return y_pred

    def score(self, X, y):
        """Return the fraction of rows whose prediction agrees with ``y``.

        Labels and predictions are compared by sign (``np.sign``), so a
        prediction counts as correct when its sign equals the label's sign.
        """
        assert X.shape[0] == y.shape[0]
        matches = np.sign(np.ravel(y)) == np.sign(np.ravel(self.predict(X)))
        return int(np.sum(matches)) / X.shape[0]

In [None]:
# Train a first model with hand-picked hyper-parameters, passed to fit() as overrides.
baseline_settings = dict(learning_rate=0.01, reg_const=0.05, num_iters=20000, batch_size=200)
softmax = Softmax()
softmax.fit(training_data, training_labels, **baseline_settings);

In [None]:
# Accuracy of `softmax` on the test split (displayed as the cell output).
softmax.score(X=testing_data, y=testing_labels)

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

# Hyper-parameter grid: 3 learning rates x 4 regularisation strengths,
# with the iteration count and batch size held fixed.
param_grid = {
    'learning_rate': [0.0001, 0.0005, 0.001],
    'reg_const': [0.0001, 0.001, 0.01, 0.1],
    'num_iters': [20000],
    'batch_size': [200],
}
clf = GridSearchCV(Softmax(), param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Tune on the training split only. Fitting the search on the test split would
# select hyper-parameters (and refit the best estimator) on the very data that
# is later used to report test accuracy, making that number optimistic.
clf.fit(training_data, training_labels)

In [None]:
# Best hyper-parameter combination and its mean cross-validated score, one per line.
print(clf.best_params_, clf.best_score_, sep='\n')

In [None]:
# 5-fold cross-validated accuracy on the training split with the best
# hyper-parameters. They are passed to the constructor rather than through
# `fit_params`: cross_val_score clones the estimator via get_params(), so each
# fold trains with the same settings, and the call no longer depends on the
# `fit_params` argument, which newer scikit-learn releases deprecate in favour
# of `params`.
cross_val_score(Softmax(**clf.best_params_), training_data, y=training_labels,
                scoring='accuracy', cv=5, n_jobs=-1)

In [None]:
# Accuracy on the test split of the best estimator found by the grid search.
clf.best_estimator_.score(X=testing_data, y=testing_labels)

In [None]:
# Predict labels for the anonymised evaluation split with the best estimator
# from the grid search.
softmax = clf.best_estimator_
# Pin n_features to the training width so the evaluation matrix has the same
# columns as the data the model was trained on (replaces the manual csr_matrix
# rebuild). Only the feature matrix is kept; indexing with [0] avoids assigning
# to `_`, which IPython uses for the last cell output.
eval_data = load_svmlight_file(
    '../input/movie-ratings/movie-ratings/data-splits/data.eval.anon',
    n_features=training_data.shape[1])[0]
submission_pred = softmax.predict(eval_data)

In [None]:
# Write the submission file: a header row, then one "example_id,label" row per
# (id line, prediction) pair, matched by position.
ids_path = '../input/movie-ratings/movie-ratings/data-splits/data.eval.anon.id'
with open('submission.csv', 'w') as submission, open(ids_path, 'r') as example_ids:
    submission.write('example_id,label\n')
    submission.writelines(
        '{},{}\n'.format(example_id.strip(), int(label))
        for example_id, label in zip(example_ids, submission_pred)
    )