In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!unzip /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
!unzip /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
!unzip /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip

In [None]:
# Directory prefix (with trailing slash) prepended to the .tsv file names below.
path = "/kaggle/working/"

In [None]:
import re
import random
from math import exp, log
from datetime import datetime
from operator import itemgetter

def clean(s):
    """Normalise text: keep only runs of word characters, lower-cased.

    Every maximal run of word characters (Unicode-aware) in ``s`` becomes one
    token; tokens are joined by single spaces and the result is lower-cased.
    Anything between tokens (punctuation, whitespace, markup symbols) is
    dropped.
    """
    word_runs = (match.group(0) for match in re.finditer(r'\w+', s, flags=re.UNICODE))
    joined = " ".join(word_runs)
    return joined.lower()

def get_data_tsv(loc_dataset, opts):
    """Stream hashed bag-of-words examples from a tab-separated file.

    The first line is treated as a header and skipped; blank lines are
    skipped as well. A row with exactly three tab-separated fields is read as
    ``id, label, text``. Any other row is read as ``id, text`` and given the
    placeholder label ``1``.

    Args:
        loc_dataset: Path of the TSV file to read (decoded as UTF-8).
        opts: Options dict. Keys read here:
            ``'clean'``  - if truthy, pass the text through ``clean`` first;
            ``'D'``      - size of the hashed feature space (hash modulus);
            ``'2grams'`` - if truthy, also emit one hashed feature per pair
                           of adjacent tokens.

    Yields:
        ``(label, id, features)`` where ``features`` is a list of
        ``(index, 1)`` pairs: one per whitespace-separated token, followed by
        one per adjacent token pair when ``opts['2grams']`` is set.

    Raises:
        IndexError: a non-blank row has fewer than two fields.
        ValueError: the label field of a three-field row is not an integer.
    """
    n_buckets = opts['D']

    # The `with` block releases the file handle when the generator is
    # exhausted or closed, instead of leaving that to garbage collection.
    with open(loc_dataset, 'rb') as handle:
        for line_no, raw_line in enumerate(handle):
            if line_no == 0:
                continue  # header row

            line = raw_line.decode('utf-8').strip()
            if not line:
                continue  # blank line (e.g. trailing newline at end of file)

            r = line.split('\t')
            row_id = r[0]

            # Decide the layout from the column count rather than by
            # catching whatever exception indexing happens to raise.
            if len(r) == 3:
                label, text = int(r[1]), r[2]
            else:
                label, text = 1, r[1]

            if opts['clean']:
                text = clean(text)

            # NOTE: the builtin hash() of a str is salted per interpreter
            # process unless PYTHONHASHSEED is fixed, so these indices (and
            # any weights learned on them) are only valid within one process.
            features = [(hash(token) % n_buckets, 1) for token in text.split()]

            if opts['2grams']:
                # A bigram feature hashes the concatenated unigram indices.
                bigrams = [
                    (hash(str(left[0]) + str(right[0])) % n_buckets, 1)
                    for left, right in zip(features, features[1:])
                ]
                features.extend(bigrams)

            yield label, row_id, features

In [None]:
def dot_product(features, weights):
    """Return the sparse dot product of ``features`` and ``weights``.

    ``features`` is an iterable of ``(index, value)`` pairs; each pair
    contributes ``weights[index] * value``. The result is ``0`` when
    ``features`` is empty.
    """
    total = 0
    for pair in features:
        index, value = pair[0], pair[1]
        total = total + weights[index] * value
    return total

def train_tron(loc_dataset, opts):
    """Train a perceptron on the labelled TSV file and return its weights.

    Each pass streams the examples from ``get_data_tsv``. An example is
    predicted positive when its dot product with the weights exceeds 0.5;
    on a wrong prediction every active feature's weight is moved by
    ``learning_rate * error * log(1 + value)``.

    Args:
        loc_dataset: Path of the labelled training file.
        opts: Options dict. Keys read here:
            ``'random_init'``      - start from a seeded random value
                                     instead of zeros;
            ``'D'``                - number of weights;
            ``'n_passes'``         - maximum number of passes over the file;
            ``'learning_rate'``    - step size of each update;
            ``'errors_satisfied'`` - stop early once a pass makes fewer
                                     errors than this (or none at all).
            The dict is also forwarded to ``get_data_tsv``.

    Returns:
        The list of ``opts['D']`` learned weights.
    """
    if opts['random_init']:
        random.seed(3003)
        # List repetition copies a single draw, so every weight starts at the
        # same seeded random value. (Previously this was bound to `weight`,
        # leaving `weights` undefined on this branch.)
        weights = [random.random()] * opts['D']
    else:
        weights = [0.] * opts['D']

    for _ in range(opts['n_passes']):
        error_counter = 0
        for label, _row_id, features in get_data_tsv(loc_dataset, opts):
            predicted = dot_product(features, weights) > 0.5
            error = label - predicted  # -1, 0 or +1 for 0/1 labels
            if error != 0:
                error_counter += 1
                for index, value in features:
                    weights[index] += opts['learning_rate'] * error * log(1. + value)

        if error_counter == 0 or error_counter < opts['errors_satisfied']:
            break
    return weights

In [None]:
def test_tron(loc_dataset, weights, opts):
    """Score every row of a TSV file with the trained perceptron.

    Args:
        loc_dataset: Path of the file to score.
        weights: Weight list as returned by ``train_tron``.
        opts: Options dict, forwarded to ``get_data_tsv``.

    Returns:
        A list with one ``[id, prediction, dotp, scaled]`` entry per row,
        where ``prediction`` is 1 when the dot product exceeds 0.5 and 0
        otherwise, ``dotp`` is the raw dot product, and ``scaled`` is
        ``dotp`` min-max scaled to [0, 1] over all rows. When every row has
        the same dot product, ``scaled`` is 0.5 for all of them. An empty
        file yields an empty list.
    """
    start = datetime.now()
    preds = []
    for _label, row_id, features in get_data_tsv(loc_dataset, opts):
        dotp = dot_product(features, weights)
        preds.append([row_id, 1 if dotp > 0.5 else 0, dotp])

    if preds:
        scores = [p[2] for p in preds]
        min_dotp = min(scores)
        span = max(scores) - min_dotp
        for p in preds:
            if span:
                p.append((p[2] - min_dotp) / float(span))
            else:
                # All scores identical: min-max scaling is undefined, so use
                # a neutral value rather than dividing by zero.
                p.append(0.5)

    print("Done testing in %s"%str(datetime.now()-start))
    return preds

In [None]:
opts = {}
opts["D"] = 2 ** 25
opts["learning_rate"] = 0.1
opts["n_passes"] = 80
opts["errors_satisfied"] = 0
opts["random_init"] = False
opts["clean"] = True
opts["2grams"] = True

%time 
weights = train_tron(path + "labeledTrainData.tsv",opts)

In [None]:
%time 
preds = test_tron(path + "testData.tsv",weights,opts)

In [None]:
# Write the submission file: a header line, then one "id,score" line per
# prediction in sorted order, where the score is the fourth entry of each row.
with open("a_submit_perceptron.csv","wb") as outfile:
    header = '"id","sentiment"\n'
    outfile.write(header.encode('utf-8'))
    outfile.writelines(
        "{},{}\n".format(row[0], row[3]).encode('utf-8')
        for row in sorted(preds)
    )