In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Load the training feature matrix from its joblib pickle.  The bare `X` on the
# last line echoes the object so the cell output records its shape and sparse
# storage format.
X = joblib.load('Pickles/Sparse_Basic+Temporal+Recency/train.pkl')
X

<8483920x217691 sparse matrix of type '<type 'numpy.float64'>'
	with 154318299 stored elements in COOrdinate format>

In [3]:
# Load the training labels; displaying the shape lets the reader check that the
# number of labels matches the number of rows in X.
y = joblib.load('Pickles/labels.pkl')
y.shape

(8483920L,)

In [4]:
# Load the test feature matrix (same feature set directory as train.pkl) and
# echo it so the output shows its shape; its column count must match X for
# predict_proba to work.
X_test = joblib.load('Pickles/Sparse_Basic+Temporal+Recency/test.pkl')
X_test

<508912x217691 sparse matrix of type '<type 'numpy.float64'>'
	with 9340125 stored elements in COOrdinate format>

### LR1: logistic regression, L2 penalty, C=10

In [5]:
# LR1 configuration: logistic regression with C=10 and the default penalty
# (no `penalty` argument is passed), preceded by variance scaling.
#
# with_mean=False scales each feature without centring it, which keeps a
# sparse input sparse.  copy=False asks the scaler to work in place where it can.
# NOTE(review): whether the caller's X is actually modified in place depends on
# how scikit-learn converts the input matrix format -- confirm before reusing X.
scaler = StandardScaler(copy=False, with_mean=False)

# NOTE(review): scikit-learn documents n_jobs as having no effect with the
# liblinear solver -- confirm for the installed version.
lr = LogisticRegression(
    C=10.0,
    n_jobs=-1,
    random_state=0,
    verbose=10,
)

clf = Pipeline(steps=[('scaler', scaler), ('logreg', lr)])

In [6]:
# Fit the whole pipeline (scaler first, then the logistic regression) on the
# training data.  fit() returns the pipeline, so its repr is the cell output;
# verbose=10 on the estimator is what makes the solver print its banner.
clf.fit(X, y)

[LibLinear]



Pipeline(steps=[('scaler', StandardScaler(copy=False, with_mean=False, with_std=True)), ('logreg', LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=10, warm_start=False))])

In [7]:
# Persist the fitted pipeline (scaler + model together) so test predictions can
# be regenerated without retraining.  compress=3 selects joblib's compression
# level; the call's return value (the list of written files) is the cell output.
joblib.dump(clf, 'Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L2C10/model.pkl', compress=3)

['Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L2C10/model.pkl']

In [8]:
# Predict class probabilities for every test row and echo the array.
# The result has one column per class, ordered as in clf.classes_; the
# submission cell treats column 1 as P(label == 1).
# NOTE(review): that reading assumes the labels are exactly {0, 1} -- confirm.
probas = clf.predict_proba(X_test)
probas

array([[ 0.148954  ,  0.851046  ],
       [ 0.1383367 ,  0.8616633 ],
       [ 0.00929406,  0.99070594],
       ..., 
       [ 0.24137058,  0.75862942],
       [ 0.04771305,  0.95228695],
       [ 0.03362875,  0.96637125]])

In [9]:
import csv

# Split the predicted P(class 1) values by which side of the decision boundary
# each row falls on.  A row counts as a predicted 1 only when column 1 is
# strictly greater than column 0; ties (and NaNs) land in `zeros`.  Both lists
# hold the class-1 probability, and only their sizes are reported.
# Assumes probas is a 2-column numpy array (as returned by predict_proba for a
# binary problem); the comparison is done on whole columns instead of looping
# over ~500k rows in Python.
one_probs = probas[:, 1]
predicted_one = one_probs > probas[:, 0]
ones = list(one_probs[predicted_one])
zeros = list(one_probs[~predicted_one])

# %-formatting produces the same "N M" line whether print is the Python 2
# statement or the Python 3 function.
print("%d %d" % (len(ones), len(zeros)))

# Write the tab-separated submission file: a header row, then one line per
# test row holding a 1-based row id and P(class 1).
# 'wb' is the mode the Python 2 csv module expects; under Python 3 this would
# have to become open(..., 'w', newline='').
with open('Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L2C10/algebra_2008_2009_submission.txt', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    writer.writerow(['Row', 'Correct First Attempt'])
    # tolist() turns the numpy scalars into plain Python floats, so each value
    # is written exactly as float(p) would be.
    writer.writerows(enumerate(one_probs.tolist(), 1))

484686 24226


### LR2: logistic regression, L2 penalty, C=1

In [10]:
# Load the training feature matrix from its joblib pickle for this run and echo
# it so the cell output records its shape and sparse storage format.
X = joblib.load('Pickles/Sparse_Basic+Temporal+Recency/train.pkl')
X

<8483920x217691 sparse matrix of type '<type 'numpy.float64'>'
	with 154318299 stored elements in COOrdinate format>

In [11]:
# Load the training labels; the displayed shape should agree with the number
# of rows in X.
y = joblib.load('Pickles/labels.pkl')
y.shape

(8483920L,)

In [12]:
# Load the test feature matrix and echo it; its column count must match X for
# predict_proba to work.
X_test = joblib.load('Pickles/Sparse_Basic+Temporal+Recency/test.pkl')
X_test

<508912x217691 sparse matrix of type '<type 'numpy.float64'>'
	with 9340125 stored elements in COOrdinate format>

In [13]:
# LR2 configuration: logistic regression with C=1 and the default penalty
# (no `penalty` argument is passed), preceded by variance scaling.
#
# with_mean=False scales each feature without centring it, which keeps a
# sparse input sparse.  copy=False asks the scaler to work in place where it can.
# NOTE(review): whether the caller's X is actually modified in place depends on
# how scikit-learn converts the input matrix format -- confirm before reusing X.
scaler = StandardScaler(copy=False, with_mean=False)

# NOTE(review): scikit-learn documents n_jobs as having no effect with the
# liblinear solver -- confirm for the installed version.
lr = LogisticRegression(
    C=1.0,
    n_jobs=-1,
    random_state=0,
    verbose=10,
)

clf = Pipeline(steps=[('scaler', scaler), ('logreg', lr)])

In [14]:
# Fit the whole pipeline (scaler first, then the logistic regression) on the
# training data.  fit() returns the pipeline, so its repr is the cell output.
clf.fit(X, y)

[LibLinear]

Pipeline(steps=[('scaler', StandardScaler(copy=False, with_mean=False, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=10, warm_start=False))])

In [15]:
# Persist the fitted pipeline (scaler + model together) under the L2C1 run
# directory.  compress=3 selects joblib's compression level; the returned list
# of written files is the cell output.
joblib.dump(clf, 'Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L2C1/model.pkl', compress=3)

['Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L2C1/model.pkl']

In [16]:
# Predict class probabilities for every test row and echo the array.
# Columns follow clf.classes_; the submission cell treats column 1 as
# P(label == 1).
# NOTE(review): that reading assumes the labels are exactly {0, 1} -- confirm.
probas = clf.predict_proba(X_test)
probas

array([[ 0.14916571,  0.85083429],
       [ 0.13860786,  0.86139214],
       [ 0.0070964 ,  0.9929036 ],
       ..., 
       [ 0.24340679,  0.75659321],
       [ 0.04771555,  0.95228445],
       [ 0.03381186,  0.96618814]])

In [17]:
import csv

# Split the predicted P(class 1) values by which side of the decision boundary
# each row falls on.  A row counts as a predicted 1 only when column 1 is
# strictly greater than column 0; ties (and NaNs) land in `zeros`.  Both lists
# hold the class-1 probability, and only their sizes are reported.
# Assumes probas is a 2-column numpy array (as returned by predict_proba for a
# binary problem); the comparison is done on whole columns instead of looping
# over ~500k rows in Python.
one_probs = probas[:, 1]
predicted_one = one_probs > probas[:, 0]
ones = list(one_probs[predicted_one])
zeros = list(one_probs[~predicted_one])

# %-formatting produces the same "N M" line whether print is the Python 2
# statement or the Python 3 function.
print("%d %d" % (len(ones), len(zeros)))

# Write the tab-separated submission file: a header row, then one line per
# test row holding a 1-based row id and P(class 1).
# 'wb' is the mode the Python 2 csv module expects; under Python 3 this would
# have to become open(..., 'w', newline='').
with open('Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L2C1/algebra_2008_2009_submission.txt', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    writer.writerow(['Row', 'Correct First Attempt'])
    # tolist() turns the numpy scalars into plain Python floats, so each value
    # is written exactly as float(p) would be.
    writer.writerows(enumerate(one_probs.tolist(), 1))

484650 24262


### LR3: logistic regression, L1 penalty, C=0.001

In [2]:
# Load the training feature matrix from its joblib pickle and echo it so the
# cell output records its shape and sparse storage format.
X = joblib.load('Pickles/Sparse_Basic+Temporal+Recency/train.pkl')
X

<8483920x217691 sparse matrix of type '<type 'numpy.float64'>'
	with 154318299 stored elements in COOrdinate format>

In [3]:
# Load the training labels; the displayed shape should agree with the number
# of rows in X.
y = joblib.load('Pickles/labels.pkl')
y.shape

(8483920L,)

In [4]:
# Load the test feature matrix and echo it; its column count must match X for
# predict_proba to work.
X_test = joblib.load('Pickles/Sparse_Basic+Temporal+Recency/test.pkl')
X_test

<508912x217691 sparse matrix of type '<type 'numpy.float64'>'
	with 9340125 stored elements in COOrdinate format>

In [5]:
# LR penalty L1 C0.001
# (C is 0.001 here, matching the L1C001 output directory used by this run.)
# NOTE(review): scikit-learn documents n_jobs as having no effect with the
# liblinear solver -- confirm for the installed version.
lr = LogisticRegression(C=0.001, random_state=0, n_jobs=-1, verbose=10, penalty='l1')
# with_mean=False scales features without centring them, which keeps a sparse
# input sparse; copy=False asks the scaler to work in place where it can.
scaler = StandardScaler(with_mean=False, copy=False)
# Scale first, then classify; the step names are the keys used in the repr.
clf = Pipeline([('scaler', scaler), ('logreg', lr)])

In [6]:
# Fit the whole pipeline (scaler first, then the logistic regression) on the
# training data.  fit() returns the pipeline, so its repr is the cell output.
clf.fit(X, y)

[LibLinear]

Pipeline(steps=[('scaler', StandardScaler(copy=False, with_mean=False, with_std=True)), ('logreg', LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=10, warm_start=False))])

In [7]:
# Persist the fitted pipeline (scaler + model together) under the L1C001 run
# directory.  compress=3 selects joblib's compression level; the returned list
# of written files is the cell output.
joblib.dump(clf, 'Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L1C001/model.pkl', compress=3)

['Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L1C001/model.pkl']

In [8]:
# Predict class probabilities for every test row and echo the array.
# Columns follow clf.classes_; the submission cell treats column 1 as
# P(label == 1).
# NOTE(review): that reading assumes the labels are exactly {0, 1} -- confirm.
probas = clf.predict_proba(X_test)
probas

array([[ 0.14287717,  0.85712283],
       [ 0.13367135,  0.86632865],
       [ 0.00593404,  0.99406596],
       ..., 
       [ 0.27948154,  0.72051846],
       [ 0.05949028,  0.94050972],
       [ 0.04949227,  0.95050773]])

In [9]:
import csv

# Split the predicted P(class 1) values by which side of the decision boundary
# each row falls on.  A row counts as a predicted 1 only when column 1 is
# strictly greater than column 0; ties (and NaNs) land in `zeros`.  Both lists
# hold the class-1 probability, and only their sizes are reported.
# Assumes probas is a 2-column numpy array (as returned by predict_proba for a
# binary problem); the comparison is done on whole columns instead of looping
# over ~500k rows in Python.
one_probs = probas[:, 1]
predicted_one = one_probs > probas[:, 0]
ones = list(one_probs[predicted_one])
zeros = list(one_probs[~predicted_one])

# %-formatting produces the same "N M" line whether print is the Python 2
# statement or the Python 3 function.
print("%d %d" % (len(ones), len(zeros)))

# Write the tab-separated submission file: a header row, then one line per
# test row holding a 1-based row id and P(class 1).
# 'wb' is the mode the Python 2 csv module expects; under Python 3 this would
# have to become open(..., 'w', newline='').
with open('Pickles/Sparse_Basic+Temporal+Recency/TrainedCLFs/LogReg/L1C001/algebra_2008_2009_submission.txt', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    writer.writerow(['Row', 'Correct First Attempt'])
    # tolist() turns the numpy scalars into plain Python floats, so each value
    # is written exactly as float(p) would be.
    writer.writerows(enumerate(one_probs.tolist(), 1))

486814 22098
