In [97]:
#importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr, kendalltau
import math
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
import cPickle
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
import scipy.stats as stats
from sklearn.metrics import accuracy_score

In [98]:
def save_classifier(filename, clf):
    with open(filename, 'w') as fid:
        cPickle.dump(clf, fid)
    
def load_classifier(filename):
    with open(filename, 'r') as fid:
        clf = cPickle.load(fid)
    return clf    

In [99]:
#Data import
Data_file = 'data/data.csv'
Label_file = 'data/labels.txt'

print "Reading the data: "
X = pd.read_csv(Data_file, delimiter = ';', header = None)
X = shuffle(X)
y = pd.read_csv(Label_file, sep = ' ', header = None)
y.columns = ['labels']
X.columns = ['fea1', 'fea2', 'fea3', 'fea4']
print
print "Printing the features: "
print X.head()
print
print "Printing the labels: "
print y.head()
print
print "Dimensions: "
print "Features"
print X.shape
print
print "Labels"
print y.shape

Reading the data: 

Printing the features: 
           fea1        fea2   fea3   fea4
647  -80.194066  106.320196 -13.36  48.72
29  -106.548483   93.939088 -13.36  48.72
215   77.078661  -78.574431 -13.36  48.72
573   71.086529   99.097405 -13.36  48.72
842   -5.011151    4.089955 -13.36  48.72

Printing the labels: 
   labels
0       1
1       1
2       1
3       0
4       1

Dimensions: 
Features
(1000, 4)

Labels
(1000, 1)


In [100]:
#Data exploration
print "Feature Summary: "
print X.describe()
print
print "Labels Summary: "
print y.ix[:,0].value_counts()

Feature Summary: 
              fea1         fea2          fea3          fea4
count  1000.000000  1000.000000  1.000000e+03  1.000000e+03
mean      1.191948     0.122606 -1.336000e+01  4.872000e+01
std      71.163924    71.453898  1.368479e-13  6.753534e-13
min    -119.948256  -119.744994 -1.336000e+01  4.872000e+01
25%     -56.495719   -61.059189 -1.336000e+01  4.872000e+01
50%       0.183716     1.094511 -1.336000e+01  4.872000e+01
75%      63.873119    60.266393 -1.336000e+01  4.872000e+01
max     119.997661   119.302587 -1.336000e+01  4.872000e+01

Labels Summary: 
1    687
0    313
Name: labels, dtype: int64


In [101]:
#Plotting the data variables
for num, i in enumerate(X.columns):
    plt.plot(X[i])
    plt.savefig('plot-col'+str(num)+'.png')
    plt.close()

print "Printing correlations: "
print X.corr()
print  
print "Printing correlations along p-values: "
print pearsonr(X.ix[:,0], X.ix[:,1])
print pearsonr(X.ix[:,0], X.ix[:,2])
print pearsonr(X.ix[:,0], X.ix[:,3])
print pearsonr(X.ix[:,1], X.ix[:,2])
print pearsonr(X.ix[:,1], X.ix[:,3])
print pearsonr(X.ix[:,2], X.ix[:,3])
#no correlation

Printing correlations: 
              fea1          fea2          fea3          fea4
fea1  1.000000e+00 -5.979175e-02  9.216368e-18  8.344986e-18
fea2 -5.979175e-02  1.000000e+00  1.885409e-17 -1.528174e-17
fea3  9.216368e-18  1.885409e-17  1.000000e+00 -1.000000e+00
fea4  8.344986e-18 -1.528174e-17 -1.000000e+00  1.000000e+00

Printing correlations along p-values: 
(-0.059791749008238609, 0.058743584315117066)
(-3.676168339633808e-17, 1.0)
(3.676168339633808e-17, 1.0)
(1.5918477011343944e-17, 1.0)
(-1.5918477011343944e-17, 1.0)
(-1.0, 0.0)


In [102]:
#Changing the features:
#Normalizing the features

print "Normalizing the features: "
X_normalize = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
print X_normalize.head()
print

'''
#Changing to logarithm
print "Changing to the log: "
X_log = X.apply(np.log)
print X_log.head()
print "After filling NAs"
X_log = X_log.fillna(X_log.mean())
print X_log.head()
print 
'''
#Changing to square
print "Changing to square: "
X_square = X.apply(np.square)
#X_square = X_square.ix[:,0:2]
print X_square.head()

for num, i in enumerate(X_square.columns):
    plt.plot(X_square[i])
    plt.savefig('squared_plot-col'+str(num)+'.png')
    plt.close()

#X = X_square.ix[:,0:2]
X = X_normalize.ix[:,0:2]
    
#Split into train and test- using the square of the features
frame = pd.concat([y, X], axis = 1)
train, test = train_test_split(frame, test_size=0.2)
#X_train, y_train = train.ix[:,1:], train['labels']
#X_test, y_test = test.ix[:,1:], test['labels']

X_train, y_train = frame.ix[:,1:], frame['labels']
X_test, y_test = frame.ix[:,1:], frame['labels']

Normalizing the features: 
         fea1      fea2  fea3  fea4
647 -1.144214  1.486983   1.0  -1.0
29  -1.514733  1.313622   1.0  -1.0
215  1.066899 -1.101919   1.0  -1.0
573  0.982655  1.385849   1.0  -1.0
842 -0.087210  0.055551   1.0  -1.0

Changing to square: 
             fea1          fea2      fea3       fea4
647   6431.088274  11303.984048  178.4896  2373.6384
29   11352.579149   8824.552234  178.4896  2373.6384
215   5941.119972   6173.941162  178.4896  2373.6384
573   5053.294574   9820.295602  178.4896  2373.6384
842     25.111636     16.727731  178.4896  2373.6384


In [103]:
#Implementing Logistic Regression from Scratch
class LogisticRegressor(object):
    def __init__(self, lbda):
        #lambda for regularization
        self.lbda = lbda
    
    def sigmoid_func(self, theta, x):
        return float(1) / (1 + math.e**(-x.dot(theta)))
    
    def cost_func(self, theta, x, y):
        m = x.shape[0]
        hypo_func = self.sigmoid_func(theta, x)
        term1 = y*np.log(hypo_func)
        term2 = (1-y) * np.log(1-hypo_func)
        final = (-term1-term2) * float(1/m)
        regularized_factor = float(self.lbda/(2*m)) * theta.T.dot(theta)
        return final + regularized_factor
    
    def gradient(self, theta, x, y):
        m = x.shape[0]
        temp = (self.sigmoid_func(theta, x) - np.squeeze(y))* float(1/m)
        regularized_factor = float(self.lbda/m) * theta
        return temp.T.dot(x) + regularized_factor
    
    def grad_desc(self, theta_values, X, y, alpha=.001, converge_change=.01):
        #normalize
        X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
        cost_iter = []
        #compute initial cost function
        cost = self.cost_func(theta_values, X, y)
        #tracking cost at each iterations
        cost_iter.append([0, cost])
        change_cost = 1
        i = 1
        try:
            while change_cost > converge_change:
                old_cost = cost
                #theta value change
                theta_values = theta_values - (alpha * self.gradient(theta_values, X, y))
                #recompute cost function
                cost = self.cost_func(theta_values, X, y)
                cost_iter.append([i, cost])
                change_cost = old_cost - cost
                i+=1
        except:
            pass
        return theta_values, []
        #list(np.array(cost_iter))
    
    def pred_values(self, theta, X, hard=True):
        #normalize
        X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
        pred_prob = self.sigmoid_func(theta, X)
        pred_value = np.where(pred_prob >= .5, 1, 0)
        if hard:
            return pred_value
        return pred_prob

In [104]:
lreg = LogisticRegressor(0.1)
shape = X_train.shape[1]
betas = np.zeros(shape)
theta_values, cost_iter = lreg.grad_desc(betas, X_train, y_train)
print theta_values
predicted_y = lreg.pred_values(theta_values, X_test)
print classification_report(y_test, predicted_y)    

fea1    0.0
fea2    0.0
dtype: float64
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       313
          1       0.69      1.00      0.81       687

avg / total       0.47      0.69      0.56      1000



In [105]:
#Logistic Regressor: scikit-learn
#lreg_sci = LogisticRegression(penalty='l2')
lreg_sci = SGDClassifier(loss='log', penalty='l2', )
lreg_sci.fit(X_train, y_train)
#print lreg_sci
save_classifier('Classifier_logit.pkl', lreg_sci)
#To extract the classifier uncomment the line below
#lreg = extract_classifier('SGDClassi(fier.pkl')

pred = lreg_sci.predict(X_test)
print classification_report(y_test, pred)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       313
          1       0.62      0.74      0.67       687

avg / total       0.42      0.51      0.46      1000

