In [2]:
import numpy as np
from pandas import DataFrame
import pandas as pd
import scipy as sp
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn import cross_validation

In [3]:
# Display NumPy floats with 5 digits of precision in fixed-point (non-scientific) notation.
np.set_printoptions(precision=5,suppress=True)

Reading data:

In [4]:
# Load the tab-separated table as a float array.
# np.genfromtxt is called directly: `sp.genfromtxt` was only NumPy's function
# re-exported through the top-level scipy namespace, which newer SciPy
# releases no longer provide.
dataset = np.genfromtxt("data5/cleaned.txt", delimiter="\t")
dataset.shape

(712L, 26L)

In [5]:
# Drop the column at index 25 (the 26th column of the loaded table).
# This cell rebinds `dataset`, so it is not meant to be re-run without
# reloading the data first.
dataset = np.delete(dataset, [25], axis=1)
dataset.shape

(712L, 25L)

In [6]:
# Keep only the rows in which no entry is NaN.
dataset = dataset[~np.any(np.isnan(dataset), axis=1)]
dataset.shape

(712L, 25L)

In [7]:
# Column name -> zero-based column index, built from the names in column order.
columns = dict(
    (name, index)
    for index, name in enumerate([
        "GP", "GS", "MIN",
        "FGM", "FGA", "FG%",
        "3PM", "3PA", "3P%",
        "FTM", "FTA", "FT%",
        "OFF", "DEF", "TRB",
        "AST", "STL", "BLK",
        "PF", "TOV", "PTS",
        "YR", "W", "H",
    ])
)

# Labels

In [8]:
def np_labeliser(data,col):
    """Return column `col` of the 2-D array `data` as a 1-D array.

    The result comes from plain slicing (`data[:, col]`), so no copy of
    the underlying values is requested here.
    """
    return data[:, col]

In [9]:
# Use column 22 of the cleaned table as the classification target.
# NOTE(review): `columns` names index 22 "W", while the first values shown
# below are small integers between 1 and 5 - confirm this is the intended
# target column.
labels = np_labeliser(dataset, col=22)
labels[:10]

array([ 4.,  4.,  4.,  5.,  5.,  4.,  1.,  5.,  3.,  1.])

# Feature Selection

In [10]:
def np_featuriser(dataset, feature_list):
    """Return a copy of `dataset` without the columns listed in `feature_list`.

    `feature_list` holds zero-based column indices; the input array is
    left unmodified because `np.delete` builds a new array.
    """
    return np.delete(dataset, feature_list, axis=1)

In [11]:
# Remove the label column (index 22) so it is not used as a predictor, and
# print the column count before and after as a sanity check.
# print(...) with a single argument behaves the same on Python 2 and 3.
feature_list = [22]
print(len(dataset[0]))
features = np_featuriser(dataset, feature_list)
print(len(features[0]))

25
24


In [12]:
def sup_features(usp_list, x, names=None):
    """Turn a selector support mask into the list of column positions to drop.

    Parameters
    ----------
    usp_list : iterable of bool
        Support mask, one entry per column; a false entry means the
        selector rejected that column.
    x : str
        Reporting mode. "vt" prints every rejected column, "uni" prints
        every kept column; any other value prints nothing.
    names : dict, optional
        Mapping from column position to display name. When omitted, the
        inverse of the module-level `columns` dict is used.

    Returns
    -------
    list of int
        Positions of the rejected columns, in ascending order.

    NOTE(review): positions are relative to the matrix the mask was computed
    on. If columns were removed from the dataset before selection, the default
    names taken from `columns` are shifted for every position after the removed
    column - pass `names` explicitly in that case.
    """
    if names is None:
        # Build the reverse lookup once instead of scanning the dict per column.
        names = dict((index, name) for name, index in columns.items())

    remove = []
    for j, keep in enumerate(usp_list):
        # Positions without a known name get a placeholder instead of raising.
        line = "%s. feature name: %s" % (j, names.get(j, "<unknown>"))
        if not keep:
            remove.append(j)
            if x == "vt":
                print(line)
        elif x == "uni":
            print(line)

    return remove

In [13]:
def feature_selection(clf, features, labels, domain):
    """Build four alternative feature matrices from `features`.

    Parameters
    ----------
    clf : estimator
        Unused by the selection itself; kept so existing callers keep working.
        (Earlier revisions fitted it inside a Pipeline, and the second Pipeline
        accidentally wrapped the first one because `clf` had been rebound.)
    features : 2-D array
        Full feature matrix.
    labels : 1-D array
        Class labels, needed by the univariate selector.
    domain : list of int
        Column positions to drop for the hand-picked "domain" variant.

    Returns
    -------
    tuple of four 2-D arrays
        (all features, domain-filtered, univariate-filtered,
        variance-threshold-filtered), in that order.

    Both selectors are fitted on every row passed in, and the names of the
    kept or removed columns are printed through `sup_features`.
    """
    # Hand-picked selection: drop the columns listed by the caller.
    domain_features = np_featuriser(features, domain)

    # Univariate selection: keep the top 20% of columns by ANOVA F-score.
    univariate = SelectPercentile(f_classif, percentile=20)
    univariate.fit(features, labels)
    print("\nUnivariate - valuable features \n")
    uni_removed = sup_features(univariate.get_support(), "uni")
    uni_features = np_featuriser(features, uni_removed)

    # Variance threshold: drop columns whose variance is lower than .8 * (1 - .8).
    variance = VarianceThreshold(threshold=(.8 * (1 - .8)))
    variance.fit(features, labels)
    print("\nVariance Threshold - removed \n")
    vt_removed = sup_features(variance.get_support(), "vt")
    vt_features = np_featuriser(features, vt_removed)

    return features, domain_features, uni_features, vt_features

# Column positions excluded by hand for the "domain knowledge" variant.
domain = list(
    columns[name]
    for name in (
        "GP", "GS", "MIN", "FG%",
        "3P%", "FT%", "PTS", "YR", '3PM', 'FTM', 'FGM',
    )
)


In [14]:
def cross_val(clf, f, l, name):
    """Run 10-fold cross-validation and print the mean accuracy.

    Parameters
    ----------
    clf : estimator
        Classifier to evaluate.
    f : 2-D array
        Feature matrix.
    l : 1-D array
        Class labels.
    name : str
        Label of the feature-selection variant, used only in the printout.

    Prints the mean score and twice its standard deviation; returns nothing.
    """
    # `sklearn.cross_validation` was replaced by `sklearn.model_selection`;
    # prefer the new location and fall back for old scikit-learn releases.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    print("\nFeature selection: %s" % name)
    scores = cross_val_score(clf, f, l, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [15]:
def clf_all(clf, features, labels, domain):
    """Cross-validate `clf` on each feature-selection variant and report it.

    Parameters
    ----------
    clf : estimator
        Classifier to evaluate.
    features : 2-D array
        Full feature matrix.
    labels : 1-D array
        Class labels.
    domain : list of int
        Column positions dropped for the "Domain" variant.

    For every variant returned by `feature_selection`, the cross-validated
    accuracy is printed via `cross_val`, followed by the number of columns
    in that variant.
    """
    none, domain_set, uni, vth = feature_selection(clf, features, labels, domain)

    variants = [
        ("None", none),
        ("Domain", domain_set),
        ("Univariate", uni),
        ("Variance Threshold", vth),
    ]
    for title, matrix in variants:
        cross_val(clf, matrix, labels, title)
        print("Number of features left: %s" % matrix.shape[1])

# All Results

In [16]:
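# NOTE: dead code from an earlier version, kept for reference only. It does not match the
# current notebook: `columns` has no "POS" key and np_featuriser takes (dataset, feature_list).
#feature_list = [columns["GP"],columns["GS"],columns["MIN"],columns["PTS"],columns["FG%"],
#     columns["3P%"],columns["FT%"],columns["YR"],columns["POS"],columns['3PM'],columns['FTM'],columns['FGM']]
#train_features_nb, test_features_nb, val_features_nb = np_featuriser(train_set_nb, test_set_nb, val_set_nb, feature_list)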

In [17]:
# Gaussian naive Bayes, evaluated on every feature-selection variant.
clf_all(GaussianNB(), features, labels, domain)


Univariate - valuable features 

7. feature name: 3PA
8. feature name: 3P%
15. feature name: AST
22. feature name: W
23. feature name: H

Variance Threshold - removed 

5. feature name: FG%
8. feature name: 3P%
11. feature name: FT%

Feature selection: None
Accuracy: 0.70 (+/- 0.15)
Number of features left: 24

Feature selection: Domain
Accuracy: 0.74 (+/- 0.09)
Number of features left: 13

Feature selection: Univariate
Accuracy: 0.77 (+/- 0.11)
Number of features left: 5

Feature selection: Variance Threshold
Accuracy: 0.71 (+/- 0.11)
Number of features left: 21


In [18]:
# Support vector classifier with a linear kernel, evaluated on every
# feature-selection variant.
svm = SVC(kernel='linear')
clf_all(svm, features, labels, domain)


Univariate - valuable features 

7. feature name: 3PA
8. feature name: 3P%
15. feature name: AST
22. feature name: W
23. feature name: H

Variance Threshold - removed 

5. feature name: FG%
8. feature name: 3P%
11. feature name: FT%

Feature selection: None
Accuracy: 0.82 (+/- 0.12)
Number of features left: 24

Feature selection: Domain
Accuracy: 0.82 (+/- 0.09)
Number of features left: 13

Feature selection: Univariate
Accuracy: 0.81 (+/- 0.09)
Number of features left: 5

Feature selection: Variance Threshold
Accuracy: 0.82 (+/- 0.10)
Number of features left: 21


In [19]:
# Logistic regression, evaluated on every feature-selection variant.
# C is scikit-learn's inverse regularization strength, so C=1e5 applies
# only very weak regularization.
logreg = linear_model.LogisticRegression(C=1e5)
clf_all(logreg, features, labels, domain)


Univariate - valuable features 

7. feature name: 3PA
8. feature name: 3P%
15. feature name: AST
22. feature name: W
23. feature name: H

Variance Threshold - removed 

5. feature name: FG%
8. feature name: 3P%
11. feature name: FT%

Feature selection: None
Accuracy: 0.69 (+/- 0.15)
Number of features left: 24

Feature selection: Domain
Accuracy: 0.72 (+/- 0.11)
Number of features left: 13

Feature selection: Univariate
Accuracy: 0.72 (+/- 0.08)
Number of features left: 5

Feature selection: Variance Threshold
Accuracy: 0.70 (+/- 0.12)
Number of features left: 21
