## READ FILE

In [1]:
import numpy
import pandas
from matplotlib import pyplot
filename= 'pima-indians-diabetes.csv'
#raw_data= open(filename,'rb')

names=['preg','plas','pres','skin', 'test', 'mass','pedi','age','class']
data=pandas.read_csv(filename,names=names)
shape=data.shape
print(shape)

(768, 9)


## Feature Selection

In [2]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

array=data.values
X=array[:,0:8]
Y=array[:,8]

#feature extraction
test=SelectKBest(score_func=chi2,k=4)
fit=test.fit(X,Y)

#summarize success

numpy.set_printoptions(precision=3)
print(fit.scores_)
features=fit.transform(X)
print(names)
# summarizes selected features
print(features[0:5,:])


[  111.52   1411.887    17.605    53.108  2175.565   127.669     5.393
   181.304]
['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
[[ 148.     0.    33.6   50. ]
 [  85.     0.    26.6   31. ]
 [ 183.     0.    23.3   32. ]
 [  89.    94.    28.1   21. ]
 [ 137.   168.    43.1   33. ]]


## Feature extraction




In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

model= LogisticRegression()
rfe= RFE(model,4)
model_fit=rfe.fit(X,Y)
print("Num features:",model_fit.n_features_)
print("Selected features:",model_fit.support_)
print("Feature Ranking:",model_fit.ranking_)

Num features: 4
Selected features: [ True  True False False False  True  True False]
Feature Ranking: [1 1 2 4 5 1 1 3]


## Principle Component Analysis

In [4]:
from sklearn.decomposition import PCA

pca=PCA(n_components=3)
fit=pca.fit(X)

print("Explained Variance",fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance [ 0.889  0.062  0.026]
[[ -2.022e-03   9.781e-02   1.609e-02   6.076e-02   9.931e-01   1.401e-02
    5.372e-04  -3.565e-03]
 [ -2.265e-02  -9.722e-01  -1.419e-01   5.786e-02   9.463e-02  -4.697e-02
   -8.168e-04  -1.402e-01]
 [ -2.246e-02   1.434e-01  -9.225e-01  -3.070e-01   2.098e-02  -1.324e-01
   -6.400e-04  -1.255e-01]]


## Feature Importance with extra trees Calssifier

In [5]:
from sklearn.ensemble import ExtraTreesClassifier

# feature extraction
model= ExtraTreesClassifier()
model.fit(X,Y)
print(model.feature_importances_)
print(names)

[ 0.106  0.226  0.095  0.085  0.071  0.143  0.124  0.149]
['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']


In [12]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

test_size=0.2 

X_train,X_test,Y_train,Y_test=cross_validation.train_test_split(X,Y,test_size=0.2,random_state=6)
model=LogisticRegression()
model.fit(X_train,Y_train)
result= model.score(X_test,Y_test)
print("Accuracy:", result*100.0)

Accuracy: 80.5194805195


In [21]:
num_folds=12
num_instances= len(X_train)
seed=10
kfold= cross_validation.KFold(n=num_instances,n_folds=num_folds,random_state=seed)
model=LogisticRegression()
model.fit(X_train,Y_train)

results=cross_validation.cross_val_score(model,X_train,Y_train,cv=kfold)
print("Accuracy:", results.mean()*100.0,results.std()*100.0)

Accuracy: 76.0589492207 4.23906261875


In [19]:
num_instances= len(X_train)

loocv = cross_validation.LeaveOneOut(n=num_instances)
model.fit(X_train,Y_train)
results=cross_validation.cross_val_score(model,X_train,Y_train,cv=loocv)
print("Accuracy:", results.mean()*100.0,results.std()*100.0)

Accuracy: 76.7100977199 42.2678445132


In [22]:
import sklearn


In [27]:
from sklearn.metrics import confusion_matrix

test_size=0.33 
seed = 7
X_train,X_test,Y_train,Y_test=cross_validation.train_test_split(X,Y,test_size=test_size,random_state=seed)
model=LogisticRegression()
model.fit(X_train,Y_train)
predicated = model.predict(X_test)
matrix = confusion_matrix(Y_test,predicated)
print(matrix)

[[141  21]
 [ 41  51]]


In [28]:
from sklearn.metrics import classification_report

report = classification_report(Y_test,predicated)
print(report)

             precision    recall  f1-score   support

        0.0       0.77      0.87      0.82       162
        1.0       0.71      0.55      0.62        92

avg / total       0.75      0.76      0.75       254

