# Notes #
1. Data preparation
    + Prepare the data so that the class balance differs from that of
    the previous datasets
    + No missing values
    + What kind of split should I use (for the classes)?
        + 10/90 
2. Keras model
3. Scikit model
    - Note: check the Keras wrapper for cross-validation
4. Validation


**Rough notes**
1. Check the class distribution of iris dataset
2. Class distribution details
    + Iris - 50/50
    + Adult salary - 25/75

---

**Data preparation**

In [1]:
# Data preparation #
#
# Loads the abalone CSV, reports columns that contain missing values,
# integer-encodes the categorical 'sex' column, reduces the problem to a
# binary one (rings == 9 -> class 0, rings == 10 -> class 1) and builds the
# train/test split used by the cells below.

# Load CSV using Pandas
import pandas as pd
import numpy as np

names = ["sex", "length", "diameter", "height", "whole weight",
        "shucked weight", "viscera weight", "shell weight", "rings"]
data = pd.read_csv("abalone.data.csv", delimiter=",", header=None, names=names, index_col=False)
data.head()

# Check for columns that contain missing values #
col_names = data.columns

num_data = data.shape[0]
for c in col_names:
    # isna() counts both None and NaN; isin([None]) does not reliably
    # match NaN in numeric columns, so missing values could go unreported.
    num_non = data[c].isna().sum()
    if num_non > 0:
        print (c)
        print (num_non)
        print ("{0:.2f}%".format(float(num_non) / num_data * 100))
        print ("\n")



# Convert categorical fields #
categorical_col = ['sex']


for col in categorical_col:
    # return_inverse yields, for every row, the index of its value in the
    # sorted array of unique values -> an integer code per category.
    b, c = np.unique(data[col], return_inverse=True)
    data[col] = c
    
# col_names_distribution = ['rings']

# for c in col_names_distribution:
#     print(c)
#     print(data[c].value_counts())
    
# Filter dataset to contain 'rings' 9 and 10 #
# .copy() makes the filtered frame independent of the original, so the
# column assignment below is not a chained assignment on a slice
# (avoids pandas' SettingWithCopyWarning).
data = data[data['rings'].isin([9,10])].copy()
data['rings'] = data['rings'].map({9: 0, 10: 1})


# NOTE(review): names[:7] stops at 'viscera weight', so 'shell weight' is
# not used as a feature. The Keras models hard-code input_dim=7, so the two
# must be changed together if 'shell weight' should be included -- confirm.
feature_list = names[:7]
# Test train split #
X = data.loc[:, feature_list]
Y = data[['rings']]  # single-column DataFrame (not a 1-D Series)

# Split the dataset into test and train datasets
# test_size=0.60 holds out 60% of the rows for testing.

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=0)


---

**Keras Model**

In [2]:
# Keras #

### Logistic regression using DNN ###
# A single Dense unit with a sigmoid activation over the 7 input features,
# trained with binary cross-entropy, i.e. a logistic-regression model.
import numpy as np
from keras.models import Sequential
# Import from the public `keras.layers` package (as the K-fold cell does)
# rather than the legacy `keras.layers.core` submodule.
from keras.layers import Dense, Activation
from keras.utils import np_utils

# Seeds NumPy's global random generator only.
np.random.seed(7)

# Defining model #

model = Sequential()
model.add(Dense(1,input_dim=7,activation='sigmoid'))

# Compile the model #

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model #

model.fit(x_train, y_train, epochs=1500, batch_size=200, verbose=0)

# Evaluate the model #

# scores[0] is the loss, scores[1] the accuracy metric requested in compile().
scores = model.evaluate(x_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

Using TensorFlow backend.



acc: 59.07%


---

**Scikit model**

In [3]:
# Scikit learn #
#
# Fits a scikit-learn logistic regression on the training split and prints
# its mean accuracy on the test split.

# Import and create an instance of your model(Logistic regression)

from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()

# Train your model using the training dataset
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target
# and otherwise emits a DataConversionWarning, so flatten it explicitly.

logisticRegr.fit(x_train, y_train.values.ravel())

# Predict the output 

predictions = logisticRegr.predict(x_test)
# print(predictions)

# score() returns the mean accuracy on the given test data and labels.
score = logisticRegr.score(x_test, y_test.values.ravel())
print(score)


0.5944584382871536


  y = column_or_1d(y, warn=True)


---

**Kfold validation**

In [4]:
# Validation #
#
# 10-fold stratified cross-validation of the single-unit sigmoid model.
# A fresh model is built and trained for every fold; per-fold accuracy is
# printed, followed by the mean and standard deviation over all folds.
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import StratifiedKFold
import numpy

seed = 7
numpy.random.seed(seed)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []

for train, test in kfold.split(X, Y):
    # Rows of this fold, selected by position.
    fold_x_train, fold_y_train = X.iloc[train], Y.iloc[train]
    fold_x_test, fold_y_test = X.iloc[test], Y.iloc[test]

    # Create and compile a new model for this fold.
    # NOTE: `model` is rebound on every iteration, so after the loop it
    # refers to the model trained on the last fold.
    model = Sequential([Dense(1, input_dim=7, activation='sigmoid')])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    # Train on the fold's training rows, then score on its held-out rows.
    model.fit(fold_x_train, fold_y_train, epochs=500, batch_size=100, verbose=0)
    scores = model.evaluate(fold_x_test, fold_y_test, verbose=0)

    fold_accuracy_pct = scores[1] * 100
    print("%s: %.2f%%" % (model.metrics_names[1], fold_accuracy_pct))
    cvscores.append(fold_accuracy_pct)

# Summary: mean accuracy (+/- standard deviation) across the folds.
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))


acc: 61.65%
acc: 52.63%
acc: 55.64%
acc: 65.41%
acc: 65.91%
acc: 57.58%
acc: 60.61%
acc: 56.82%
acc: 59.09%
acc: 53.44%
58.88% (+/- 4.34%)


---

**ROC Curve and Confusion matrix**

In [5]:
# Prediction #
#
# Plots the ROC curve of `model` on the test split and prints the confusion
# matrix of its predictions thresholded at 0.5.
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Continuous sigmoid outputs of the model (one score per test row).
y_prob = model.predict(x_test)
# Hard class decisions, used for the confusion matrix below.
y_pred = (y_prob>0.5)

# The ROC curve must be built from the continuous scores: feeding it the
# thresholded 0/1 predictions collapses the curve to one operating point.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_prob.ravel())
roc_auc = metrics.auc(fpr, tpr)


plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line (true positive rate == false positive rate).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


# Creating the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

<Figure size 640x480 with 1 Axes>

[[284 126]
 [194 190]]


---

**Classification Report**

In [6]:
from sklearn.metrics import classification_report

# Text report of the thresholded predictions against the test labels,
# with the two classes labelled '0' and '1'.
target_names = ['0', '1']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

             precision    recall  f1-score   support

          0       0.59      0.69      0.64       410
          1       0.60      0.49      0.54       384

avg / total       0.60      0.60      0.59       794



---

# Pending #
1. Why is the performance so low with both methods?
2. Fix the ROC curve: it should be computed from predicted probabilities, not from thresholded labels