# Notes #
1. Data preparation
    + Should we normalize the dataset? If yes, then why?
    + Basic steps
        - Load data
        - Check for missing values (*no missing values in this case*)
        - Convert categorical fields to numerical (all categorical fields are already  
        represented as digits)
        - Check class distribution
            + Filtering the dataset with the first two classes.
            + The split should then be roughly 50/50.
        - Normalize data if necessary


**Rough notes**  
1.

---

**Data preparation**

In [2]:
# Data preparation #

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the CSV with pandas. The file has no header row, so columns are
# addressed by integer position; the last column (54) is the class label.
data = pd.read_csv("../data/covtype.data.csv", delimiter=",", header=None, index_col=False)

# Keep only the first two classes so the task is binary classification.
data = data[data[54].isin([1, 2])]

Y = data.iloc[:, -1]
X_raw = data.iloc[:, :-1]

# Split BEFORE scaling: fitting the scaler on the full dataset would leak
# test-set mean/variance into the training features. The row partition is
# unchanged, since it depends only on the row count and random_state.
x_train, x_test, y_train, y_test = train_test_split(
    X_raw, Y, test_size=0.60, random_state=0
)

# Normalize the X values using statistics learned from the training rows only.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# X remains a scaled NumPy array of the whole filtered dataset (used for the
# shape check below and by the cross-validation cell).
X = sc.transform(X_raw)

X.shape


(495141, 54)

---

**Keras Model**

In [None]:
# Keras #

### Logistic regression using DNN ###
# A single Dense unit with a sigmoid activation is exactly logistic regression.
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

np.random.seed(7)

# binary_crossentropy and the accuracy metric expect targets in {0, 1}, but the
# dataset labels are 1 and 2. Map class 2 -> 1 and class 1 -> 0 in separate
# arrays so y_train / y_test keep their original labels for the other cells.
y_train_bin = np.asarray(y_train == 2, dtype="float32")
y_test_bin = np.asarray(y_test == 2, dtype="float32")

# Defining model #

model = Sequential()
# Take the input width from the data instead of hard-coding it.
model.add(Dense(1, input_dim=x_train.shape[1], activation='sigmoid'))

# Compile the model #

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model #

model.fit(x_train, y_train_bin, epochs=100, batch_size=1000)

# Evaluate the model #

scores = model.evaluate(x_test, y_test_bin)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

---

**Scikit model**

In [20]:
# Scikit learn #

# Baseline: logistic regression with scikit-learn's default settings.
from sklearn.linear_model import LogisticRegression

logisticRegr = LogisticRegression()

# Fit on the training split.
logisticRegr.fit(X=x_train, y=y_train)

# Predicted class label for every row of the held-out split.
predictions = logisticRegr.predict(X=x_test)
print(predictions)

# Mean accuracy on the held-out split.
score = logisticRegr.score(X=x_test, y=y_test)
print(score)


[2 2 1 ... 2 1 2]
0.7756837268795126


---

**Kfold validation**

In [None]:
# Validation #
# Stratified 10-fold cross-validation of the single-unit (logistic regression)
# Keras model; prints the accuracy per fold and the mean +/- std at the end.
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import StratifiedKFold
import numpy

seed = 7
numpy.random.seed(seed)

# X is a NumPy array once it has been through the scaler, so it has no .iloc;
# convert both inputs to arrays and index them positionally.
features = numpy.asarray(X)
# binary_crossentropy needs targets in {0, 1}; the dataset labels are 1 and 2.
targets = numpy.asarray(numpy.asarray(Y) == 2, dtype="float32")

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []

for train, test in kfold.split(features, targets):
    # create model (input width taken from the data rather than hard-coded)
    model = Sequential()
    model.add(Dense(1, input_dim=features.shape[1], activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fit the model
    model.fit(features[train], targets[train], epochs=500, batch_size=100, verbose=0)
    # evaluate the model
    scores = model.evaluate(features[test], targets[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))


---

**ROC Curve and Confusion matrix**

In [None]:
# Prediction #
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# model.predict returns one sigmoid score per row; keep it as the raw score.
y_pred = model.predict(x_test)
y_score = np.ravel(y_pred)

# y_test holds the labels 1 and 2, so roc_curve cannot guess the positive
# class and must be told explicitly.
# NOTE(review): assumes a higher score means class 2 - confirm against how the
# model was trained.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score, pos_label=2)
roc_auc = metrics.auc(fpr, tpr)


plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


# Creating the Confusion Matrix
# confusion_matrix needs discrete class labels, not continuous scores:
# threshold at 0.5 and map back to the dataset's label values (1 / 2).
y_pred_labels = np.where(y_score > 0.5, 2, 1)
cm = confusion_matrix(y_test, y_pred_labels, labels=[1, 2])
print(cm)

---

**Classification Report**

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# y_pred holds continuous sigmoid scores; classification_report needs class
# labels, so threshold at 0.5 and map back to the dataset's labels (1 / 2).
# NOTE(review): assumes a higher score means class 2 - confirm against how the
# model was trained.
report_labels = np.where(np.ravel(y_pred) > 0.5, 2, 1)

# Covertype class names for labels 1 and 2 (the previous names belonged to a
# different dataset).
target_names = ['Spruce/Fir', 'Lodgepole Pine']
print(classification_report(y_test, report_labels, labels=[1, 2], target_names=target_names))

---

# Pending #
1. See if you can fine-tune the Keras model (75% acc) to match the scikit-learn model (78% acc)
2. Exploratory analysis of the dataset
3. LaTeX explanations
4. **Cross check what's wrong with ROC  
and Confusion matrix**