# Welcome to Jupyter!

In [None]:
import pandas as pd

# Load the raw data set from the CSV file in the working directory.
X = pd.read_csv("PS2_data.csv")

# Show the dtype of every column; the dtype listing also shows the column names.
print("Data Type of the features: \n")
print(X.dtypes)
print('\n')

# Show, per column, whether it contains at least one missing value.
print("Check for any null/empty values in the features: \n")
print(X.isnull().any())

In [None]:
# Create dummy (one-hot) columns for the qualitative features

qual_vars = ['Department', 'salary']

# Only encode the columns that are still present, so re-running this cell
# after X has already been encoded is a no-op instead of a KeyError.
cols_to_encode = [var for var in qual_vars if var in X.columns]
if cols_to_encode:
    # get_dummies(columns=...) replaces each listed column with its
    # `<column>_<value>` indicator columns, appended after the remaining
    # columns. The original categorical columns are therefore removed by
    # name, not by a hard-coded position that depends on the CSV layout.
    X = pd.get_dummies(X, columns=cols_to_encode)

X.columns.values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class balance of the target column: rows with Attrition == 1 versus == 0.
pos = int((X["Attrition"] == 1).sum())
neg = int((X["Attrition"] == 0).sum())
print("Positive examples = {}".format(pos))
print("Negative examples = {}".format(neg))
print("Proportion of positive to negative examples = {:.2f}%".format((pos / neg) * 100))

# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="Attrition", data=X)
plt.xticks((0, 1), ["Didn't leave", "Left"])
plt.xlabel("Left")
plt.ylabel("Count")
plt.title("Class counts");

In [None]:
## Split Data into two sets in the ratio of 80:20

from sklearn.model_selection import train_test_split

# Feature matrix = every column except the target; target vector = "Attrition".
feature_cols = X.columns[X.columns != "Attrition"]
X_1 = X[feature_cols].values
y_1 = X["Attrition"].values

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_1,
    y_1,
    test_size=0.2,
    stratify=y_1,
    random_state=1,
)


In [None]:
## Logistic Regression
from sklearn.preprocessing import normalize

# Apply the same normalization to the training and the test split.
# NOTE(review): with its default arguments `normalize` rescales each sample
# (row) to unit norm rather than scaling each feature (column) -- confirm this
# is the intended preprocessing for the logistic regression.
X_tr_nor, X_ts_nor = (normalize(split) for split in (X_train, X_test))

X_tr_nor

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit on the normalized training split, then score on the normalized test split.
model = LogisticRegression(C=5000, solver="liblinear")
model.fit(X_tr_nor, y_train)
y_pred = model.predict(X_ts_nor)

logreg_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of Logistic Regression: {:.6f}'.format(logreg_accuracy))

In [None]:
## SVM accuracy
from sklearn.svm import SVC

# probability=True enables predict_proba (used later for the ROC curve).
# The probability estimates involve shuffling the data, so a fixed
# random_state (same seed as the train/test split) keeps them reproducible.
model1 = SVC(probability=True, random_state=1).fit(X_train, y_train)
y_pred2 = model1.predict(X_test)
print('Accuracy of Support Vector Machine: {:.6f}' .format(accuracy_score(y_test, y_pred2)))

In [None]:
## Random Forest accuracy
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples and feature sub-sampling).
# Fixing random_state (same seed as the train/test split) makes the accuracy,
# classification report and confusion matrix identical on every run.
model2 = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred3 = model2.predict(X_test)
print('Accuracy of Random Forest Classifier: {:.6f}' .format(accuracy_score(y_test, y_pred3)))

In [None]:
## Random Forest F1 score

from sklearn.metrics import classification_report

# Text report of the random forest's test-split predictions against y_test.
rf_report = classification_report(y_test, model2.predict(X_test))
print(rf_report)


In [None]:
### Logistic Regression F1 score
# The logistic regression was fitted on normalized features, so it is
# evaluated on the normalized test split.
logreg_report = classification_report(y_test, model.predict(X_ts_nor))
print(logreg_report)

In [None]:
## SVM F1 score
# Text report of the SVM's test-split predictions against y_test.
svm_report = classification_report(y_test, model1.predict(X_test))
print(svm_report)

In [None]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

## cross validation for Random Forest 10 fold
# random_state only has an effect when shuffling is enabled. Current
# scikit-learn raises a ValueError when random_state is set while shuffle is
# left at its default of False, so shuffling is switched on explicitly.
# `kfold` is reused by the cross-validation cells that follow.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
scoring = 'accuracy'
results = model_selection.cross_val_score(model2, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy Random Forest: %.3f" % (results.mean()))

In [1]:
## Logistic Regression crossvalidation 10 fold
# Depends on `model_selection` and `kfold` from the Random Forest
# cross-validation cell and on `model` / `X_tr_nor` from the logistic
# regression cells; run those first (Restart & Run All), otherwise this cell
# raises a NameError.
scoring = 'accuracy'
results = model_selection.cross_val_score(model, X_tr_nor, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy Logistic Regression: %.3f" % (results.mean()))

NameError: name 'model_selection' is not defined

In [None]:
## SVM crossvalidation 10 fold
# Score the SVM on the training split with the shared `kfold` splitter.
scoring = 'accuracy'
results = model_selection.cross_val_score(
    model1,
    X_train,
    y_train,
    cv=kfold,
    scoring=scoring,
)
mean_accuracy = results.mean()
print("10-fold cross validation average accuracy SVM: %.3f" % mean_accuracy)

In [None]:
## Confusion Matrix of Logistic Regression Model

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix

logreg_y_pred = model.predict(X_ts_nor)
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted classes, which is what the axis labels below claim.
# `labels` must be passed by keyword; [1, 0] orders both axes as
# "Left", "Stayed" to match the tick labels.
logreg_cm = metrics.confusion_matrix(y_test, logreg_y_pred, labels=[1, 0])
# The cells hold integer counts, so annotate them as integers.
sns.heatmap(logreg_cm, annot=True, fmt='d', xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Logistic Regression')
plt.savefig('logistic_regression')

In [None]:
## Confusion matrix for SVM Model 

svm_y_pred = model1.predict(X_test)
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted classes, which is what the axis labels below claim.
# `labels` must be passed by keyword; [1, 0] orders both axes as
# "Left", "Stayed" to match the tick labels.
svm_cm = metrics.confusion_matrix(y_test, svm_y_pred, labels=[1, 0])
# The cells hold integer counts, so annotate them as integers.
sns.heatmap(svm_cm, annot=True, fmt='d', xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Support Vector Machine')
plt.savefig('Support_Vector_Machine')

In [None]:
## Confusion Matrix for Random Forest model
RF_y_pred = model2.predict(X_test)
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted classes, which is what the axis labels below claim.
# `labels` must be passed by keyword; [1, 0] orders both axes as
# "Left", "Stayed" to match the tick labels.
RF_cm = metrics.confusion_matrix(y_test, RF_y_pred, labels=[1, 0])
# The cells hold integer counts, so annotate them as integers.
sns.heatmap(RF_cm, annot=True, fmt='d', xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Random Forest Classification')
plt.savefig('Random Forest Classification')

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Each model is scored with its predicted probability for class 1, and the
# same scores feed both roc_auc_score and roc_curve, so the "area" shown in
# the legend is the area under the curve that is actually plotted.

# The logistic regression was fitted on the normalized features, so it has to
# be evaluated on the normalized test split, not on the raw X_test.
logit_proba = model.predict_proba(X_ts_nor)[:, 1]
logit_roc_auc = roc_auc_score(y_test, logit_proba)
fpr, tpr, thresholds = roc_curve(y_test, logit_proba)

# Random forest and SVM were fitted on the raw features.
rf_proba = model2.predict_proba(X_test)[:, 1]
rf_roc_auc = roc_auc_score(y_test, rf_proba)
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_proba)

svm_proba = model1.predict_proba(X_test)[:, 1]
svm_roc_auc = roc_auc_score(y_test, svm_proba)
s_fpr, s_tpr, s_thresholds = roc_curve(y_test, svm_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot(rf_fpr, rf_tpr, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot(s_fpr, s_tpr, label='Support Vector Machine (area = %0.2f)' % svm_roc_auc)

# Diagonal reference line: TPR equal to FPR.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC')
plt.show()

This repo contains an introduction to [Jupyter](https://jupyter.org) and [IPython](https://ipython.org).

Outline of some basics:

* [Notebook Basics](../examples/Notebook/Notebook%20Basics.ipynb)
* [IPython - beyond plain Python](../examples/IPython%20Kernel/Beyond%20Plain%20Python.ipynb)
* [Markdown Cells](../examples/Notebook/Working%20With%20Markdown%20Cells.ipynb)
* [Rich Display System](../examples/IPython%20Kernel/Rich%20Output.ipynb)
* [Custom Display Logic](../examples/IPython%20Kernel/Custom%20Display%20Logic.ipynb)
* [Running a Secure Public Notebook Server](../examples/Notebook/Running%20the%20Notebook%20Server.ipynb#Securing-the-notebook-server)
* [How Jupyter works](../examples/Notebook/Multiple%20Languages%2C%20Frontends.ipynb) to run code in different languages.

You can also get this tutorial and run it on your laptop:

    git clone https://github.com/ipython/ipython-in-depth

Install IPython and Jupyter:

with [conda](https://www.anaconda.com/download):

    conda install ipython jupyter

with pip:

    # first, always upgrade pip!
    pip install --upgrade pip
    pip install --upgrade ipython jupyter

Start the notebook in the tutorial directory:

    cd ipython-in-depth
    jupyter notebook