
#  Import attrition dataset and import libraries such as pandas, matplotlib.pyplot, numpy, and seaborn.
#  Build up a logistic regression model to predict which employees are likely to attrite.
#  Exploratory data analysis
*     Find the age distribution of employees at IBM
*     Explore attrition by age
*     Explore data for employees who left
*     Find out the distribution of employees by education field
*     Give a bar chart of the number of employees by marital status


In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
from patsy import dmatrices
import sklearn
import seaborn as sn

In [None]:
# Read the IBM attrition CSV into the DataFrame used by every later cell.
csv_path = "../input/ibm-data/IBM Attrition Data.csv"
dataframe = pd.read_csv(csv_path)

In [None]:
# Preview the first rows (pandas' default for head() is five) as a quick
# sanity check that the file parsed into the expected columns.
dataframe.head()

In [None]:
# Pull the column labels out as a plain array and print them so the exact
# spellings are visible.
names = dataframe.columns.values 
print(names)

In [None]:
# Age histogram: number of employees falling into each of 70 age bins.
plt.figure(figsize=(10, 8))
age_column = dataframe['Age']
age_column.hist(bins=70)
plt.xlabel("Age")
plt.ylabel("# of Employees")
plt.title("Age distribution of Employees")
plt.show()

In [None]:
# Attrition by age: one semi-transparent point per employee, with the
# attrition label on the x-axis and age on the y-axis, so denser vertical
# bands show which ages dominate each attrition group.
plt.figure(figsize=(14,10))
plt.scatter(dataframe.Attrition,dataframe.Age, alpha=.55)
plt.title("Attrition by Age ")
plt.xlabel("Attrition")
plt.ylabel("Age")
# Pass the on/off flag positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed, whereas the positional
# form is accepted by both old and new releases.
plt.grid(True, which='major', axis='y')
plt.show()

In [None]:
# Horizontal bar chart of how many employees fall under each Attrition label.
attrition_counts = dataframe['Attrition'].value_counts()
plt.figure(figsize=(8, 6))
attrition_counts.plot.barh(color='blue', alpha=0.65)
plt.title("Attrition breakdown ")
plt.show()

In [None]:
# Horizontal bar chart of employee counts per education field.
education_counts = dataframe['EducationField'].value_counts()
plt.figure(figsize=(10, 8))
education_counts.plot.barh(color='g', alpha=0.65)
plt.title("Education Field Distribution")
plt.show()

In [None]:
# Vertical bar chart of employee counts per marital status.
marital_counts = dataframe['MaritalStatus'].value_counts()
plt.figure(figsize=(8, 6))
marital_counts.plot.bar(alpha=0.5)
plt.show()

In [None]:
# Summary statistics for the numeric columns (describe()'s default scope).
dataframe.describe()

In [None]:
# Print per-column dtype and non-null count to spot missing data and
# columns that are still text.
dataframe.info()

In [None]:
# Show the column index of the frame.
dataframe.columns

In [None]:
# Per-column standard deviation. At this point the frame still holds text
# columns, and since pandas 2.0 `numeric_only` defaults to False, so a bare
# std() raises TypeError on them; restrict the reduction to numeric columns.
dataframe.std(numeric_only=True)

In [None]:
# Count how many rows carry each Attrition label (class balance check).
dataframe['Attrition'].value_counts()

In [None]:
# Inspect the dtype of the Attrition column.
dataframe['Attrition'].dtypes

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `dataframe['Attrition']`: that form mutates
# a temporary Series, which pandas warns about and which leaves the frame
# unchanged once Copy-on-Write is in effect.
# The explicit int64 cast keeps the column numeric for the later
# `select_dtypes(include=['int64'])` and patsy formula; it raises if any
# label other than 'Yes'/'No' (or a missing value) is present.
# Re-running the cell is harmless: already-encoded values pass through.
attrition_codes = {'Yes': 1, 'No': 0}
dataframe['Attrition'] = (
    dataframe['Attrition'].replace(attrition_codes).astype('int64')
)

In [None]:
# Show the first ten rows to confirm the Attrition recoding took effect.
dataframe.head(10)

In [None]:
# Start of the logistic-regression section: separate the predictors (every
# column except Attrition) from the target column.
X = dataframe.drop(columns=['Attrition'])
X.head()
Y = dataframe.Attrition
Y.head()

In [None]:
# Map each education field label to an integer code.
#
# The mapping is applied in one pass and assigned back to the column;
# calling `replace(..., inplace=True)` on `dataframe['EducationField']`
# operates on a temporary Series, which pandas warns about and which does
# not update the frame under Copy-on-Write.
# The int64 cast keeps the column numeric for the modelling cells and raises
# if an unmapped label or missing value remains.
# NOTE(review): the codes are arbitrary, yet a numeric column is treated as
# an ordered quantity by the regression formula -- consider one-hot encoding.
education_field_codes = {
    'Life Sciences': 1,
    'Medical': 2,
    'Marketing': 3,
    'Other': 4,
    'Technical Degree': 5,
    'Human Resources': 6,
}
dataframe['EducationField'] = (
    dataframe['EducationField'].replace(education_field_codes).astype('int64')
)

In [None]:
# Row count per EducationField value.
dataframe['EducationField'].value_counts()

In [None]:
# Row count per Department value, to see which labels exist.
dataframe['Department'].value_counts()

In [None]:
# Map each department label to an integer code.
#
# Assigned back to the column rather than using `replace(..., inplace=True)`
# on `dataframe['Department']`, which mutates a temporary Series (deprecated
# chained assignment; a no-op under pandas Copy-on-Write).
# The int64 cast keeps the column numeric for the modelling cells and raises
# if an unmapped label or missing value remains.
department_codes = {
    'Research & Development': 1,
    'Sales': 2,
    'Human Resources': 3,
}
dataframe['Department'] = (
    dataframe['Department'].replace(department_codes).astype('int64')
)

In [None]:
# Row count per Department value.
dataframe['Department'].value_counts()

In [None]:
# Row count per MaritalStatus value, to see which labels exist.
dataframe['MaritalStatus'].value_counts()

In [None]:
# Map each marital-status label to an integer code.
#
# Assigned back to the column rather than using `replace(..., inplace=True)`
# on `dataframe['MaritalStatus']`, which mutates a temporary Series
# (deprecated chained assignment; a no-op under pandas Copy-on-Write).
# The int64 cast keeps the column numeric and raises if an unmapped label or
# missing value remains.
marital_status_codes = {
    'Married': 1,
    'Single': 2,
    'Divorced': 3,
}
dataframe['MaritalStatus'] = (
    dataframe['MaritalStatus'].replace(marital_status_codes).astype('int64')
)

In [None]:
# Row count per MaritalStatus value.
dataframe['MaritalStatus'].value_counts()

In [None]:
# Keep only the int64 columns of the frame and list their dtypes.
x = dataframe.select_dtypes(include='int64')
x.dtypes

In [None]:
# Show which columns ended up in x.
x.columns

In [None]:
# Target vector: the Attrition column of the frame.
y = dataframe.Attrition

In [None]:
# Peek at the first target values.
y.head()

In [None]:
# Build the response (y) and design matrix (x) from a patsy formula:
# Attrition is modelled from six predictors. Both come back as DataFrames,
# replacing the earlier x / y.
attrition_formula = 'Attrition ~ Age + Department + \
                  DistanceFromHome + Education + EducationField + YearsAtCompany'
y, x = dmatrices(attrition_formula, data=dataframe, return_type="dataframe")
print(x.columns)

In [None]:
# Flatten y into a 1-D array, the target shape scikit-learn estimators expect.
y = np.asanyarray(y).ravel()

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the full data set; fit() returns the
# estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(x, y)

# Mean accuracy on the same data the model was trained on (an optimistic
# figure, since no rows were held out).
model.score(x, y)

In [None]:
# Mean of the target. With Attrition coded Yes=1 / No=0 earlier in the file,
# this is the share of employees who left -- a baseline to compare the
# model's accuracy against.
y.mean()

In [None]:
# Import the splitter explicitly. `import sklearn` on its own does not
# promise that the `sklearn.model_selection` submodule is loaded; the
# attribute access only worked because another sklearn import had already
# pulled it in as a side effect.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Second model, trained on the training portion only.
model2 = LogisticRegression()
model2.fit(X_train, y_train)

In [None]:
# Predicted class label for every held-out row.
predicted= model2.predict(X_test)
print (predicted)

In [None]:
# Predicted class probabilities for every held-out row (one column per class).
probs = model2.predict_proba(X_test)
print (probs)

In [None]:
from sklearn import metrics

# Accuracy of the hard predictions on the held-out rows.
test_accuracy = metrics.accuracy_score(y_test, predicted)
print(test_accuracy)

# ROC AUC is computed from the second probability column of predict_proba.
second_class_scores = probs[:, 1]
test_auc = metrics.roc_auc_score(y_test, second_class_scores)
print(test_auc)

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out rows.
confusion = metrics.confusion_matrix(y_test, predicted)
print(confusion)

# Per-class precision / recall / F1 summary for the same predictions.
report = metrics.classification_report(y_test, predicted)
print(report)

In [None]:
# Dump the training design matrix to see its columns and their order.
print (X_train)

In [None]:
# Fill kk with values for one hypothetical employee to check the predicted
# probability of attrition. The values must follow the design-matrix column
# order, which is expected to be: Intercept, Age, Department,
# DistanceFromHome, Education, EducationField, YearsAtCompany.
kk = [[1.0, 23.0, 1.0, 500.0, 3.0, 24.0, 1.0]]

# Wrap the row in a DataFrame carrying the training column names. The model
# was fitted on a named-column frame, so a bare list makes scikit-learn warn
# about missing feature names, and a wrong number of values would only be
# caught later; here it fails at construction.
kk_frame = pd.DataFrame(kk, columns=x.columns)
print(model.predict_proba(kk_frame))