# Decision Tree

## Шаардлагатай сангууд

In [None]:
import numpy as np 
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

## Дата

In [None]:
# Read the IBM HR employee-attrition CSV into a DataFrame.
data = pd.read_csv(
    "../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)
# Preview the first five rows.
data.head(5)

## EDA

In [None]:
# Print a structural summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max); with no
# arguments describe() reports on the numeric columns only.
data.describe()

In [None]:
# Same summary restricted to object-dtype columns: count, number of unique
# values, most frequent value and its frequency.
data.describe(include=['object'])

In [None]:
# Histogram of the MonthlyIncome column over all rows.
data['MonthlyIncome'].hist()

In [None]:
# Overlay the MonthlyIncome distributions of employees who left ("Yes") and
# employees who stayed ("No").
#
# Exact equality is used instead of str.contains, which performs a
# substring/regex match and would also accept any label merely containing
# "Yes" or "No". The subsets get their own names so that re-running this cell
# never overwrites the modelling target `y` defined later in the notebook.
income_left = data.loc[data['Attrition'] == "Yes", 'MonthlyIncome']
income_stayed = data.loc[data['Attrition'] == "No", 'MonthlyIncome']

plt.hist(income_stayed, alpha=0.5, label='0')
plt.hist(income_left, alpha=0.5, label='1')
plt.xlabel('MonthlyIncome')
plt.ylabel('Number of employees')
# The legend title spells out what the 0/1 labels stand for.
plt.legend(loc='upper right', title='Attrition (0 = No, 1 = Yes)')
plt.show()

## Preprocessing

In [None]:
# Feature frame: every column of `data` except the Attrition target.
X = data.drop(columns=['Attrition'])
# Preview the first five rows.
X.head(5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-dtype column of X with integer codes.
# A fresh LabelEncoder is fitted per column, so the codes of one column are
# independent of the others.
cat_features = X.select_dtypes(include=['object']).columns
for column in cat_features:
    X[column] = LabelEncoder().fit_transform(X[column].values)

In [None]:
# Display the feature frame X.
X

In [None]:
# Target vector: the Attrition column, left as raw labels.
y = data.loc[:, 'Attrition']
# List the distinct labels present in the target.
pd.unique(y)

In [None]:
# Hold out 30% of the rows for testing with a fixed seed.
# stratify=y keeps the class proportions of the target identical in the train
# and test partitions, which matters whenever the labels are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3, stratify=y
)

In [None]:
# Display the training partition of the feature frame.
X_train

## Modeling

In [None]:
# Decision tree using the entropy criterion, capped at depth 4.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs build the same tree; without it, ties between equally good splits are
# broken randomly and the fitted tree can change between runs.
Tree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=3)
Tree

In [None]:
# Fit the tree on the training partition.
Tree.fit(X_train,y_train)

## Feature Importance

In [None]:
 importances = pd.DataFrame({"feature": X_train.columns, "importance": Tree.feature_importances_})
 importances.sort_values("importance", ascending=False)[:10]

In [None]:
# Horizontal bar chart of the eight features with the highest importance.
sns.barplot(data=importances.sort_values("importance", ascending=False).head(8), x="importance", y="feature")

## Prediction

In [None]:
# Predict a label for every row of the held-out test partition.
predTree = Tree.predict(X_test)

In [None]:
# Show the first ten predictions followed by the first ten true labels.
print(predTree[:10])
print(y_test.head(10))

## Evaluation

In [None]:
# Overall accuracy on the held-out test set (label text corrected from the
# malformed "DecisionTrees's").
print("Decision Tree's Accuracy: ", metrics.accuracy_score(y_test, predTree))
# Accuracy alone hides how each class is handled, so also print per-class
# precision, recall and F1.
print(metrics.classification_report(y_test, predTree))

In [None]:
# Confusion matrix of true vs. predicted labels on the test set.
# The class order is pinned to the fitted model's classes_ and reused for the
# tick labels, so each cell is tied to a named class rather than a bare 0/1.
class_labels = Tree.classes_
cm = confusion_matrix(y_test, predTree, labels=class_labels)
# Counts are integers, hence fmt='d'.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

## Визуалчлал

In [None]:
# Shell escape: install scikit-learn pinned to version 0.20.3.
# NOTE(review): sklearn was already imported earlier in this notebook, so
# changing the installed version mid-session may leave the kernel in an
# inconsistent state until it is restarted — confirm the pin is still required.
!pip install --upgrade scikit-learn==0.20.3

In [None]:
# Shell escape: install pydotplus, which is imported by the cells below.
!pip install pydotplus

In [None]:
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 

In [None]:
# Use the standard-library text buffer. sklearn.externals.six was only a
# vendored compatibility shim and no longer exists in current scikit-learn
# releases, so importing StringIO from it fails there.
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

In [None]:
# Render the fitted tree to a PNG via Graphviz and show it inline.
filename = "tree.png"
featureNames = list(X_train.columns)

# With out_file=None export_graphviz returns the DOT source as a string, so no
# intermediate text buffer is needed. Class names come from the fitted model
# so they match the order the tree uses internally.
dot_data = tree.export_graphviz(
    Tree,
    out_file=None,
    feature_names=featureNames,
    class_names=Tree.classes_.tolist(),
    filled=True,
    special_characters=True,
    rotate=False,
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png(filename)

# Read the PNG back and display it. A 20x10 inch canvas is ample for a
# depth-4 tree; the previous 100x200 inch figure allocated an enormous canvas.
img = mpimg.imread(filename)
plt.figure(figsize=(20, 10))
plt.imshow(img, interpolation='nearest')
plt.axis('off')
plt.show()

### Ашигласан материал
* IBM Machine Learning with Python