In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [None]:
data = pd.read_csv("../input/telecom-churn/telecom_churn.csv")

In [None]:
data.head()

In [None]:
data.drop(['state','voice mail plan'], axis=1, inplace=True)

In [None]:
data['international plan'] = data['international plan'].map({'yes':1,'no':0})

In [None]:
data.drop(['phone number'], axis=1, inplace=True)

In [None]:
data.head()

In [None]:
data.info()

In [None]:
y = data['churn'].astype('int')

In [None]:
X = data.drop('churn', axis=1)

In [None]:
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=17)

In [None]:
X_train.shape, X_valid.shape

In [None]:
first_tree = DecisionTreeClassifier(random_state=17)

In [None]:
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
first_knn = KNeighborsClassifier()

In [None]:
np.mean(cross_val_score(first_knn, X_train, y_train, cv=5))

Настройка max_depth и max_features для дерева

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
tree_params = {'max_depth': np.arange(1,11), 'max_features': [.5, .7, 1]}

In [None]:
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [None]:
%%time
tree_grid.fit(X_train, y_train)

In [None]:
tree_grid.best_score_, tree_grid.best_params_

In [None]:
knn_params = {'n_neighbors': range(5,30,5)}

In [None]:
knn_grid = GridSearchCV(first_knn, knn_params, cv=5)

In [None]:
%%time
knn_grid.fit(X_train, y_train)

In [None]:
knn_grid.best_score_, knn_grid.best_params_

In [None]:
tree_valid_pred = tree_grid.predict(X_valid)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_valid, tree_valid_pred)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_valid, tree_valid_pred)

In [None]:
np.bincount(y_valid)

In [None]:
from sklearn.tree import export_graphviz

In [None]:
second_tree = DecisionTreeClassifier(max_depth=3).fit(X_train,y_train)
second_tree.score(X_valid, y_valid)

In [None]:
export_graphviz(second_tree, out_file='telecom_tree2.png', feature_names=X.columns, filled=True)

In [None]:
!ls -l *.png

In [None]:
!dot -Tpng telecom_tree2.dot -o telecom_tree2.png

Практика. Дерево решений в задаче предсказания выживания пассажиров "Титаника". 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
train_df = pd.read_csv("../input/titanic-train/titanic_train.csv")
test_df = pd.read_csv("../input/testrag/test.csv")

In [None]:
y = train_df['Survived']

In [None]:
train_df.head()

In [None]:
train_df.describe(include='all')

In [None]:
test_df.describe(include='all')

Заполним пропуски: в Age и Fare — медианными значениями, в Embarked — значением 'S'.

In [None]:
# Assign the filled column back to the frame. Calling fillna(..., inplace=True)
# on a selected column operates on an intermediate object; under pandas
# Copy-on-Write the frame itself would stay unchanged (pandas 2.x warns about it).
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
# NOTE(review): test 'Age' is filled with the test median, while test 'Fare'
# below is filled with the train median — confirm which statistic is intended.
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna('S')
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].median())

Кодируем категориальные признаки Pclass, Sex, SibSp, Parch и Embarked с помощью техники One-Hot-Encoding.

In [None]:
# (source column, prefix of the generated indicator columns), in output order.
dummy_specs = [('Pclass', "PClass"), ('Sex', "Sex"), ('SibSp', "SibSp"),
               ('Parch', "Parch"), ('Embarked', "Embarked")]

# Append the indicator columns to the right of the existing train columns.
train_dummies = [pd.get_dummies(train_df[column], prefix=prefix)
                 for column, prefix in dummy_specs]
train_df = pd.concat([train_df] + train_dummies, axis=1)

# Same encoding for the test frame, computed from its own category values.
test_dummies = [pd.get_dummies(test_df[column], prefix=prefix)
                for column, prefix in dummy_specs]
test_df = pd.concat([test_df] + test_dummies, axis=1)

In [None]:
train_df.drop(['Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 
               'Parch', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'], 
              axis=1, inplace=True)
test_df.drop(['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'], 
             axis=1, inplace=True)

In [None]:
train_df.shape, test_df.shape

In [None]:
set(test_df.columns) - set(train_df.columns)

In [None]:
test_df.drop(['Parch_9'], axis=1, inplace=True)

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
my_tree = DecisionTreeClassifier(random_state=17)

In [None]:
# tree params for grid search
tree_params = {'max_depth': list(range(1, 5)), 
               'min_samples_leaf': list(range(1, 5))}

In [None]:
tree_grid = GridSearchCV(my_tree, tree_params, cv=5, n_jobs=-1)

In [None]:
tree_grid.fit(train_df, y)

In [None]:
 tree_grid.best_score_, tree_grid.best_params_

In [None]:
tree_predict = tree_grid.predict(test_df)