In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv('telecom_churn.csv')

In [3]:
data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
# axis=1 - удалить колонку; 0 - удалить строку
# inplace=True - удалить из датафрейма
data.drop(['State', 'Voice mail plan'], axis=1, inplace=True)

In [5]:
data['International plan'] = data['International plan'].map({'Yes': 1,
                                                             'No': 0})

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Account length          3333 non-null   int64  
 1   Area code               3333 non-null   int64  
 2   International plan      3333 non-null   int64  
 3   Number vmail messages   3333 non-null   int64  
 4   Total day minutes       3333 non-null   float64
 5   Total day calls         3333 non-null   int64  
 6   Total day charge        3333 non-null   float64
 7   Total eve minutes       3333 non-null   float64
 8   Total eve calls         3333 non-null   int64  
 9   Total eve charge        3333 non-null   float64
 10  Total night minutes     3333 non-null   float64
 11  Total night calls       3333 non-null   int64  
 12  Total night charge      3333 non-null   float64
 13  Total intl minutes      3333 non-null   float64
 14  Total intl calls        3333 non-null   

In [7]:
y = data['Churn'].astype('int')

In [12]:
X = data.drop('Churn', axis=1)

In [13]:
X.shape, y.shape

((3333, 17), (3333,))

In [14]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [15]:
# В документации значения передаются именно в такой форме
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.3, 
                                                      random_state=17)

In [16]:
X_train.shape, X_valid.shape

((2333, 17), (1000, 17))

In [17]:
first_tree = DecisionTreeClassifier(random_state=17)

In [18]:
# Кросс-валидация
# Прогоняем обучение и проверяем на отложенной части 5 раз
# Данные для обучения и проверки в 5-ти случаях берутся разные
# array([0.9143469 , 0.91220557, 0.92077088, 0.90772532, 0.91416309])
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

0.9138423504976518

In [15]:
# Метод ближайших соседей
from sklearn.neighbors import KNeighborsClassifier

In [16]:
first_knn = KNeighborsClassifier()

In [17]:
# Кросс-валидация
np.mean(cross_val_score(first_knn, X_train, y_train, cv=5))

0.8671274043984523

<h2>Настраиваем дерево</h2>

In [18]:
# Модуль для поиска лучших параметров
from sklearn.model_selection import GridSearchCV

In [19]:
# Настройки для поиска параметров - перебираем все комбинации признаков
# max_features - максимальное количество признаков
# max_depth - максимальная глубина дерева
tree_params = {'max_depth': np.arange(1, 11), 'max_features':[.5, .7, 1]}

In [20]:
# n_jobs=-1 - использовать все ядра
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [21]:
%%time
tree_grid.fit(X_train, y_train);

CPU times: user 320 ms, sys: 76.5 ms, total: 397 ms
Wall time: 1.47 s


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=17,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 1

In [22]:
# Лучший результат, лучшие параметры
tree_grid.best_score_, tree_grid.best_params_

(0.9391366681677404, {'max_depth': 6, 'max_features': 0.7})

In [23]:
# n_neighbors - количество соседей
knn_params = {'n_neighbors': range(5, 30, 5) }#+ list(range(50, 100, 10))}

In [24]:
knn_grid = GridSearchCV(first_knn, knn_params, cv=5)

In [25]:
%%time
knn_grid.fit(X_train, y_train);

CPU times: user 621 ms, sys: 2.57 ms, total: 624 ms
Wall time: 623 ms


GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(5, 30, 5)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
knn_grid.best_score_, knn_grid.best_params_

(0.8701289391697531, {'n_neighbors': 10})

In [27]:
tree_valid_pred = tree_grid.predict(X_valid)

<h2>Метрики</h2>

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
# Результат на отложенной выборке
accuracy_score(y_valid, tree_valid_pred)

0.936

In [30]:
from sklearn.metrics import confusion_matrix

In [31]:
confusion_matrix(y_valid, tree_valid_pred)

array([[858,   9],
       [ 55,  78]])

In [32]:
np.bincount(y_valid)

array([867, 133])

In [33]:
from sklearn.tree import export_graphviz

In [34]:
second_tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
second_tree.score(X_valid, y_valid)

0.905

In [35]:
X_valid

Unnamed: 0,Account length,Area code,International plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
1687,90,415,0,0,37.8,80,6.43,155.3,105,13.20,175.0,111,7.88,14.2,5,3.83,3
877,129,415,1,0,267.4,78,45.46,204.2,85,17.36,111.7,146,5.03,5.9,4,1.59,1
1311,90,415,0,0,200.9,92,34.15,164.3,91,13.97,249.0,98,11.21,8.9,7,2.40,1
2151,72,415,0,0,137.6,106,23.39,143.5,94,12.20,273.7,110,12.32,9.6,6,2.59,2
961,97,408,0,0,217.6,81,36.99,320.5,51,27.24,150.7,110,6.78,4.2,3,1.13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,12,408,0,0,249.6,118,42.43,252.4,119,21.45,280.2,90,12.61,11.8,3,3.19,1
3137,58,510,0,0,131.9,96,22.42,167.6,107,14.25,205.9,106,9.27,14.7,5,3.97,3
3188,148,415,1,0,218.7,111,37.18,155.6,133,13.23,277.4,62,12.48,8.2,5,2.21,1
2593,93,408,0,0,149.6,120,25.43,200.7,85,17.06,181.2,107,8.15,14.3,9,3.86,0


<h2>Тестирую модель на придуманных данных</h2>

In [48]:
second_test = {'col1': [12], 'col2': [408], 'col3': [0], 'col4': [0], 'col5': [249.6], 'col6': [118], 'col7': [42.43], 'col8': [252.4], 'col9': [119], 'col10': [21.45], 'col11': [280.2], 'col12': [90], 'col13': [12.61], 'col14': [11.8], 'col15': [3], 'col16': [3.19], 'col17': [1]}
first_test = {'col1': [129], 'col2': [415], 'col3': [1], 'col4': [0], 'col5': [267.4], 'col6': [78], 'col7': [45.46], 'col8': [204.2], 'col9': [85], 'col10': [17.36], 'col11': [111.7], 'col12': [146], 'col13': [5.03], 'col14': [5.9], 'col15': [4], 'col16': [1.59], 'col17': [1]}

In [49]:
second_test_pd = pd.DataFrame(data=second_test)
first_test_pd = pd.DataFrame(data=first_test)

In [50]:
first_test_pd

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17
0,129,415,1,0,267.4,78,45.46,204.2,85,17.36,111.7,146,5.03,5.9,4,1.59,1


In [51]:
tree_grid.predict(second_test_pd)

array([0])