In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
from tqdm import tqdm
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import seaborn as sns

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

import warnings
warnings.simplefilter('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default matplotlib figure size (width, height) and resolution.
plt.rcParams['figure.figsize'] = (6,4)
plt.rcParams['figure.dpi'] = 150

# Initialise plotly's offline notebook mode; the iplot() calls in later cells
# rely on the plotly.offline import this comes from.
init_notebook_mode(connected=True)

In [3]:
# Location of the telecom churn dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file is placed at the same location.
csv_path = '/home/roman/data/telecom_churn.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
state                     3333 non-null object
account length            3333 non-null int64
area code                 3333 non-null int64
phone number              3333 non-null object
international plan        3333 non-null object
voice mail plan           3333 non-null object
number vmail messages     3333 non-null int64
total day minutes         3333 non-null float64
total day calls           3333 non-null int64
total day charge          3333 non-null float64
total eve minutes         3333 non-null float64
total eve calls           3333 non-null int64
total eve charge          3333 non-null float64
total night minutes       3333 non-null float64
total night calls         3333 non-null int64
total night charge        3333 non-null float64
total intl minutes        3333 non-null float64
total intl calls          3333 non-null int64
total intl charge         3333 non-null float64

# ff.create_facet_grid
https://plot.ly/python/facet-plots/

In [12]:
# Scatter of day minutes against night calls, split into one facet column per
# value of the 'churn' column. Markers are small with a thin light-grey outline.
fig = ff.create_facet_grid(
    df,
    x='total day minutes',
    y='total night calls',
    facet_col='churn',
    marker={
        'size': 4,
        'line': {'width': 0.3, 'color': 'rgb(230,230,230)'},
    },
)

iplot(fig)

# Пример поиска аномалий одноклассовым SVM

In [13]:
from sklearn import svm

def oneclass_svm(n_samples=10000, n_outliers=1000, nu=0.01, gamma=0.1, seed=None):
    """Fit a one-class SVM on synthetic 2-D Gaussian data and print its error counts.

    Train and test samples are drawn from a normal distribution with mean 0
    and standard deviation 1; the abnormal sample is drawn with mean 2.5 and
    standard deviation 0.3. The classifier is fitted on the training sample
    only, then used to predict all three samples. The number of training and
    test points predicted as -1, and the number of abnormal points predicted
    as 1, are printed.

    Parameters
    ----------
    n_samples : int, default 10000
        Number of points in each of the train and test samples.
    n_outliers : int, default 1000
        Number of abnormal points.
    nu : float, default 0.01
        Forwarded to ``svm.OneClassSVM``.
    gamma : float, default 0.1
        Forwarded to ``svm.OneClassSVM`` (RBF kernel).
    seed : int or None, default None
        When given, all samples are drawn from a dedicated
        ``np.random.RandomState(seed)`` so repeated calls produce the same
        data. When None, NumPy's global random state is used, exactly as a
        call without arguments always did.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_outliers)`` with shapes ``(n_samples, 2)`` and
        ``(n_outliers, 2)``.
    """
    # Both the np.random module and a RandomState instance expose .normal(),
    # so the draws below are written once for the seeded and unseeded cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw order (train, test, outliers) matches the original implementation.
    X_train = rng.normal(0, 1, (n_samples, 2))
    X_test = rng.normal(0, 1, (n_samples, 2))
    # Generate some abnormal novel observations
    X_outliers = rng.normal(2.5, 0.3, (n_outliers, 2))

    # Fit the model on the regular training sample only.
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    clf.fit(X_train)

    # An error is a regular point predicted as -1, or an abnormal point
    # predicted as 1.
    n_error_train = int(np.count_nonzero(clf.predict(X_train) == -1))
    n_error_test = int(np.count_nonzero(clf.predict(X_test) == -1))
    n_error_outliers = int(np.count_nonzero(clf.predict(X_outliers) == 1))

    print(f'n_error_train = {n_error_train}\n'
          f'n_error_test = {n_error_test}\n'
          f'n_error_outliers = {n_error_outliers}')
    return X_train, X_outliers

regulars, anomalies = oneclass_svm()

# Compare how the anomalies and the regular points are distributed.
# Each trace carries an explicit name so the legend identifies the two groups
# instead of showing the default "trace 0" / "trace 1".
trace1 = go.Scatter(
    x=regulars[:, 0],
    y=regulars[:, 1],
    mode='markers',
    name='regular'
)
trace2 = go.Scatter(
    x=anomalies[:, 0],
    y=anomalies[:, 1],
    mode='markers',
    name='anomalies'
)

data = [trace1, trace2]

# Plot and embed in the notebook output.
iplot(data, filename='basic-scatter')


n_error_train = 99
n_error_test = 126
n_error_outliers = 65


In [11]:
# Stack the regular and anomalous points row-wise into one coordinate matrix.
X = np.vstack([regulars, anomalies])
print(X.shape)

# Label column: 0 for every regular point, 1 for every anomaly, shaped as a
# column vector so it can be appended to the coordinates.
y = np.repeat([0, 1], [len(regulars), len(anomalies)]).reshape(-1, 1)
print(y.shape)

# Append the label as a third column and wrap everything in a DataFrame.
X = np.column_stack((X, y))
print(X.shape)
test_df = pd.DataFrame(data=X, columns=['x', 'y', 'label'])

# One facet column per label value; small markers with a thin light-grey outline.
fig = ff.create_facet_grid(
    test_df,
    x='x',
    y='y',
    facet_col='label',
    marker={
        'size': 4,
        'line': {'width': 0.3, 'color': 'rgb(230,230,230)'},
    },
)

iplot(fig)

(11000, 2)
(11000, 1)
(11000, 3)
