In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE: `dirname` and `filename` keep their last values after the loops finish,
# so renaming them affects any later cell that refers to those names.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import pandas_profiling

# Default figure size (width, height) used by plots that do not set their own.
plt.rcParams['figure.figsize'] = (5, 3)

In [None]:
# Locate the data file explicitly instead of relying on `dirname`/`filename`
# leaking out of the directory-listing loop in another cell. Leaked loop
# variables are hidden state: `dirname` may point at a later directory that
# contains no files, and an empty input directory surfaces as a NameError.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
if not input_files:
    raise FileNotFoundError("No files found under '/kaggle/input'")

# Keep the previous selection rule: use the last file visited by os.walk.
data_path = input_files[-1]

# The first CSV column is used as the row index.
ds = pd.read_csv(data_path, index_col=0)

In [None]:
# Preview the first three rows of the freshly loaded data.
ds.head(3)

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage.
ds.info()

In [None]:
# Count missing values per column and show the counts as an annotated
# single-column heatmap.
missing_per_column = ds.isnull().sum().to_frame()

plt.figure(figsize=(10, 5))
sns.heatmap(missing_per_column, annot=True, fmt='d', cmap='vlag');

In [None]:
# Distinct raw values of the target column.
ds.satisfaction.unique()

- neutral or dissatisfied = 0
- satisfied = 1

In [None]:
# Impute missing arrival delays with the column mean.
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on the attribute-accessed Series: that chained
# in-place form is deprecated and leaves `ds` unchanged under pandas
# Copy-on-Write.
arrival_delay_mean = ds['arrival_delay_in_minutes'].mean()
ds['arrival_delay_in_minutes'] = ds['arrival_delay_in_minutes'].fillna(arrival_delay_mean)

# Display the column mean after imputation.
ds['arrival_delay_in_minutes'].mean()

In [None]:
# Encode the target: 'satisfied' -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run: without it, a second run
# compares the already-encoded integers with 'satisfied' and sets every label to 0.
if not pd.api.types.is_numeric_dtype(ds['satisfaction']):
    ds['satisfaction'] = (ds['satisfaction'] == 'satisfied').astype('int64')

In [None]:
# Preview the first three rows again after imputation and target encoding.
ds.head(3)

In [None]:
# Split the feature columns by dtype: int64/float64 columns are treated as
# numeric, every other column as categorical. The target ('satisfaction') and
# 'Unnamed: 0' are left out of both lists.
excluded_columns = ('Unnamed: 0', 'satisfaction')
numeric_dtypes = ['int64', 'float64']

feature_columns = [name for name in ds.columns if name not in excluded_columns]
columns_numeric = [name for name in feature_columns if ds[name].dtype in numeric_dtypes]
columns_cat = [name for name in feature_columns if ds[name].dtype not in numeric_dtypes]

columns_cat, columns_numeric

In [None]:
# Print the distinct values (and how many there are) of each categorical column.
print('unique values')
print('-' * 60)

for name in columns_cat:
    distinct_values = ds[name].unique()
    print(f'{name}: {distinct_values} - {len(distinct_values)}pcs')

- Male = 1; Female = 0
- Loyal Customer = 1; disloyal Customer = 0
- Personal Travel = 1; Business Customer = 0

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 no longer discards
# them silently and `DataFrame.corr()` raises on string columns. Selecting by
# dtype (rather than passing `numeric_only=True`) also works on older pandas.
numeric_ds = ds.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(15, 8))
sns.heatmap(numeric_ds.corr(), annot=True, fmt='.3f');

In [None]:
%%time
sns.pairplot(ds[columns_numeric + ['satisfaction']], hue='satisfaction');

In [None]:
# Class balance of the target.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a bare positional Series is no longer read as `x`.
sns.countplot(data=ds, x='satisfaction')
plt.title("Target");

In [None]:
# For every categorical column draw two count plots in the same grid column:
# top row = overall counts, bottom row = counts split by satisfaction.
# The grid width follows the number of categorical columns instead of assuming
# exactly four (the fixed 2x4 grid with an `i+5` offset misplaces or overflows
# the panels for any other count).
if columns_cat:
    fig, axes = plt.subplots(2, len(columns_cat), figsize=(16, 8), squeeze=False)

    for i, col in enumerate(columns_cat):
        sns.countplot(data=ds, x=col, ax=axes[0, i])
        sns.countplot(data=ds, x=col, hue='satisfaction', ax=axes[1, i])

- Satisfaction is not strongly influenced by gender, although women are slightly more dissatisfied than men.   
  There are more women in the data than men. 
- There are more loyal passengers in the data than ordinary ones.   
  Disloyal passengers are most often dissatisfied with the flight.
- Passengers who fly for business are much more likely to be satisfied than those who fly privately.
  Passengers on private flights are almost always dissatisfied.
- Passengers who fly for business are much more likely to be satisfied than those who fly privately.
  Passengers on private flights are almost always dissatisfied.

In [None]:
# Overlaid histograms of every numeric feature, split by the target class.
# The row masks do not depend on the column, so they are computed once.
mask_class_0 = ds['satisfaction'] == 0
mask_class_1 = ds['satisfaction'] == 1

# Keep the original 4x5 layout, but add rows when there are more than 20
# numeric columns instead of failing on an out-of-range subplot index.
n_grid_cols = 5
n_grid_rows = max(4, -(-len(columns_numeric) // n_grid_cols))  # ceil division

plt.figure(figsize=(23, 18))
for i, col in enumerate(columns_numeric):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    ds.loc[mask_class_0, col].hist(bins=20, label='satisfaction = 0')
    ds.loc[mask_class_1, col].hist(bins=20, alpha=0.6, label='satisfaction = 1')
    plt.title(f'{col}')
    # Label the two overlaid distributions so the colours are unambiguous.
    plt.legend()

- satisfaction increases with age. Most dissatisfied passengers ~ 25 years old, most satisfied ~ 45 years old
- satisfaction increases with distance, and the shorter the distance, the greater the dissatisfaction
- an increase in the quality of the additional services provided increases satisfaction.     

  greatly reduces satisfaction: 
  - low quality wi-fi
  - lack of cleanliness
  - low quality or lack of ease of online booking
  - poor quality food or drinks or lack of them
  - low seating comfort
  
  greatly increases satisfaction:
  - quality of food and drinks
  - online boarding
  - quality entertainment on board (video, music, etc.)
  - high quality service on board
  - quality luggage service

### log

In [None]:
# Flight-distance distribution: raw values (left) next to their natural
# logarithm (right).
fig, (ax_raw, ax_log) = plt.subplots(1, 2)
ax_raw.hist(ds['flight_distance'])
ax_log.hist(np.log(ds['flight_distance']));

### encoding categorical

In [None]:
!pip install MulticoreTSNE

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

from MulticoreTSNE import MulticoreTSNE as MTSNE

In [None]:
# Work on an independent copy so that `ds` keeps the unscaled values
# (DataFrame.copy is deep by default).
ds_scaled = ds.copy()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# Encoding starts from `ds` (which `ds_scaled` was copied from) so the cell can
# be re-run: feeding `ds_scaled` back in fails with a KeyError once the
# categorical columns have been replaced by their dummies.
# The explicit dtype keeps the indicator columns numeric on every pandas
# version (pandas >= 2.0 would otherwise produce bool columns).
ds_scaled = pd.get_dummies(
    ds,
    columns=columns_cat,
    drop_first=True,
    prefix_sep='_',
    dtype='uint8',
)
columns_all = ds_scaled.columns

In [None]:
# Scaler used below to standardise the numeric feature columns.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the numeric columns and replace them with their
# standardised values.
numeric_block = ds_scaled[columns_numeric]
ds_scaled[columns_numeric] = scaler.fit(numeric_block).transform(numeric_block)

In [None]:
# Shape after one-hot encoding and scaling.
ds_scaled.shape

In [None]:
# Shape of the original frame, for comparison with the encoded one.
ds.shape

In [None]:
# Column names of the encoded frame.
columns_all

<b>Sample</b>

In [None]:
# Draw a 10% random sample (without replacement) for the expensive steps below.
# The generator is seeded so the sample is reproducible; the t-SNE embedding,
# the dendrogram cut height and the cluster plots all depend on which rows are
# drawn.
SAMPLE_SEED = 42
rng = np.random.default_rng(SAMPLE_SEED)

sample_size = ds_scaled.shape[0] // 10
ds_sample_index = rng.choice(ds_scaled.index.to_numpy(), size=sample_size, replace=False)
ds_sample = ds_scaled.loc[ds_sample_index]
ds_sample.shape

<b>Let's look at the data in 2D, labeling it with 'satisfaction'</b>

### TSNE

In [None]:
# t-SNE projection with a fixed seed; n_jobs=-1 uses all available cores.
tsne = TSNE(
    perplexity=30,
    n_jobs=-1,
    random_state=42,
)

In [None]:
%%time
ds_tsne = tsne.fit_transform(ds_sample.drop(['satisfaction'], axis=1))

In [None]:
# t-SNE embedding: unlabelled points (left) and the same points coloured by
# the satisfaction label (right).
tsne_x, tsne_y = ds_tsne[:, 0], ds_tsne[:, 1]

fig, (ax_plain, ax_target) = plt.subplots(1, 2, figsize=(22, 10))
ax_plain.scatter(tsne_x, tsne_y)
ax_target.scatter(tsne_x, tsne_y, c=ds_sample['satisfaction'], cmap='Set1')
ax_target.set_title('Satisfaction');

<b>The picture shows that the data is separable, albeit implicitly.
Separate clusters of satisfied and unsatisfied passengers are visible.</b>

### cluster

<b>Find the optimal number of clusters</b>

In [None]:
# Condensed pairwise-distance matrix of the sampled features (target excluded),
# followed by Ward hierarchical clustering on it.
linkage_input = ds_sample.drop(columns=['satisfaction'])
distance_mat = pdist(linkage_input)
Z = hierarchy.linkage(distance_mat, method='ward')

In [None]:
# Truncated dendrogram of the Ward linkage, with a horizontal guide line at
# height 190.
plt.figure(figsize=(16, 6))
dn = hierarchy.dendrogram(
    Z,
    truncate_mode='lastp',
    show_leaf_counts=True,
    show_contracted=True,
)
plt.axhline(y=190, color='gray');

In [None]:
# Same dendrogram truncated to the last two merged clusters (p=2).
plt.figure(figsize=(16, 6))
dn = hierarchy.dendrogram(
    Z,
    p=2,
    truncate_mode='lastp',
    show_leaf_counts=True,
    show_contracted=True,
);

In [None]:
# Cut the linkage tree at a fixed cophenetic distance to obtain flat clusters.
max_d = 190
clusters = hierarchy.fcluster(Z, t=max_d, criterion='distance')

<b>According to the dendrogram, the data can be optimally divided into 2 clusters.</b>

In [None]:
# t-SNE embedding coloured by the distance-threshold cluster labels.
embed_x, embed_y = ds_tsne[:, 0], ds_tsne[:, 1]

plt.figure(figsize=(10, 10))
plt.scatter(embed_x, embed_y, c=clusters, cmap='Set1');

### Elbow method.  
<b>Using the "elbow method" we will find the optimal number of clusters.</b>

In [None]:
# Shape of the linkage matrix produced by hierarchy.linkage.
Z.shape

In [None]:
# Heights (third column of the linkage matrix) of the last 15 merges, plotted
# from the final merge backwards.
last_merge_heights = Z[-15:, 2]

plt.figure(figsize=(16, 6))
plt.plot(range(1, 16), last_merge_heights[::-1], marker='o');

<b>The optimal number of clusters is 2 or 6.</b>

In [None]:
# Flat clusterings with at most 2, 3 and 6 clusters, shown side by side on the
# t-SNE embedding.
clusters2, clusters3, clusters6 = (
    hierarchy.fcluster(Z, n_clusters, criterion='maxclust')
    for n_clusters in (2, 3, 6)
)

fig, axes = plt.subplots(1, 3, figsize=(22, 7))
panels = zip(axes, (2, 3, 6), (clusters2, clusters3, clusters6))
for ax, n_clusters, labels in panels:
    ax.scatter(ds_tsne[:, 0], ds_tsne[:, 1], c=labels, cmap='Set1')
    ax.set_title(f'{n_clusters} clusters')

### Silhouette score

In [None]:
# Fit KMeans for k = 2..15 on the sampled features and record the silhouette
# score of each clustering.
# NOTE: despite its name, `inertia` holds silhouette scores (higher is better),
# not KMeans inertia; the name is kept because the plotting cell reads it.
kmeans_features = ds_sample.drop(['satisfaction'], axis=1)  # built once, not per iteration

inertia = []
k = range(2, 16)

for i in k:
    # Fixed seed and explicit n_init make the scan reproducible and independent
    # of the scikit-learn version's default number of initialisations.
    km = KMeans(n_clusters=i, n_init=10, random_state=42)
    km.fit(kmeans_features)
    inertia.append(silhouette_score(kmeans_features, km.labels_))

In [None]:
# Silhouette score for each number of clusters k.
# `inertia` is filled with silhouette_score values by the previous cell, so the
# axis is labelled accordingly (higher is better).
plt.figure(figsize=(15, 6))
plt.plot(k, inertia, marker='o')
plt.xlabel('k')
plt.ylabel('silhouette score');

<b>By silhouette score, the optimal number of clusters is 2.</b>

In [None]:
# Side-by-side comparison on the t-SNE embedding: true satisfaction labels
# (left) against the two-cluster hierarchical solution (right).
point_x, point_y = ds_tsne[:, 0], ds_tsne[:, 1]

fig, (ax_truth, ax_found) = plt.subplots(1, 2, figsize=(16, 8))

ax_truth.scatter(point_x, point_y, c=ds_sample['satisfaction'], cmap='Set1')
ax_truth.set_title('cluster_satisfaction')

ax_found.scatter(point_x, point_y, c=clusters2, cmap='Set1_r')
ax_found.set_title('cluster_2');

<b>The picture shows the data separated by satisfaction tags (left) and by an algorithm (right). Visually, the divisions are similar.</b>