In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE: `dirname` and `filename` keep their last values once the loop finishes,
# and the data-loading cell below builds its CSV path from them.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CSV at the last path produced by the os.walk listing in the previous cell.
ruta_csv = os.path.join(dirname, filename)
Datos = pd.read_csv(ruta_csv)

In [None]:
# Display the loaded DataFrame.
Datos

In [None]:
Datos.isna().sum() # Count the missing values in each column

In [None]:
Datos.dtypes # Show the data type of each column in the DataFrame

### Convertir nuestra variable a predecir a una de tipo numérico

In [None]:
# Encode the target as 0/1 and assign the result back to the column.
# Calling `replace(..., inplace=True)` on `Datos['Attrition']` is a chained in-place
# mutation: it emits a FutureWarning in pandas 2.x and, with copy-on-write enabled,
# does not update `Datos` at all, leaving the 'Yes'/'No' strings in place.
# This form is also safe to re-run: once the column is numeric, `replace` finds
# nothing to change and `to_numeric` is a no-op.
Datos['Attrition'] = pd.to_numeric(
    Datos['Attrition'].replace({'Yes': '1', 'No': '0'})
)

In [None]:
# Display the DataFrame again to check the converted 'Attrition' column.
Datos

## ALGUNAS GRAFICAS PARA ENTENDER EL NEGOCIO

### ¿Que tanto viajan los empleados?

In [None]:
# Bar chart with the number of employees in each business-travel category.
ax_viajes = sns.countplot(data=Datos, x='BusinessTravel')
ax_viajes.set_title('Empleados por frecuencia de viajes', weight='bold', size=25, y=1)
plt.xticks(rotation=30)

### ¿Que edad tienen los empleados?

In [None]:
# Histogram (with KDE) of employee ages. The number of bins is the square root
# of the count of non-null ages, truncated to an integer.
n_bins_edad = int(np.sqrt(Datos['Age'].count()))
sns.histplot(data=Datos, x='Age', bins=n_bins_edad, kde=True)
plt.gcf().set_size_inches(10, 5)
plt.title('Distribución edad empleados', weight='bold', size=25, y=1)
plt.xticks(rotation=30)

### Por áreas, ¿cómo están distribuidos los empleados?

In [None]:
# Pie chart with the share of employees in each department.
# (The bare `data_pie` expression in the middle of the cell was removed: only the
# last expression of a cell is displayed, so it had no effect.)
data_pie = Datos.groupby("Department")['Department'].count()
data_pie.plot.pie(autopct="%.1f%%");
# Displayed title corrected: "Departamnto" -> "Departamento".
plt.title('% Empleados por Departamento', weight='bold', size=25, y=1)

### ¿Que formación tienen los empleados?

In [None]:
# Bar chart with the number of employees per field of education.
ax_formacion = sns.countplot(data=Datos, x='EducationField')
ax_formacion.figure.set_size_inches(10, 5)
ax_formacion.set_title('Empleados por área de formacion', weight='bold', size=25, y=1)
plt.xticks(rotation=30)

### ¿Cuantos años llevan trabajando los empleados?

In [None]:
# Histogram (with KDE) of total working years. The number of bins is the square
# root of the count of non-null values, truncated to an integer.
n_bins_experiencia = int(np.sqrt(Datos['TotalWorkingYears'].count()))
sns.histplot(data=Datos, x='TotalWorkingYears', bins=n_bins_experiencia, kde=True)
plt.gcf().set_size_inches(10, 5)
plt.title('Distribución años de experiencia empleados', weight='bold', size=25, y=1)
plt.xticks(rotation=30)

## Separamos variables numéricas para hacer análisis de correlaciones

In [None]:
# Keep only the int64 columns; `select_dtypes` already returns a new DataFrame.
Numeric_data = Datos.select_dtypes(include=['int64'])
Numeric_data  # display the numeric variables

In [None]:
# Data type of each column of the correlation matrix.
Numeric_data.corr().dtypes


In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
sns.heatmap(Numeric_data.corr(), annot=True, annot_kws={"size":8})
# Displayed title corrected: "Correlació" -> "Correlación".
plt.title('Correlación entre variables numéricas', weight='bold', size=25, y=1)
plt.gcf().set_size_inches(15, 8)
plt.xticks(rotation=90)

### Crear un listado de todas las correlaciones

In [None]:
# Correlation matrix of the numeric columns.
corelation=Numeric_data.corr()
# Flatten the matrix into a Series indexed by (column, column) pairs.
corr_pairs = corelation.unstack()
print(corr_pairs)

### Ordenar los valores de correlaciones

In [None]:
# Sort the pairwise correlations in ascending order of their value.
sorted_pairs = corr_pairs.sort_values(kind="quicksort")
print(sorted_pairs)

### Seleccionar las correlaciones más fuertes (Mayores a 60%)

In [None]:
# Keep the pairs whose absolute correlation exceeds 0.6.
strong_pairs = sorted_pairs[abs(sorted_pairs) > 0.6]
# The unstacked matrix contains every pair twice, (a, b) and (b, a), plus each
# column paired with itself (correlation 1.0). Keep exactly one entry per unordered
# pair by comparing the two column names, instead of selecting hard-coded integer
# positions: positional integer keys in `Series[...]` are deprecated in pandas 2.x,
# and a fixed list of positions raises IndexError when fewer entries pass the threshold.
unique_pair_mask = [col_a < col_b for col_a, col_b in strong_pairs.index]
print(strong_pairs[unique_pair_mask])

### Crear una matriz con las columnas que poseen las correlaciones mas fuertes

In [None]:
# Columns involved in the strongest correlations, used as model features.
# Each name is listed exactly once: repeating a name in the selection list
# ('YearsAtCompany' in this case) duplicates that column in the feature matrix.
Datos_para_analisis = Numeric_data[[
    'YearsAtCompany', 'TotalWorkingYears', 'YearsInCurrentRole',
    'YearsWithCurrManager', 'MonthlyIncome', 'PercentSalaryHike',
    'JobLevel', 'PerformanceRating', 'YearsSinceLastPromotion', 'Age',
]]

In [None]:
# Display the selected feature columns.
Datos_para_analisis

### Añadir la variable a predecir a nuestra matriz (En este caso "Attrition")

In [None]:
# Add the target as a new 'Attrition' column, taken from Numeric_data as a raw array (`.values`).
Datos_para_analisis= Datos_para_analisis.assign(Attrition=Numeric_data['Attrition'].values)

In [None]:
# Display the feature columns together with the added 'Attrition' column.
Datos_para_analisis

# Probaremos diferentes modelos de predicción

### Regresión logística

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
# Feature matrix: every column except the target. The `columns=` keyword is used
# because passing `axis` positionally to `DataFrame.drop` raises TypeError in pandas >= 2.0.
X = np.array(Datos_para_analisis.drop(columns=['Attrition']))
# Target vector.
y = np.array(Datos_para_analisis['Attrition'])

In [None]:
# Print the shapes of the feature matrix and the target vector, then display the target.
print(X.shape)
print(y.shape)
y

In [None]:
# Fit a logistic regression with default settings and predict on the same
# X that it was fitted on.
classifier = LogisticRegression().fit(X, y)
predictions = classifier.predict(X)

# Report the number of instances, the true labels and the predicted labels.
for plantilla, valor in (
    ('Número de instancias a predecir: {}', y.shape[0]),
    ('Valores de verdad: {}', y),
    ('Valores predichos: {}', predictions),
):
    print(plantilla.format(valor))


In [None]:
# Score of the classifier, computed on the same X, y that were passed to `fit`.
print("Score para la regresión logística:",classifier.score(X, y))

In [None]:
# Predict on X again and build the confusion matrix of y against those predictions.
prediction = classifier.predict(X)
cnf_matrix = confusion_matrix(y, prediction)
print("Matriz de confusión")
print(cnf_matrix)


### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes, fitted and evaluated on the same X, y.
classifier2 = GaussianNB().fit(X, y)

# Confusion matrix of y against the predictions made on X.
prediction2 = classifier2.predict(X)
cnf_matrix2 = confusion_matrix(y, prediction2)

score_nb = classifier2.score(X, y)
print("Score para Naive Bayes:", score_nb)
print("Matriz de confusión")
print(cnf_matrix2)

### Clustering por K-Means

In [None]:
from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, DBSCAN
from IPython.display import HTML
from sklearn.metrics import silhouette_score

In [None]:
# Number of clusters passed to KMeans in the next cell.
n_clusters = 3

In [None]:
# Fit K-Means and keep the cluster assignment under its own name.
# Storing it in `y` would overwrite the Attrition target, so any supervised cell
# re-run afterwards would silently train on cluster ids instead of the labels.
# A fixed random_state makes the centroid initialisation reproducible.
km = KMeans(n_clusters=n_clusters, random_state=42)
km.fit(X)
cluster_labels = km.predict(X)

In [None]:
def experiment_number_of_clusters(X, clustering, show_metric=True,
                                  plot_data=True, plot_centers=True, plot_boundaries=False):
    """Draw a 2x4 grid of cluster plots, one for each cluster count from 2 to 9.

    Parameters
    ----------
    X : array
        Data to cluster; forwarded unchanged to `plot_cluster_predictions`.
    clustering : estimator
        Clustering object; its `n_clusters` attribute is set and the model is
        re-fitted by `plot_cluster_predictions` on every iteration.
    show_metric, plot_data, plot_centers : bool
        Forwarded to `plot_cluster_predictions`.
    plot_boundaries : bool
        Accepted for interface compatibility; not used by this function.
    """
    plt.figure(figsize=(15,6))
    cm = plt.cm.plasma  # same colormap for every panel
    for n_clusters in range(2,10):
        plt.subplot(2,4,n_clusters-1)
        # `plot_cluster_predictions` assigns `n_clusters` and calls `fit_predict`
        # itself, so fitting here as well would run every model twice and throw
        # the first result away.
        plot_cluster_predictions(clustering, X, n_clusters, cm,
                                 plot_data, plot_centers, show_metric)
        

def plot_cluster_predictions(clustering, X, n_clusters = None, cmap = plt.cm.plasma,
                             plot_data=True, plot_centers=True, show_metric=False,
                             title_str=""):
    """Fit `clustering` on X and scatter-plot the first two columns coloured by cluster.

    Parameters
    ----------
    clustering : estimator
        Object exposing `fit_predict`. If it has an `n_clusters` attribute,
        `n_clusters` must be given.
    X : array
        Data to cluster. Only columns 0 and 1 are drawn.
    n_clusters : int or None
        When not None, assigned to `clustering.n_clusters` before fitting.
        When None, the number of distinct non-noise labels is used for colouring
        and for the title.
    cmap : colormap
        Callable mapping integers in 0..255 to colours.
    plot_data : bool
        Draw the data points.
    plot_centers : bool
        Draw `clustering.cluster_centers_` when the fitted object has them.
    show_metric : bool
        Add the inertia (0 when the object has no `inertia_`) and the silhouette
        score (0 when there are fewer than two clusters) to the title.
    title_str : str
        Text appended to the title.
    """
    # `not A or (A and B)` is equivalent to `not A or B`.
    assert not hasattr(clustering, "n_clusters") or n_clusters is not None, \
        "must specify `n_clusters` for "+str(clustering)

    if n_clusters is not None:
        clustering.n_clusters = n_clusters

    y = clustering.fit_predict(X)
    # remove elements tagged as noise (cluster nb<0)
    keep = y >= 0
    X = X[keep]
    y = y[keep]

    if n_clusters is None:
        n_clusters = len(np.unique(y))

    # Cluster ids are spread over the 0..255 colormap range. With a single cluster
    # (or none) `n_clusters - 1` would be zero or negative, so the divisor is
    # clamped to 1 to avoid a division by zero.
    color_span = max(n_clusters - 1, 1)

    if plot_data:
        plt.scatter(X[:,0], X[:,1], color=cmap((y*255./color_span).astype(int)), alpha=.5)
    if plot_centers and hasattr(clustering, "cluster_centers_"):
        centers = clustering.cluster_centers_
        # One colour per center actually present, so the colour array always
        # matches the number of plotted points.
        plt.scatter(centers[:,0], centers[:,1], s=150,  lw=3,
                    facecolor=cmap((np.arange(len(centers))*255./color_span).astype(int)),
                    edgecolor="black")

    if show_metric:
        inertia = clustering.inertia_ if hasattr(clustering, 'inertia_') else 0
        sc = silhouette_score(X, y) if len(np.unique(y))>1 else 0
        plt.title("n_clusters %d, inertia=%.0f sc=%.3f"%(n_clusters, inertia, sc)+title_str)
    else:
        plt.title("n_clusters %d"%n_clusters+title_str)

    plt.axis("off")

In [None]:
# Plot K-Means partitions for 2 to 9 clusters; with show_metric=False the panel
# titles show only the cluster count. Only the first two columns of X are drawn.
experiment_number_of_clusters(X, KMeans(), show_metric=False)

In [None]:
# Elbow method: fit K-Means for k = 2..14 and record the inertia of each fit.
# A fixed random_state (the same value used for the train/test split in this
# notebook) makes the curve reproducible between runs.
Sum_of_squared_distances = []
K = range(2,15)
for k in K:
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(X)
    Sum_of_squared_distances.append(km.inertia_)

# Inertia as a function of the number of clusters.
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix


In [None]:
# Rebuild the feature matrix and the target from the analysis DataFrame.
# The `columns=` keyword is used because passing `axis` positionally to
# `DataFrame.drop` raises TypeError in pandas >= 2.0.
X = np.array(Datos_para_analisis.drop(columns=['Attrition']))
y = np.array(Datos_para_analisis['Attrition'])

In [None]:
# Hold out 25% of the rows as a test set; random_state=42 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Shapes of the training features and training target.
print(X_train.shape)
print(y_train.shape)
y

In [None]:
# Train a random forest on the training split and score it on the held-out split.
# A fixed random_state (the same value used for the split) makes the reported
# score reproducible between runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
rfc_pred = model_rf.predict(X_test)
# Score on the test split; as the last expression it is shown as the cell output.
# (The bare `rfc_pred` expression before it was removed: it displayed nothing.)
model_rf.score(X_test, y_test)

In [None]:
# Hard class predictions, used by the confusion-matrix cell that follows.
y_pred_rf = model_rf.predict(X_test)
# The ROC curve must be built from a continuous score. Feeding it the hard 0/1
# predictions collapses the curve to a single operating point and understates
# the AUC, so the predicted probability of the positive class (label 1) is used.
y_score_rf = model_rf.predict_proba(X_test)[:, 1]
false_positive_rate_rf, true_positive_rate_rf, thresholds = roc_curve(y_test, y_score_rf)
roc_auc_rf = auc(false_positive_rate_rf, true_positive_rate_rf)
roc_auc_rf

In [None]:
cm_rf = confusion_matrix(y_test,y_pred_rf)
print('Confusion Matrix : \n', cm_rf)

# scikit-learn puts the true class on the rows and the predicted class on the
# columns, so for labels (0, 1) the flattened matrix is TN, FP, FN, TP.
tn, fp, fn, tp = cm_rf.ravel()

# Accuracy: correctly classified samples over all samples.
total1 = cm_rf.sum()
accuracy1 = (tp + tn) / total1
print ('Accuracy RF : ', accuracy1)

# Sensitivity (recall of the positive class, Attrition == 1): TP / (TP + FN).
# This ratio and the next one were previously computed from each other's row.
sensitivity1 = tp / (tp + fn)
print('Sensitivity RF: ', sensitivity1 )

# Specificity (recall of the negative class, Attrition == 0): TN / (TN + FP).
specificity1 = tn / (tn + fp)
print('Specificity RF: ', specificity1)

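# # Los mejores resultados se obtienen con los modelos de regresión logística y random forest, con scores de 0.83 y 0.82 respectivamente. Nota: el score de la regresión logística se calcula sobre los mismos datos de entrenamiento, mientras que el de random forest se calcula sobre el conjunto de prueba, por lo que no son directamente comparables.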