<a href="https://colab.research.google.com/github/sebasruggero/python/blob/main/Nps_Eda_Cluster_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import gspread

### Montamos el drive de google para poder acceder al csv 

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so that the
# CSV stored in Drive can be read by the next cell.
from google.colab import drive 
drive.mount('/content/gdrive')

### Accedemos al csv


In [None]:
# Load the NPS survey export from the mounted Drive.
# The absolute path matches the mount point used by drive.mount('/content/gdrive'),
# so the read no longer depends on the kernel's current working directory.
df = pd.read_csv('/content/gdrive/My Drive/DATA/npsData.csv')

### Usamos el atributo `shape` para ver la cantidad de registros y columnas que posee el dataframe


In [None]:
# (rows, columns) of the loaded dataframe.
df.shape

### Debemos identificar las columnas que poseen valores nulos para poder analizar el impacto de estas.

In [None]:
# Count the missing values in every column to see which ones are affected.
missing_per_column = df.isna().sum()
print(missing_per_column)

### Definimos la función que determina si una respuesta corresponde a un Promoter, Passive o Detractor

In [None]:
def categorize_nps(ltr):
  """Map a likelihood-to-recommend (LTR) score to its NPS group.

  Uses the standard NPS bands: 9-10 Promoter, 7-8 Passive, 0-6 Detractor.
  The previous version tested 8/9 for Passive (so 7 was never Passive and the
  9 check was unreachable) and used ``or`` for the Detractor range, which made
  every number (e.g. 7, 11, -3) a Detractor.

  Args:
    ltr: numeric score, expected in the range 0-10.

  Returns:
    "Promoter", "Passive" or "Detractor"; 'invalid' for any number outside
    0-10, including NaN (all comparisons with NaN are False).
  """
  if 9 <= ltr <= 10:
    return "Promoter"
  elif 7 <= ltr < 9:
    return "Passive"
  elif 0 <= ltr < 7:
    return "Detractor"
  else:
    return 'invalid'

### Aplicamos la función

In [None]:
# Label every response with its NPS group, computed element-wise from the LTR score.
df['nps_group'] = df['ltr'].map(categorize_nps)

In [None]:
# Column dtypes as inferred by read_csv, before any explicit conversion.
df.dtypes

### Realizamos las transformaciones en los tipos de datos 

In [None]:
# Give every column the dtype the analysis below relies on.
df['ResponseDate'] = pd.to_datetime(df['ResponseDate'])
df['caseID'] = df['caseID'].astype(str)

# These three columns are parsed as numbers (entries that cannot be parsed
# become NaN because of errors='coerce') and rounded to two decimals.
for numeric_col in ('assignToDisp', 'mttr', 'ttrGros'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce').round(2)

df['ltr'] = df['ltr'].astype(float)


In [None]:
# Column dtypes after the explicit conversions above.
df.dtypes

In [None]:
# Preview the first rows of the converted dataframe.
df.head()


In [None]:
# Re-count missing values: the errors='coerce' conversions may have added NaNs.
missing_per_column = df.isna().sum()
print(missing_per_column)


### Eda

In [None]:
# Work on an independent copy for the EDA. The cells below add columns to
# df_eda and change its index; with a plain alias (df_eda = df) those edits
# would silently leak into df, which the clustering and regression sections reuse.
df_eda = df.copy()

In [None]:
# Summary statistics of the dataframe's columns.
summary_stats = df_eda.describe()
print(summary_stats)

### Distribucion de los datos

In [None]:
# Density of the LTR score, one panel per NPS group, coloured by slaYield.
sns.displot(
    data=df_eda,
    x="ltr",
    hue='slaYield',
    col="nps_group",
    col_wrap=3,
    kind="kde",
    fill=True,
);

In [None]:
# Density of the LTR score, one panel per support segment, coloured by slaYield.
sns.displot(
    data=df_eda,
    x="ltr",
    hue='slaYield',
    col="supportSegment",
    col_wrap=5,
    kind="kde",
    fill=True,
);



In [None]:
# Histogram of LTR (with a KDE overlay) in one facet per support segment.
segment_grid = sns.FacetGrid(df_eda, col='supportSegment', col_wrap=5)
segment_grid.map(sns.histplot, 'ltr', kde=True)
plt.show()

### Análisis de correlación

In [None]:
# Pairwise correlation between the numeric columns only.
# df_eda also holds string and datetime columns; since pandas 2.0 corr() no
# longer drops them silently and raises instead, so restrict it explicitly.
correlation_matrix = df_eda.corr(numeric_only=True)
print(correlation_matrix)

In [None]:
# Annotated heatmap of the correlation between numeric columns.
# numeric_only=True is required because df_eda contains string/datetime
# columns, which make a bare corr() raise on pandas >= 2.0.
sns.heatmap(
    df_eda.corr(numeric_only=True),
    annot=True,
    cmap=sns.cubehelix_palette(as_cmap=True),
    square=True,
    linewidth=0.5,
);


##### No se observa correlación entre las variables de análisis, en este caso LTR, MTTR, assignToDisp, ttrGros y slaYield

In [None]:
# Histogram of LTR (with a KDE overlay) in one facet per slaYield value.
sla_grid = sns.FacetGrid(df_eda, col='slaYield', col_wrap=3)
sla_grid.map(sns.histplot, 'ltr', kde=True)
plt.show()

In [None]:
sns.set_theme(style="ticks")

# Derive a "month-year" label from the response date to group the scores by month.
df_eda['fecha'] = pd.to_datetime(df_eda['ResponseDate'])
df_eda['mes-año'] = df_eda['fecha'].dt.strftime('%m-%Y')

# Months shown on the x axis, in chronological order.
order = ['10-2022', '11-2022', '12-2022', '01-2023', '02-2023', '03-2023']

fig, ax = plt.subplots(figsize=(20, 6))

# One box per month; whis=[0, 100] stretches the whiskers to the min and max.
sns.boxplot(
    data=df_eda, x="mes-año", y="ltr", order=order,
    whis=[0, 100], width=.6, palette="vlag", ax=ax,
)

# Overlay every individual response as a point.
sns.stripplot(
    data=df_eda, x="mes-año", y="ltr", order=order,
    size=4, color=".3", linewidth=0, ax=ax,
)

# Cosmetic tweaks: vertical grid lines, no y label, trimmed spines.
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)




#### Analisis de boxplot 


In [None]:
sns.set_theme(style="ticks")

fig, ax = plt.subplots(figsize=(20, 6))

# One box of LTR per support segment; whis=[0, 100] stretches the whiskers
# to the min and max.
sns.boxplot(
    data=df_eda, x="supportSegment", y="ltr",
    whis=[0, 100], width=.6, palette="vlag", ax=ax,
)

# Overlay every individual response as a point.
sns.stripplot(
    data=df_eda, x="supportSegment", y="ltr",
    size=4, color=".3", linewidth=0, ax=ax,
)

# Cosmetic tweaks: vertical grid lines, no y label, trimmed spines.
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)

In [None]:
sns.set_theme(style="ticks")

# Fix the display order of the escalation levels. The conversion is done once,
# on df_eda (the frame that is plotted); the original repeated it a second time
# writing to df instead, which was redundant and inconsistent.
# NOTE(review): values not listed here become NaN and are left out of the plot;
# confirm that 'TCAM' (and not e.g. 'Level 3') is the intended third level.
escalation_levels = ['Level 1', 'Level 2', 'TCAM', 'Level 4']
df_eda['escalation'] = pd.Categorical(df_eda['escalation'], categories=escalation_levels)

f, ax = plt.subplots(figsize=(20, 6))

# One box of LTR per escalation level; whis=[0, 100] stretches the whiskers
# to the min and max.
sns.boxplot(x="escalation", y="ltr", data=df_eda,
            whis=[0, 100], width=.3, palette="vlag", ax=ax)

# Overlay every individual response as a point.
sns.stripplot(x="escalation", y="ltr", data=df_eda,
              size=4, color=".1", linewidth=0, ax=ax)

# Cosmetic tweaks: vertical grid lines, no y label, trimmed spines.
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)



In [None]:
sns.set_theme(style="ticks")

fig, ax = plt.subplots(figsize=(20, 6))

# One box of LTR per slaYield value; whis=[0, 100] stretches the whiskers
# to the min and max.
sns.boxplot(
    data=df_eda, x="slaYield", y="ltr",
    whis=[0, 100], width=.2, palette="vlag", ax=ax,
)

# Overlay every individual response as a point.
sns.stripplot(
    data=df_eda, x="slaYield", y="ltr",
    size=3, color=".3", linewidth=0, ax=ax,
)

# Cosmetic tweaks: vertical grid lines, no y label, trimmed spines.
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)

In [None]:
# Density (KDE) of the LTR score, coloured by slaYield.
sns.displot(
    data=df_eda,
    x="ltr",
    hue='slaYield',
    kind="kde",
    fill=True,
);
plt.show()

### Análisis de series de tiempo

In [None]:
# Index the EDA frame by response date and sort it chronologically.
# set_index returns a new frame, so any other name bound to the same object is
# left untouched (assigning to df_eda.index changed the shared object in place).
df_eda = df_eda.set_index(pd.to_datetime(df_eda['ResponseDate'])).sort_index()

# Monthly mean and standard deviation of the LTR score.
monthly_stats = df_eda.groupby(pd.Grouper(freq='M'))['ltr'].agg(['mean', 'std'])

# Two stacked panels sharing the x axis: monthly mean on top, monthly std below.
fig, axs = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

axs[0].plot(monthly_stats['mean'], color='blue')
axs[0].set_ylabel('Tasa de éxito promedio')
axs[1].plot(monthly_stats['std'], color='red')
axs[1].set_ylabel('Desviación estándar')
axs[1].set_xlabel('Fecha')

plt.show()



### Análisis de agrupamiento con el algoritmo de clustering K-Means 

##### Definición del número de clusters

In [None]:
### Cargamos las librerias para calcular el numero de cluster del modelo
from scipy.spatial import distance as sci_distance
from sklearn import cluster as sk_cluster

In [None]:
# Keep only the continuous numeric columns used to choose the number of clusters.
derived_df = df.filter(items=["mttr", "ttrGros", "ltr"])

In [None]:
# Elbow method: fit KMeans for k = 1..19 and plot the average within-cluster
# sum of squares to choose the number of clusters.

# KMeans cannot handle NaN (the errors='coerce' conversions may have produced
# some), so only complete rows are used here.
cdata = derived_df.dropna()
K = range(1, 20)

# inertia_ is the sum of squared distances of the samples to their closest
# centroid, i.e. the quantity named on the y axis (the previous code averaged
# plain, non-squared distances). random_state makes the curve reproducible.
avgWithinSS = [
    sk_cluster.KMeans(n_clusters=k, random_state=111).fit(cdata).inertia_ / cdata.shape[0]
    for k in K
]

# NOTE(review): the features are not standardised here, while the clustering
# further down is fitted on scaled data; consider scaling cdata as well.
plt.plot(K, avgWithinSS, 'b*-')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.title('Elbow for KMeans clustering')
plt.show()

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so columns added through either df or df_cluster appear on both.
df_cluster = df

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Numeric features. 'assignToDisp' is converted to a number earlier in the
# notebook, so it is used as such; label-encoding its string form (as before)
# produced codes in lexicographic order ('10.5' < '2.0'), which is meaningless.
variables = ["ltr", "ttrGros", "assignToDisp"]

# Categorical features, label-encoded below.
categorical_vars = ["caseType", "severity", "level3Detected", "escalation", "fixType", "rfoEnabled", "closeType", "clarifyName", "serviceid", "supportSegment", "nps_group"]

# .copy() gives X its own data: adding columns to a bare slice of df_cluster
# triggers SettingWithCopyWarning and is not guaranteed to behave consistently.
X = df_cluster[variables].copy()

# KMeans raises on NaN. Filling with the column median keeps one row per
# record, so the labels still line up with df_cluster (no-op without NaNs).
X = X.fillna(X.median())

# Encode each categorical column as integer codes (NaN becomes the string 'nan').
le = LabelEncoder()
for var in categorical_vars:
    X[var] = le.fit_transform(df_cluster[var].astype(str))

# Standardise so that every feature weighs equally in the Euclidean distance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the clustering model (fixed seed for reproducibility).
kmeans = KMeans(n_clusters=5, random_state=111)
kmeans.fit(X_scaled)

# Store the cluster assigned to each record on the same frame that is grouped
# below (the original wrote to df and read from df_cluster).
df_cluster["cluster_label"] = kmeans.labels_

# Mean LTR per cluster.
success_rate = df_cluster.groupby("cluster_label")["ltr"].mean()

print("Tasa de éxito por cluster:\n", success_rate)


In [None]:
# Attach the cluster labels, then print each cluster's size and its first rows.
df_cluster['cluster_label'] = kmeans.labels_

preview_cols = ["caseType", "severity", "level3Detected", "escalation", "fixType", "rfoEnabled", "closeType", "clarifyName", 'assignToDisp' , "serviceid", "supportSegment", "nps_group"]
for label, grupo in df_cluster.groupby('cluster_label'):
    print(f'Grupo {label}: {len(grupo)} proyectos')
    print(grupo[preview_cols].head())


In [None]:
# Write the labelled dataframe to cluster.csv in the working directory
# (to_csv also writes the index by default), then preview the first rows.
df_cluster.to_csv("cluster.csv")

df_cluster.head()


In [None]:
# Number of records assigned to each cluster.
sns.countplot(data=df_cluster, x="cluster_label")

In [None]:
sns.set_theme(style="ticks")

fig, ax = plt.subplots(figsize=(20, 6))

# One box of LTR per cluster; whis=[0, 100] stretches the whiskers to the
# min and max.
sns.boxplot(
    data=df_cluster, x="cluster_label", y="ltr",
    whis=[0, 100], width=.6, palette="vlag", ax=ax,
)

# Overlay every individual response as a point.
sns.stripplot(
    data=df_cluster, x="cluster_label", y="ltr",
    size=4, color=".3", linewidth=0, ax=ax,
)

# Cosmetic tweaks: vertical grid lines, no y label, trimmed spines.
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)

### Linear Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Independent copy for the regression section: the columns dropped from df_ml
# below must not disappear from df (a plain `df_ml = df` only aliases it).
df_ml = df.copy()

In [None]:
# Preview the frame that feeds the regression.
df_ml.head()

In [None]:
# Eliminar columnas no relevantes
df_ml.drop(['ResponseDate' , 'caseID', 'slaYield', 'caseType', 'severity',  'level3Detected', 'escalation', 'fixType', 'rfoEnabled', 'closeType', 'clarifyName', 'ownerWorkgroup', 'serviceid', 'supportSegment', 'nps_group'], axis=1, inplace=True)


In [None]:
# Display the frame after the column drop.
df_ml

In [None]:
# Rows without a target value cannot be used to fit or score the model.
model_df = df_ml.dropna(subset=['ltr'])

# Features: numeric columns only. Earlier cells may have left datetime/string
# helper columns on the frame, which the mean imputer cannot handle, and
# 'cluster_label' is derived from ltr itself (target leakage), so it is
# excluded when present.
X = (
    model_df
    .drop(columns=['ltr', 'cluster_label'], errors='ignore')
    .select_dtypes(include='number')
)
y = model_df['ltr']

# Train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the imputer and the scaler on the training set only, then apply the
# fitted transforms to the test set, so no test information leaks into training.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imputer.fit_transform(X_train)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Fit the linear regression on the training data.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

X_test = scaler.transform(imputer.transform(X_test))

# Predict on the held-out set and report R^2 (the original never evaluated
# the model despite its comment saying so).
y_pred = regressor.predict(X_test)
print('R^2 en el conjunto de prueba:', regressor.score(X_test, y_test))

In [None]:
# Predicted LTR values for the test set.
y_pred

### Predicción de LTR

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): this cell looks like an unfinished template and does not run as is.

# Load the already trained model.
# NOTE(review): load_model is not imported anywhere in the visible notebook
# (NameError), and the .h5 file name is a placeholder.
model = load_model('nombre_del_modelo_entrenado.h5')

# Load the test data (placeholder path).
datos = np.loadtxt('ruta/datos_de_prueba.csv', delimiter=',')

# Scale the data.
# NOTE(review): the scaler is fitted on the test data itself rather than
# reusing the scaler fitted during training; confirm this is intended.
scaler = MinMaxScaler()
datos_escalados = scaler.fit_transform(datos)

# Run the predictions.
predicciones = model.predict(datos_escalados)

# Undo the scaling of the predictions.
# NOTE(review): this scaler was fitted on the input features, not on the
# target, so inverse_transform only works if the prediction array has the same
# number of columns as `datos`, and even then it applies feature ranges to
# target values.
predicciones_desescaladas = scaler.inverse_transform(predicciones)

# Print the predictions.
print(predicciones_desescaladas)
