## GRUPO 5 

INTEGRANTES: JUSTO U, AGUSTINA P, LUCAS N, CINTIA G, RODRIGO M, MATIAS S, CLARA S

In [None]:
# Library imports
# Display settings
# Data loading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Render up to 30 columns whenever a DataFrame is displayed.
pd.set_option("display.max_columns", 30)

# Read the training split of the airline passenger satisfaction dataset.
loc = "../input/airline-passenger-satisfaction/train.csv"
data = pd.read_csv(filepath_or_buffer=loc)


## Algunas vistas de los datos ##

In [None]:
# List the column labels of the freshly loaded training frame.
data.columns

In [None]:
# Print the dtype and non-null count of every column.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Missing-value cleanup.
# Only the last expression of a cell is rendered, so the diagnostics are
# printed explicitly; otherwise they are computed and silently discarded.
null_counts = data.isnull().sum()
print(null_counts[null_counts > 0])

# Peek at rows with a null in any column, which is exactly the set of rows
# that dropna() removes below.
print(data[data.isnull().any(axis=1)].head())

# Remove every row that has at least one missing value.
data.dropna(inplace=True)

In [None]:
# Remove the two identifier-like columns, which are not used as features.
data.drop(labels=['id', 'Unnamed: 0'], axis='columns', inplace=True)

In [None]:
# Replace the raw headers with short snake_case labels, assigned by position
# (23 columns). 'inflight_service' is spelled with an underscore so that it
# matches the labels given to the test frame and the final feature labels.
data.columns = [
    'gender', 'customer_type', 'age', 'type_travel',
    'class', 'flight_distance', 'inflight_wifi_service',
    'dep/arr_time_convenient', 'ease_online_booking',
    'gate_location', 'food_drink', 'online_boarding', 'seat_comfort',
    'inflight_entertainment', 'onboard_service', 'leg_room_service',
    'baggage_handling', 'checkin_service', 'inflight_service',
    'cleanliness', 'departure_delay_minutes', 'arrival_delay_minutes',
    'satisfaction',
]

In [None]:
# Peek at 10 randomly drawn rows (no seed is set, so the rows differ between runs).
data.sample(10)

#### Verificamos que las clases no estén desbalanceadas ####

In [None]:
# Relative frequency of each target label, rounded to 3 decimals.
data['satisfaction'].value_counts(normalize=True).round(3)

In [None]:
# Encode two categorical columns as integers.

# Cabin class, coded in the order Eco < Eco Plus < Business.
class_codes = {'Eco': 0, 'Eco Plus': 1, 'Business': 2}
data['class'] = data['class'].replace(to_replace=class_codes)

# Binary target: 1 = satisfied, 0 = neutral or dissatisfied.
satisfaction_codes = {'neutral or dissatisfied': 0, 'satisfied': 1}
data['satisfaction'] = data['satisfaction'].replace(to_replace=satisfaction_codes)

In [None]:
# One-hot encode the remaining categorical features. For each one, the
# indicator of one category is dropped, the remaining indicator column(s) are
# appended, and the original text column is removed.

# customer_type: the 'disloyal Customer' indicator is dropped.
loyal_cust = pd.get_dummies(data['customer_type']).drop(columns='disloyal Customer')
data = pd.concat([data, loyal_cust], axis=1).drop(columns='customer_type')

# type_travel: the 'Personal Travel' indicator is dropped.
travel_type = pd.get_dummies(data['type_travel']).drop(columns='Personal Travel')
data = pd.concat([data, travel_type], axis=1).drop(columns='type_travel')

# gender: the 'Female' indicator is dropped.
genre = pd.get_dummies(data['gender']).drop(columns='Female')
data = pd.concat([data, genre], axis=1).drop(columns='gender')

In [None]:
# Cast the arrival delay to an integer dtype (rows with nulls were dropped earlier).
data = data.astype({'arrival_delay_minutes': 'int'})

In [None]:
# Correlation of every column with the target, sorted ascending and drawn as a
# single-column annotated heatmap.
target_corr = data.corr()[['satisfaction']].sort_values('satisfaction')
plt.figure(figsize=(8, 12))
sns.heatmap(target_corr, annot=True)

In [None]:
# Features left out of the models.
# NOTE(review): presumably picked from the correlation heatmap above — confirm.
unused_features = [
    'Male',
    'departure_delay_minutes',
    'arrival_delay_minutes',
    'dep/arr_time_convenient',
    'gate_location',
]
data = data.drop(columns=unused_features)

In [None]:
# Check which columns are left after the drop.
data.columns

In [None]:
# Final snake_case labels, assigned by position to the 18 remaining columns.
final_columns = [
    'age', 'class', 'flight_distance', 'inflight_wifi_service',
    'ease_online_booking', 'food_drink', 'online_boarding', 'seat_comfort',
    'inflight_entertainment', 'onboard_service', 'leg_room_service',
    'baggage_handling', 'checkin_service', 'inflight_service',
    'cleanliness', 'satisfaction', 'loyal_customer', 'business_travel',
]
data = data.set_axis(final_columns, axis=1)

In [None]:
# Density of passengers along the integer-coded `class` column.
fig = plt.figure(figsize=(12, 5), dpi=90)

sns.set_style('whitegrid')
sns.set_palette('Set1', 8, 1)
# fill=True paints the area under the curve. It replaces `shade=True`, which
# seaborn has deprecated in favour of `fill`.
sns.kdeplot(data=data, x='class', fill=True, clip=(0, 8000))

plt.title('DENSIDAD DE PASAJEROS POR CLASE', fontdict={'fontsize': 16})
plt.show()

In [None]:
# Persist the cleaned training frame; with the default settings the index is
# written as the first CSV column.
data.to_csv('Clean_train.csv')

In [None]:
# Split the cleaned frame into the target vector and the feature matrix.
target_col = 'satisfaction'
y_train = data[target_col]
X_train = data.drop(columns=[target_col])

### Estandarizamos

In [None]:
# Learn per-feature mean and standard deviation from the training features only,
# then standardize those same features.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)

## Trabajamos los datos de test

In [None]:
# Read the test split of the same dataset.
loc_test = "../input/airline-passenger-satisfaction/test.csv"
data_test = pd.read_csv(loc_test)

#### Verificamos que las clases no estén desbalanceadas ####

In [None]:
# Relative frequency of each target label in the test split, rounded to 4 decimals.
data_test['satisfaction'].value_counts(normalize=True).round(4)

In [None]:
# Missing-value cleanup for the test split.
# Only the last expression of a cell is rendered, so the diagnostics are
# printed explicitly; otherwise they are computed and silently discarded.
test_null_counts = data_test.isnull().sum()
print(test_null_counts[test_null_counts > 0])

# Peek at rows with a null in any column, which is exactly the set of rows
# that dropna() removes below.
print(data_test[data_test.isnull().any(axis=1)].head())

# Remove every row that has at least one missing value.
data_test.dropna(inplace=True)

# Remove the two identifier-like columns, which are not used as features.
data_test.drop(columns=['id','Unnamed: 0'],inplace=True)

In [None]:
# Replace the raw headers of the test frame with short snake_case labels,
# assigned by position (23 columns).
test_labels = [
    'gender', 'customer_type', 'age', 'type_travel',
    'class', 'flight_distance', 'inflight_wifi_service',
    'dep/arr_time_convenient', 'ease_online_booking',
    'gate_location', 'food_drink', 'online_boarding', 'seat_comfort',
    'inflight_entertainment', 'onboard_service', 'leg_room_service',
    'baggage_handling', 'checkin_service', 'inflight_service',
    'cleanliness', 'departure_delay_minutes', 'arrival_delay_minutes',
    'satisfaction',
]
data_test = data_test.set_axis(test_labels, axis=1)

In [None]:
# Print the dtype and non-null count of every test column.
data_test.info()

In [None]:
# Absolute count of each target label in the test split.
data_test['satisfaction'].value_counts()

In [None]:
# Absolute count of each cabin class in the test split.
data_test['class'].value_counts()

In [None]:
# Encode cabin class and target as integers, with the same codes used for training.
class_codes = {'Eco': 0, 'Eco Plus': 1, 'Business': 2}
data_test['class'] = data_test['class'].replace(to_replace=class_codes)

# Binary target: 1 = satisfied, 0 = neutral or dissatisfied.
satisfaction_codes = {'neutral or dissatisfied': 0, 'satisfied': 1}
data_test['satisfaction'] = data_test['satisfaction'].replace(to_replace=satisfaction_codes)

In [None]:
# One-hot encode the remaining categorical features of the test frame. For each
# one, the indicator of one category is dropped, the remaining indicator
# column(s) are appended, and the original text column is removed.
loyal_cust = pd.get_dummies(data_test['customer_type']).drop(columns='disloyal Customer')
data_test = pd.concat([data_test,loyal_cust],axis=1)
data_test = data_test.drop(columns='customer_type')

travel_type = pd.get_dummies(data_test['type_travel']).drop(columns='Personal Travel')
data_test = pd.concat([data_test,travel_type],axis=1)
data_test = data_test.drop(columns='type_travel')

genre = pd.get_dummies(data_test['gender']).drop(columns='Female')
data_test = pd.concat([data_test,genre],axis=1)
data_test = data_test.drop(columns='gender')

# Cast the arrival delay to an integer dtype (rows with nulls were dropped earlier).
data_test['arrival_delay_minutes']=data_test['arrival_delay_minutes'].astype('int')

# Discard the same features that are dropped from the training frame.
data_test = data_test.drop(columns=['Male','departure_delay_minutes','arrival_delay_minutes','dep/arr_time_convenient','gate_location'])

# Give the test frame the same final labels the training frame receives, by
# position (18 columns). Without this the two dummy columns keep their raw
# category names, so the features passed to the scaler are named differently
# from the ones it was fitted on, which recent scikit-learn versions reject.
data_test.columns = [
    'age', 'class', 'flight_distance', 'inflight_wifi_service',
    'ease_online_booking', 'food_drink', 'online_boarding', 'seat_comfort',
    'inflight_entertainment', 'onboard_service', 'leg_room_service',
    'baggage_handling', 'checkin_service', 'inflight_service',
    'cleanliness', 'satisfaction', 'loyal_customer', 'business_travel',
]

In [None]:
# Persist the cleaned test frame; with the default settings the index is
# written as the first CSV column.
data_test.to_csv('Clean_test.csv')

In [None]:
# Split the cleaned test frame into the target vector and the feature matrix.
target_col = 'satisfaction'
y_test = data_test[target_col]
X_test = data_test.drop(columns=[target_col])

In [None]:
# Standardize the test features with the scaler fitted on the training
# features; the scaler is not refitted here.
X_test_std = scaler.transform(X_test)

## Creamos tres modelos ##

Con RandomizedSearchCV acotamos el rango de hiperparámetros de cada modelo

In [None]:
# Search space for each candidate estimator; models[k] is paired with
# params[k] by position.
lr_space = {
    'C': [1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],
}
knn_space = {
    'n_neighbors': range(1, 20),
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3],
}
rf_space = {
    'n_estimators': range(25, 200, 25),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', 'balanced_subsample'],
}

models = [LogisticRegression(), KNeighborsClassifier(), RandomForestClassifier()]
params = [lr_space, knn_space, rf_space]

# 10-fold stratified splitter, shuffled with a fixed seed.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=19)

In [None]:
# Run one randomized search per (estimator, search space) pair, scored by F1
# on the shared folds, and collect the fitted search objects.
random = []
for estimator, space in zip(models, params):
    rn = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=space,
        n_iter=8,
        scoring='f1',
        cv=folds,
        n_jobs=4,
        random_state=10,
        return_train_score=True,
    )
    print (rn)
    fit = rn.fit(X_train_std, y_train)
    random.append(fit)

In [None]:
# Report the best cross-validated score, estimator and parameters of each search.
for search in random:
    for attribute in ('best_score_', 'best_estimator_', 'best_params_'):
        print (getattr(search, attribute))

Los parámetros óptimos son los siguientes:

In [None]:
# Estimators configured with fixed hyper-parameters (written in by hand,
# not read from the search objects).
LR = LogisticRegression(
    solver='saga',
    penalty='l1',
    C=10,
)
KNN = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    p=1,
)
RF = RandomForestClassifier(
    n_estimators=175,
    criterion='entropy',
    class_weight='balanced',
)

In [None]:
# Fit the three estimators on the standardized training data, in the order
# LR, KNN, RF.
for classifier in (LR, KNN, RF):
    classifier.fit(X_train_std, y_train)

# Predict the standardized test set with each fitted estimator.
y_pred_lr, y_pred_knn, y_pred_rf = (
    classifier.predict(X_test_std) for classifier in (LR, KNN, RF)
)

In [None]:
# Test-set F1 of each individual model, rounded to 4 decimals.
for label, predictions in (
    ('F1 Score LR       =  ', y_pred_lr),
    ('F1 Score KNN      =  ', y_pred_knn),
    ('F1 Score RF       =  ', y_pred_rf),
):
    print(label, f1_score(y_test, predictions).round(4))

## Ensamble

In [None]:
# Three voting ensembles over the same two base models (random forest + KNN).
voters = [('rf', RF), ('knn', KNN)]

# Majority vote on predicted labels.
ensamble_1 = VotingClassifier(estimators=list(voters), voting='hard').fit(X_train_std, y_train)

# Vote on averaged predicted probabilities.
ensamble_2 = VotingClassifier(estimators=list(voters), voting='soft').fit(X_train_std, y_train)

# Probability vote with the forest weighted twice as much as KNN.
ensamble_3 = VotingClassifier(
    estimators=list(voters),
    voting='soft',
    weights=[2, 1],
    flatten_transform=True,
).fit(X_train_std, y_train)

In [None]:
# Predict the standardized test set with each of the three fitted ensembles.
y_pred_ensamble_1 = ensamble_1.predict(X_test_std)
y_pred_ensamble_2 = ensamble_2.predict(X_test_std)
y_pred_ensamble_3 = ensamble_3.predict(X_test_std)

In [None]:
# Test-set F1 of the three ensembles, followed by the standalone random
# forest for comparison.
print('F1 Score de los ensambles:')
for label, predictions in (
    ('F1 Score Ensamble_1       =  ', y_pred_ensamble_1),
    ('F1 Score Ensamble_2       =  ', y_pred_ensamble_2),
    ('F1 Score Ensamble_3       =  ', y_pred_ensamble_3),
):
    print(label, f1_score(y_test, predictions).round(4))
print('---------------------------------------------------------------')
print('F1 Score de RF (el modelo que mejor performó individualmente) es:')
print('F1 Score RF               =  ', f1_score(y_test, y_pred_rf).round(4))

#### De todas las pruebas que hicimos hasta ahora el que mejor performó es RandomForestClassifier(class_weight='balanced', criterion='entropy', n_estimators=175).
#### A partir de GridSearch buscamos optimizar los hiperparámetros de dicho modelo.

In [None]:
# Narrow grid around the forest size used so far: 150..180 trees in steps of 5,
# both split criteria, balanced class weights only.
rf_grid = {
    'n_estimators': range(150, 185, 5),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced'],
}
params = [rf_grid]

# 10-fold stratified splitter, shuffled with a fixed seed.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=19)

In [None]:
# Exhaustive search over `params` with RF as the base estimator, scored by F1
# on the `folds` splitter.
gs = GridSearchCV(estimator=RF, param_grid=params, scoring='f1', cv=folds)
print (gs)
# Fit on the standardized training data; fit() returns the search object itself.
fit = gs.fit(X_train_std, y_train)

In [None]:
# Mean cross-validated F1 of the best parameter combination.
gs.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

In [None]:
# Keep a handle on the estimator that the grid search refitted with the best parameters.
best_model = gs.best_estimator_

In [None]:
# Predict the standardized test set with the grid search's best estimator.
y_pred_gs = gs.best_estimator_.predict(X_test_std)

In [None]:
# Compare the grid-searched forest with the forest configured after the
# randomized search, on the same test set.
f1_grid = f1_score(y_test, y_pred_gs)
f1_randomized = f1_score(y_test, y_pred_rf)
separator = '---------------------------------------------------------------'

print('F1 Score de RF con GridSearchCV es            =  ' , f1_grid.round(4))
print(separator)
print('F1 Score de RF con RandomizedSearchCV era de  =  ' , f1_randomized.round(4))
print(separator)
# Derive the conclusion from the measured scores instead of hard-coding it:
# the forests are unseeded, so the comparison can flip between runs.
if f1_grid > f1_randomized:
    print('Los hiperparámetros obtenidos con Grid mejoran el F1 Score')
else:
    print('Los hiperparámetros obtenidos con Grid no mejoran el F1 Score')

### Nos quedamos con el modelo RandomForestClassifier(class_weight='balanced', criterion='entropy', n_estimators=180)

#### Probamos con PCA

In [None]:
# Project the standardized features onto principal components (no component
# limit is passed to PCA), fitting the projection on the training data only.
pca = PCA()
X_train_std_pca = pca.fit_transform(X_train_std)
X_test_std_pca = pca.transform(X_test_std)

# Train a forest with the selected hyper-parameters on the projected data.
RF_pca = RandomForestClassifier(
    n_estimators=180,
    criterion='entropy',
    class_weight='balanced',
)
RF_pca.fit(X_train_std_pca, y_train)
y_pred_test_rf_pca = RF_pca.predict(X_test_std_pca)

# Per-class precision / recall / F1 on the test set, 4 decimals.
print(classification_report(y_test, y_pred_test_rf_pca, digits=4))

Vemos que PCA tampoco mejora respecto al modelo RandomForestClassifier(class_weight='balanced', criterion='entropy', n_estimators=180)

In [None]:
# Confusion matrix of the grid-searched forest on the test set
# (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, y_pred_gs)

In [None]:
# Annotated heatmap of the confusion matrix, with integer cell labels.
cm_ax = sns.heatmap(cm, annot=True, fmt='4d')
cm_ax.set_xlabel('Predichos')
cm_ax.set_ylabel('Reales')

In [None]:
# Specificity = TN / (TN + FP), read from the first row of the confusion matrix.
true_neg, false_pos = cm[0, 0], cm[0, 1]
specificity = true_neg / (true_neg + false_pos)

# Headline metrics of the grid-searched forest on the test set, 4 decimals.
for label, value in (
    ('Accuracy=      ', accuracy_score(y_test, y_pred_gs)),
    ('Recall=        ', recall_score(y_test, y_pred_gs)),
    ('Precision=     ', precision_score(y_test, y_pred_gs)),
    ('Specificity =  ', specificity),
    ('F1 Score=      ', f1_score(y_test, y_pred_gs)),
):
    print(label, value.round(4))

In [None]:
# Per-class precision / recall / F1 of the grid-searched forest, 4 decimals.
print(classification_report(y_test, y_pred_gs, digits=4))

# CASO DE NEGOCIO

Nuestro cliente está interesado en ofrecerle un beneficio en su próximo vuelo a quienes estén insatisfechos para lograr cambiar la percepción que tienen de la compañía. 

Teniendo en cuenta que este beneficio le cuesta dinero a la aerolínea, nos solicitan focalizarnos en **encontrar los clientes realmente insatisfechos para no incurrir en gastos innecesarios**. 

De acuerdo a su planteo, decidimos reducir los falsos negativos (FN), es decir, los clientes satisfechos incorrectamente clasificados como no satisfechos. 
Sabemos que reducir los FN implica sacrificar precisión para poder **incrementar la sensibilidad (recall)**. 

In [None]:
# Predicted probability of class 1 ("satisfied") for each test row.
satisfied_proba = best_model.predict_proba(X_test_std)[:, 1]
# Lowered decision threshold: predict 1 whenever that probability exceeds 0.25.
y_test_pred_25 = (satisfied_proba > 0.25).astype('int')

In [None]:
# Confusion matrix of the 0.25-threshold predictions on the test set
# (rows: true labels, columns: predicted labels).
cm_25 = confusion_matrix(y_test, y_test_pred_25)

In [None]:
# Annotated heatmap of the 0.25-threshold confusion matrix, with integer cell labels.
cm_25_ax = sns.heatmap(cm_25, annot=True, fmt='4d')
cm_25_ax.set_xlabel('Predichos')
cm_25_ax.set_ylabel('Reales')

In [None]:
# Side-by-side report of the default-threshold predictions and the
# 0.25-threshold predictions: accuracy, recall, precision and F1, 4 decimals.
metric_functions = (accuracy_score, recall_score, precision_score, f1_score)
report_sections = (
    (
        'CASO ORIGINAL',
        '---------------------',
        y_pred_gs,
        ('Accuracy       =  ', 'Recall         =  ', 'Precision      =  ', 'F1 Score       =  '),
    ),
    (
        'MODIFICADO CASO NEGOCIO',
        '------------------------',
        y_test_pred_25,
        ('Accuracy_25    =  ', 'Recall_25      =  ', 'Precision_25   =  ', 'F1 Score_25    =  '),
    ),
)

print(' ')
for position, (title, rule, predictions, labels) in enumerate(report_sections):
    if position:
        # Extra spacer line between the two sections.
        print(' ')
    print(title)
    print(rule)
    for label, metric in zip(labels, metric_functions):
        print(label, metric(y_test, predictions).round(4))
    print(' ')

In [None]:
# Per-class precision / recall / F1 of the 0.25-threshold predictions, 4 decimals.
print(classification_report(y_test, y_test_pred_25, digits=4))

##### Vemos que los falsos negativos disminuyeron de 677 a 307, el recall aumentó 3.3 puntos y la precisión disminuyó 7.3 puntos

Tal como solicitó nuestro cliente, si bien al bajar el umbral estamos clasificando erróneamente a más clientes insatisfechos como satisfechos (sobre los que la compañía no va a accionar), 
logramos que la aerolínea identifique mejor los casos realmente insatisfechos para no incurrir en gastos innecesarios en su campaña. 