In [235]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
import operator
import os

## DATA

In [236]:
csv_path = r"C:\Users\sofia.martinez_bluet\Downloads\train.csv"

# Verificar si el archivo existe en la ruta especificada
if os.path.exists(csv_path):
    # Leer el archivo CSV
    data = pd.read_csv(csv_path, na_values=["nan"])
    # Mostrar las primeras filas del DataFrame para verificar
    print(data.head())
else:
    print(f"El archivo {csv_path} no se encuentra en la ruta especificada.")

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [237]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [238]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [239]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Preprocessing

In [240]:
# DELETION OF IRRELEVANT COLUMNS
data = data.drop(columns=["Name", "Ticket","Cabin"])

#### Missing data Treatment

In [241]:
# MISSING DATA TREATMENT
data.isnull().sum()

# Age is quantitative
# Embarked is qualitative - because it is only 2 we can erase them

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [242]:
#### QUANTITATIVE VARIABLES ####

## posible imputation methods

# imputation_methods = {
#     "Mean": SimpleImputer(strategy='mean'),
#     "Median": SimpleImputer(strategy='median'),
#     "Mode": SimpleImputer(strategy='most_frequent'),
#     "KNN": KNNImputer(n_neighbors=2),
#     "Iterative": IterativeImputer(random_state=0)
# }

# ----------------------------------------------------------------
from sklearn.impute import KNNImputer

numeric_columns = ['Age']
# Extraer las columnas numéricas
df_numeric = data[numeric_columns]

# Instantiate the KNNImputer
imputer = KNNImputer(n_neighbors=2)

# Fit and transform the data
df_numeric_imputed = pd.DataFrame(imputer.fit_transform(df_numeric), columns=numeric_columns)

data['Age'] = df_numeric_imputed['Age']

In [243]:
### QUALITATIVE VARIABLES ####

## posible imputation methods

# imputation_methods = {
#     "Most_frequent": SimpleImputer(strategy='most_frequent'),
#     "Constant": SimpleImputer(strategy='constant', fill_value='Unknown'),
# }

# 3. Predictive Imputation using KNN (Example with one column for simplicity)
    # Encode the categorical values to numerical for the KNN model
    # Separate the rows with missing values and those without
    # Use KNN to predict missing 'Seasons' values
    # Predict the missing values
    # Fill other columns similarly (not shown for brevity)

In [244]:
# delete 2 rows of missing data for the column Embarked
data = data.dropna(subset=["Embarked"])

#### One Hot Encoder

In [245]:
df_encoded = pd.get_dummies(data, columns=['Sex'], prefix=['Sex'])
data = pd.get_dummies(df_encoded, columns=['Embarked'], prefix=['Embarked'])


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,2,1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.000000,0,0,7.9250,1,0,0,0,1
3,4,1,1,35.000000,1,0,53.1000,1,0,0,0,1
4,5,0,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,0,1,0,0,1
887,888,1,1,19.000000,0,0,30.0000,1,0,0,0,1
888,889,0,3,29.699118,1,2,23.4500,1,0,0,0,1
889,890,1,1,26.000000,0,0,30.0000,0,1,1,0,0


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
## COMPLEX MODEL
# In case we want to add hyperparameter tuning and Cross-validation we can add it like this, 
# we have to add it to the basic model without the previoud fit - 
# WE HAVE TO ERASE LINE: LR_model = model_p.fit(X=X_train0, y=y_train0)

model = LogisticRegression()

model_p = Pipeline ([
    ( 'preprocessor', pipeline ),
    ('classifier', model)])

#### HYPER-PARAMETERS  #######
# ranges for gamma and C
param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 1.0, 10.0, 100],
        'classifier__max_iter': [100, 1000, 10000]}

# Inner CV
cv = KFold(n_splits=3, shuffle=True, random_state=100483869)


# instantiation of the grid of hyperparameters that will be searched with cross validation

grid_search = GridSearchCV(model_p, #model
                   param_grid, # rango of value of the hyper-parameter to evaluate
                   scoring='f1',
                   cv=cv,
                   n_jobs=-1,
                   verbose=3) #-1 means use all processors



#START OF THE RECORD
LR_model_grid = grid_search.fit(X=X_train0, y=y_train0)