In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score


In [None]:
# Load all the data

In [None]:
# Read the training file, keep the target as a one-column frame, and remove
# that column from the feature frame.
X_train = pd.read_csv('train.csv')
y_train = X_train.loc[:, ['Survived']]
X_train = X_train.drop(columns='Survived')

In [None]:
# Load the test data

X_test = pd.read_csv('test.csv')


In [None]:
# Display the test set to inspect its columns and values.
X_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
# Inspect the missing values per column.
# Cabin has a very large number of missing values, so it is better to drop it.
X_train.isnull().sum()

X_train = X_train.drop(columns=['Cabin'])
X_test = X_test.drop(columns=['Cabin'])

In [None]:
# Count the missing values left in each training column after dropping Cabin.
X_train.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

### Probamos a imputar la edad con la media

In [None]:
# Impute the missing ages with the mean age of the training set.
# The mean is taken from the column as a plain float *before* filling:
# calling int() on a one-element Series (what `X_train[['Age']].mean()` returns)
# is deprecated in recent pandas versions.
age_mean = X_train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(age_mean)

# Truncated to whole years only for the message below.
media = int(age_mean)
print(f'la edad media con la que se ha rellenado es de {media} años')

la edad media con la que se ha rellenado es de 29 años


In [None]:
# Re-count missing values per column now that Age has been imputed.
X_train.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [None]:
# The two remaining missing values can be imputed with the most common value
# or with a KNN imputer; look at the value frequencies first.
X_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
# Fill the missing Embarked values with 'S', the most frequent category.
# The fill is restricted to this one column so that 'S' can never be written
# into another column that still happens to contain missing values.
X_train['Embarked'] = X_train['Embarked'].fillna('S')

In [None]:
# Confirm that no training column has missing values left.
X_train.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [None]:
# Summarize column dtypes and non-null counts of the training frame.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Embarked     891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


# Ahora ya tenemos la base de datos con todos los valores rellenos,
## Podemos empezar el preprocesado

In [None]:
# One-hot encode the categorical columns with dense (non-sparse) output.
# The flag requesting dense output was renamed from `sparse` to `sparse_output`
# in newer scikit-learn releases and the old name was later removed, so try the
# new name first and fall back to the old one on older installations.
try:
    oh = OneHotEncoder(sparse_output=False)
except TypeError:
    oh = OneHotEncoder(sparse=False)

# Reuse X_train's index so the encoded columns line up row by row when the two
# frames are concatenated.
X_train_encoded = pd.DataFrame(
    oh.fit_transform(X_train[['Pclass', 'Sex', 'Embarked']]),
    columns=oh.get_feature_names_out(),
    index=X_train.index,
)



In [None]:
# Append the one-hot encoded columns to the right of the original training columns.
X_train = pd.concat(objs=[X_train, X_train_encoded], axis='columns')

In [None]:
# List the training columns, including the newly appended one-hot encoded ones.
X_train.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [None]:
# Impute the missing ages in the test set with the mean age of the training
# set, so the test data is preprocessed with training statistics only (the
# encoder and the scaler are likewise fitted on train and applied to test).
X_test['Age'] = X_test['Age'].fillna(X_train['Age'].mean())

In [None]:
# Check the missing values per column, then show the frequency of each
# Embarked value in the test set (only the last expression is displayed).
X_test.isnull().sum()
X_test['Embarked'].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [None]:
# Missing Embarked values in the test set are replaced with 'S' as well.
X_test['Embarked'] = X_test['Embarked'].fillna('S')

In [None]:
# Count the missing values left in each test column.
X_test.isnull().sum()


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
dtype: int64

In [None]:
# Do the same for the test data: apply the encoder that was fitted on the
# training data (transform only, no re-fitting).
X_test_encoded = pd.DataFrame(
    oh.transform(X_test[['Pclass', 'Sex', 'Embarked']]),
    columns=oh.get_feature_names_out(),
    index=X_test.index,  # keep rows aligned for the concatenation below
)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# Fare is the only column that still has a missing value in the test set.
# Fill just that column: a frame-wide `X_test.mean()` also tries to average the
# text columns (Name, Sex, Ticket, ...), which warns on older pandas and raises
# on pandas 2.x. The training mean is used so that the test data is imputed
# with training statistics only.
X_test['Fare'] = X_test['Fare'].fillna(X_train['Fare'].mean())

  X_test.fillna(X_test.mean() , inplace = True)


In [None]:
# List the test columns, including the appended one-hot encoded ones.
X_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

## Ahora que ya tenemos los datos preprocesados, solo falta quedarnos con las variables independientes

In [None]:
# Columns used as model inputs: the one-hot encoded categoricals together with
# the numeric columns, in the order the model will receive them.
indep = [
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Age', 'SibSp', 'Parch', 'Fare',
    'Sex_female', 'Sex_male',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]

In [None]:
# Keep only the model input columns, in the order given by `indep`.
X_train = X_train.loc[:, indep]
X_test = X_test.loc[:, indep]

In [None]:
# Preview the first rows of the selected training features.
X_train.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.0,0.0,1.0,22.0,1,0,7.25,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,38.0,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,26.0,0,0,7.925,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,35.0,1,0,53.1,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,35.0,0,0,8.05,0.0,1.0,0.0,0.0,1.0


# Ajustamos el modelo

In [None]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Flatten the target into a 1-D array before fitting the model.
y_train = np.ravel(np.array(y_train))

In [None]:
# Train a logistic regression, created with its default arguments, on the
# scaled training features. `modelo` is whatever `fit` returns.
feature_names = list(X_train.columns)
lr = LogisticRegression()
modelo = lr.fit(X_train_scaled, y_train)

In [None]:
# Sanity-check that the training features and the training labels have the
# same number of rows. Only names that already exist at this point of the
# notebook are used, so the cell also works after Restart & Run All.
X_train_scaled.shape, y_train.shape

(891, 1)

# Una vez que tenemos el modelo, evaluamos el rendimiento

In [None]:
# Predict the target for the scaled test features with the fitted model.
y_pred = modelo.predict(X_test_scaled)

In [None]:
# Predict on the training features and score those predictions against the
# training labels (i.e. this is accuracy on the data the model was fitted on).
y_pred_train = modelo.predict(X_train_scaled)
acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)

In [None]:
# `acc` is computed on the training data itself, so the message labels it as
# training accuracy; it is an optimistic figure, not a held-out estimate.
print(f'la accuracy del modelo en entrenamiento es del: {acc*100:.2f} %')

la accuracy del modelo es del: 80.13 %


In [None]:
# Shape of the scaled test feature matrix.
X_test_scaled.shape

(418, 12)

In [None]:
# Shape of the test predictions.
y_pred.shape

(418,)

In [None]:
# Re-read the test file and keep only the PassengerId column as a one-column frame.
PID = pd.read_csv('test.csv')[['PassengerId']]

In [None]:
# Build the submission table: passenger ids with the predicted Survived value
# attached as a second column.
results = PID.assign(Survived=y_pred)

In [None]:
# Display the submission table.
results

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Re-read and display the raw test file.
pd.read_csv('test.csv')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
# Write the predictions to results.csv without the index column.
results.to_csv('results.csv' , index = False)