In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv("tested.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
def cleaner(df):
    for col in df.columns:
        if df[col].isnull().any():
            if df[col].dtype=="object":
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna(df[col].mean(), inplace=True)
    return df 

In [5]:
df = cleaner(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         418 non-null    float64
 10  Cabin        418 non-null    object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [7]:
def encodlash(df):
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()    
    for col in df.columns:
        if df[col].dtype=="object":
            if df[col].nunique()<=6:
                dummies = pd.get_dummies(df[col], prefix="col", dtype=int)
                df = pd.concat([df.drop(columns=col), dummies,], axis = 1)
            else:
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(df[col])
    return df 

In [8]:
df = encodlash(df)

In [9]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,col_female,col_male,col_C,col_Q,col_S
0,892,0,3,206,34.5,0,0,152,7.8292,15,0,1,0,1,0
1,893,1,3,403,47.0,1,0,221,7.0,15,1,0,0,0,1
2,894,0,2,269,62.0,0,0,73,9.6875,15,0,1,0,1,0
3,895,0,3,408,27.0,0,0,147,8.6625,15,0,1,0,0,1
4,896,1,3,178,22.0,1,1,138,12.2875,15,1,0,0,0,1


In [10]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    int64  
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    int64  
 8   Fare         418 non-null    float64
 9   Cabin        418 non-null    int64  
 10  col_female   418 non-null    int64  
 11  col_male     418 non-null    int64  
 12  col_C        418 non-null    int64  
 13  col_Q        418 non-null    int64  
 14  col_S        418 non-null    int64  
dtypes: float64(2), int64(13)
memory usage: 49.1 KB


In [11]:
def scalinglash(df):
    from sklearn.preprocessing import StandardScaler
    standart_scaler = StandardScaler()
    num_col = df.select_dtypes(include=["float64", "int64"])
    df[num_col]= standart_scaler.fit_transform(df[num_col])
    return df

In [12]:
df = scalinglash(df)

ValueError: Boolean array expected for the condition, not int64

In [13]:
x = df.drop("Survived", axis=1)
y = df['Survived']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Algoritm Decision Tree Classifier  


In [16]:
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier()

In [17]:
model_tree

In [18]:
model_tree.fit(x_train, y_train)

In [19]:
y_pred = model_tree.predict(x_test)
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1])

In [20]:
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import cross_val_score, KFold

In [21]:
score = accuracy_score(y_test, y_pred)
score

1.0

In [22]:
kn= KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model_tree, x, y, cv=kn, scoring="accuracy")
print(scores)

[1. 1. 1. 1. 1.]


In [23]:
print(np.mean(scores). round(2))

1.0


In [24]:
np.std(scores)

np.float64(0.0)

# Tuning 

# Hyperparameter

In [25]:
dt_params={
    'max_depth':[2,4,6,8,10],
    'criterion':['gini','entropy'],
    'min_samples_split':[2,5,10]
}

In [26]:
from sklearn.model_selection import GridSearchCV
dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(estimator=dt, param_grid=dt_params, cv=5, scoring='accuracy')
grid_dt.fit(x_train, y_train)

best_dt = grid_dt.best_estimator_


y_pred_dt = best_dt.predict(x_test)
print("Decision Tree Best Parameters:", grid_dt.best_params_)

Decision Tree Best Parameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}


In [27]:
score=accuracy_score(y_test,y_pred_dt)
score

1.0

# Algoritm Random Forest Classifier 

In [28]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()

In [29]:
model_rf.fit(x_train, y_train)

In [30]:
y_pred = model_rf.predict(x_test)
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1])

In [31]:
score = accuracy_score(y_test, y_pred)
score

1.0

In [32]:
kn= KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model_rf, x, y, cv=kn, scoring="accuracy")
print(scores)

[1. 1. 1. 1. 1.]


In [33]:
print(np.mean(scores). round(2))

1.0


In [34]:
np.std(scores)

np.float64(0.0)

# Algoritm KNN Classifier

In [35]:
from sklearn.neighbors import KNeighborsClassifier
model_KNN = KNeighborsClassifier(n_neighbors=5)

In [36]:
model_KNN.fit(x_train, y_train)

In [37]:
y_pred = model_KNN.predict(x_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1])

In [38]:
scoreKNN = accuracy_score(y_test, y_pred)
scoreKNN

0.5714285714285714

In [39]:
kn= KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model_KNN, x, y, cv=kn, scoring="accuracy")
print(scores)

[0.57142857 0.6547619  0.60714286 0.61445783 0.53012048]


In [40]:
print(np.mean(scores). round(2))

0.6


In [41]:
np.std(scores)

np.float64(0.042097054560309516)

# Algoritm Logistic Regression

In [42]:
from sklearn.linear_model  import  LogisticRegression
modelLOG_REg= LogisticRegression()

In [43]:
modelLOG_REg.fit(x_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
y_pred = modelLOG_REg.predict(x_test)
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1])

In [45]:
scoreLog_Reg = accuracy_score(y_test, y_pred)
scoreLog_Reg

1.0

In [46]:
kn= KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(modelLOG_REg, x, y, cv=kn, scoring="accuracy")
print(scores)

[1.         0.98809524 1.         1.         1.        ]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [47]:
print(np.mean(scores). round(2))

1.0


In [48]:
np.std(scores)

np.float64(0.004761904761904744)