In [58]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

Dataset Selection

In [59]:
# Titanic Dataset
titanic = pd.read_csv('titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data Preprocessing

In [60]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [61]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [62]:
# data cleaning
# fill missing values in age column using KNNImputer
age_col = titanic[['Age']]

# apply k-Nearest Neighbor imputer to handle missing values
imputer = KNNImputer(n_neighbors=3)
data_imputed = pd.DataFrame(imputer.fit_transform(age_col), columns=['Age'])

# replace null values with imputed values
titanic['Age'] = data_imputed['Age']
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [63]:
# Check for duplicates
titanic.duplicated().sum()

0

In [64]:
# Encoding categorical data
le = LabelEncoder()
titanic['Sex_encod'] = le.fit_transform(titanic['Sex'])  

In [65]:
# Feature scaling
scaler = StandardScaler()
titanic['Age_scaled'] = scaler.fit_transform(titanic[['Age']])

In [66]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encod,Age_scaled
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,-0.592481
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0.638789
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,-0.284663
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0.407926
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,0.407926


In [67]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_encod',
       'Age_scaled'],
      dtype='object')

Model Selection

In [68]:
# Split the dataset into train and test datasets
X = titanic[['Pclass', 'SibSp', 'Parch', 'Sex_encod', 'Age_scaled']]
y = titanic['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Model Training

In [69]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

In [70]:
# Decision Tree
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

In [71]:
# Model Predictions
# Logistic Regression
log_reg_predictions = log_reg.predict(X_test)
# Decision Tree
dtree_predictions = dtree.predict(X_test)

Model Evaluation

In [72]:
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)
dtree_accuracy = accuracy_score(y_test, dtree_predictions)
print('Logistic Regression Model Evaluation Report:\n')
print("Accuracy:", log_reg_accuracy)
print(classification_report(y_test,log_reg_predictions))
print('Decision Tree Model Evaluation Report:\n')
print("Accuracy:", dtree_accuracy)
print(classification_report(y_test,dtree_predictions))

Logistic Regression Model Evaluation Report:

Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       110
           1       0.74      0.75      0.75        69

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.81      0.80      0.80       179

Decision Tree Model Evaluation Report:

Accuracy: 0.7877094972067039
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       110
           1       0.75      0.67      0.71        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.78       179



Interpretation of Results

At accuracy of 0.8 the logistic regression model correctly predicts survivals 80% of the time, whereas, the dexision tree predixts it slightly less at 78% with accuracy of 0.78. Therefore, the logistic regression performs slightly better making it a more reliable model.

Dataset Selection

In [87]:
# Iris Dataset
iris = pd.read_csv('iris.csv')
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


Data Preprocessing

In [88]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [89]:
iris.isnull().sum()

sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

In [90]:
# Check for duplicates
iris.duplicated().sum()

1

In [91]:
# Drop duplicated
iris.drop_duplicates(inplace=True)
iris.duplicated().sum()

0

In [92]:
# Encoding categorical data
le = LabelEncoder()
iris['variety_encod'] = le.fit_transform(iris['variety'])  

In [93]:
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,variety_encod
0,5.1,3.5,1.4,0.2,Setosa,0
1,4.9,3.0,1.4,0.2,Setosa,0
2,4.7,3.2,1.3,0.2,Setosa,0
3,4.6,3.1,1.5,0.2,Setosa,0
4,5.0,3.6,1.4,0.2,Setosa,0


In [95]:
iris['variety_encod'].unique()

array([0, 1, 2])

In [94]:
iris.columns

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety',
       'variety_encod'],
      dtype='object')

Model Selection

In [97]:
# Split the dataset into train and test datasets
X = iris[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']]
y = iris['variety_encod']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Model Training

In [98]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

In [99]:
# Decision Tree
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

In [100]:
# Model Predictions
# Logistic Regression
log_reg_predictions = log_reg.predict(X_test)
# Decision Tree
dtree_predictions = dtree.predict(X_test)

Model Evaluation

In [101]:
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)
dtree_accuracy = accuracy_score(y_test, dtree_predictions)
print('Logistic Regression Model Evaluation Report:\n')
print("Accuracy:", log_reg_accuracy)
print(classification_report(y_test,log_reg_predictions))
print('Decision Tree Model Evaluation Report:\n')
print("Accuracy:", dtree_accuracy)
print(classification_report(y_test,dtree_predictions))

Logistic Regression Model Evaluation Report:

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00         8

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Decision Tree Model Evaluation Report:

Accuracy: 0.9666666666666667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.91      1.00      0.95        10
           2       1.00      0.88      0.93         8

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.96        30
weighted avg       0.97      0.97      0.97        30



Interpretation of Results

At accuracy of 1.0 the logistic regression model correctly predicts the variety 100% of the time, whereas, the dexision tree predixts it slightly less at 96% with accuracy of 0.96. Therefore, the logistic regression performs better making it a more reliable model.