In [26]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder             #E1: see at last 
from sklearn.impute import SimpleImputer                                  #E2: see at last 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import mean_squared_error


In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Loading the dataset
tr_data = pd.read_csv('drive/MyDrive/train.csv')                          #train dataset
ts_data = pd.read_csv('drive/MyDrive/test.csv')                           #test dataset
yts_data = pd.read_csv('drive/MyDrive/gender_submission.csv')             #y_true
   
tr_data = tr_data.drop(['Name','Ticket','Cabin'], axis=1)               
tr_data = tr_data.dropna()                                                # dropping rows corresponding to which no data given in "Age","Embarbed" 
ts_data = ts_data.drop(['Name','Ticket','Cabin'], axis=1)

x_train = tr_data.drop(['Survived'], axis=1)               
y_train = tr_data['Survived']

x_test = ts_data
y_test = yts_data


In [29]:
# nan values in "Embarked" col were a headache (IGNORE BOTTOM 3 CELLS)

In [30]:
x_train['Embarked'].isnull().values.any()

False

In [31]:
x_train.Embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [32]:
print(x_train)

     PassengerId  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0              1       3    male  22.0      1      0   7.2500        S
1              2       1  female  38.0      1      0  71.2833        C
2              3       3  female  26.0      0      0   7.9250        S
3              4       1  female  35.0      1      0  53.1000        S
4              5       3    male  35.0      0      0   8.0500        S
..           ...     ...     ...   ...    ...    ...      ...      ...
885          886       3  female  39.0      0      5  29.1250        Q
886          887       2    male  27.0      0      0  13.0000        S
887          888       1  female  19.0      0      0  30.0000        S
889          890       1    male  26.0      0      0  30.0000        C
890          891       3    male  32.0      0      0   7.7500        Q

[712 rows x 8 columns]


In [33]:
### ONE HOT ENCODING AND IMPUTING WAS DONE BEFORE DROPING ALL UNNECESSARY COLS SUCH AS NAME TICKET CABIN
### "CABIN" COULD HAVE BEEN AN IMP FEATURE BUT DUE TO INAPPROPRIATE DATA GIVEN FOR IT WAS DROPPED

In [34]:
# Preprocessing the data
    ## preprcs is done as the given dataset has some string values which the classifier cannot digest
    ## One-hot encoding used
categorical_cols = x_train.select_dtypes(include='object').columns
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    x_train[col] = label_encoders[col].fit_transform(x_train[col])

# Imputing missing values using mean strategy                                    
imputer = SimpleImputer(strategy='mean')
x_train = pd.DataFrame(imputer.fit_transform(x_train), columns=x_train.columns)

onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = pd.DataFrame(onehot_encoder.fit_transform(x_train[categorical_cols]))
X_encoded.columns = onehot_encoder.get_feature_names_out(categorical_cols)
x_train = pd.concat([x_train.drop(categorical_cols, axis=1), X_encoded], axis=1)


categorical_cols = x_test.select_dtypes(include='object').columns
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    x_test[col] = label_encoders[col].fit_transform(x_test[col])

# Imputing
x_test = pd.DataFrame(imputer.fit_transform(x_test), columns=x_test.columns)

onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = pd.DataFrame(onehot_encoder.fit_transform(x_test[categorical_cols]))
X_encoded.columns = onehot_encoder.get_feature_names_out(categorical_cols)
x_test = pd.concat([x_test.drop(categorical_cols, axis=1), X_encoded], axis=1)




In [35]:
print(x_train)

     PassengerId  Pclass   Age  SibSp  Parch     Fare  Sex_0.0  Sex_1.0  \
0            1.0     3.0  22.0    1.0    0.0   7.2500      0.0      1.0   
1            2.0     1.0  38.0    1.0    0.0  71.2833      1.0      0.0   
2            3.0     3.0  26.0    0.0    0.0   7.9250      1.0      0.0   
3            4.0     1.0  35.0    1.0    0.0  53.1000      1.0      0.0   
4            5.0     3.0  35.0    0.0    0.0   8.0500      0.0      1.0   
..           ...     ...   ...    ...    ...      ...      ...      ...   
707        886.0     3.0  39.0    0.0    5.0  29.1250      1.0      0.0   
708        887.0     2.0  27.0    0.0    0.0  13.0000      0.0      1.0   
709        888.0     1.0  19.0    0.0    0.0  30.0000      1.0      0.0   
710        890.0     1.0  26.0    0.0    0.0  30.0000      0.0      1.0   
711        891.0     3.0  32.0    0.0    0.0   7.7500      0.0      1.0   

     Embarked_0.0  Embarked_1.0  Embarked_2.0  
0             0.0           0.0           1.0  
1  

In [36]:
print(x_test)

     PassengerId  Pclass       Age  SibSp  Parch      Fare  Sex_0.0  Sex_1.0  \
0          892.0     3.0  34.50000    0.0    0.0    7.8292      0.0      1.0   
1          893.0     3.0  47.00000    1.0    0.0    7.0000      1.0      0.0   
2          894.0     2.0  62.00000    0.0    0.0    9.6875      0.0      1.0   
3          895.0     3.0  27.00000    0.0    0.0    8.6625      0.0      1.0   
4          896.0     3.0  22.00000    1.0    1.0   12.2875      1.0      0.0   
..           ...     ...       ...    ...    ...       ...      ...      ...   
413       1305.0     3.0  30.27259    0.0    0.0    8.0500      0.0      1.0   
414       1306.0     1.0  39.00000    0.0    0.0  108.9000      1.0      0.0   
415       1307.0     3.0  38.50000    0.0    0.0    7.2500      0.0      1.0   
416       1308.0     3.0  30.27259    0.0    0.0    8.0500      0.0      1.0   
417       1309.0     3.0  30.27259    1.0    1.0   22.3583      0.0      1.0   

     Embarked_0.0  Embarked_1.0  Embark

In [37]:
print(y_train)

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 712, dtype: int64


In [38]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

In [39]:
y_pred = knn.predict(x_test)
print(y_pred)

[0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 0]


In [40]:
knn_loss = mean_squared_error(y_test['Survived'],y_pred )

In [41]:
y_acc = knn.score(x_test,y_test['Survived'] )
print("KNN accuracy:",y_acc)

KNN accuracy: 0.645933014354067


In [42]:
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_loss = mean_squared_error(y_test['Survived'], nb.predict(x_test))

In [43]:
y_acc = nb.score(x_test,y_test['Survived'] )
print("Naive Bayes Accuracy:",y_acc)

Naive Bayes Accuracy: 0.861244019138756


In [44]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt_loss = mean_squared_error(y_test['Survived'], dt.predict(x_test))

In [45]:
y_acc = dt.score(x_test,y_test['Survived'] )
print("Decision Tree Accuracy:",y_acc)

Decision Tree Accuracy: 0.7511961722488039


In [46]:
# Printing the losses for each classifier
print("KNN Loss:", knn_loss)
print("Naive Bayes Loss:", nb_loss)
print("Decision Tree Loss:", dt_loss)

KNN Loss: 0.35406698564593303
Naive Bayes Loss: 0.13875598086124402
Decision Tree Loss: 0.24880382775119617


# Some errors with soln

***E1***

In [47]:
#------- preprocessing not done -----------------

# strings were present such as name and all in the dataset which the classifier could not understand 
# so converting then to 0, 1 values (mainly all 0)

***E2***

In [48]:
#------- NaN: after preprocessing data ------------

#       1 knn = KNeighborsClassifier()
# ----> 2 knn.fit(x_train, y_train)
#       3 knn_loss = mean_squared_error(y_test, knn.predict(x_test))

# ValueError: Input X contains NaN.
# KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier 
# and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data,
#  for instance by using an imputer transformer in a pipeline or drop samples with missing values.