In [82]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd
import os
import math
import random

In [97]:


names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000, hidden_layer_sizes=(100,100), random_state=7),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [4]:
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

In [5]:
for i in datasets:
    print(type(i), len(i) , len(i[0]), len(i[1]))
    # print(i[0])
    print(len(i[0]), type(i[0]))

<class 'tuple'> 2 100 100
100 <class 'numpy.ndarray'>
<class 'tuple'> 2 100 100
100 <class 'numpy.ndarray'>
<class 'tuple'> 2 100 100
100 <class 'numpy.ndarray'>


In [69]:
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)
    print(len(X_train), len(X_test))
    print(X_train, type(X_train))

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(score, name)

60 40
[[ 0.30920485 -0.127041  ]
 [-0.48691402  0.57259859]
 [ 0.50954902 -1.66427178]
 [ 1.35242892  0.60859541]
 [ 1.89600064 -0.42653506]
 [ 0.74779523  0.54957438]
 [-1.19805934  1.40047603]
 [-1.4065875  -1.03687185]
 [-0.33075371 -0.55325533]
 [ 0.81773837  1.64300265]
 [-1.30667592 -0.60711187]
 [-0.62663858 -0.45097084]
 [-0.22120141  1.00868763]
 [-0.85151619 -0.06022663]
 [-0.59739923  0.27829883]
 [ 0.15581821 -1.14573741]
 [ 0.07813527  0.18433247]
 [ 0.24130156 -1.74371889]
 [-0.42443599  1.30854456]
 [ 0.0081454  -0.58656254]
 [ 0.03138162  1.27867747]
 [-2.02716273 -0.07552655]
 [-0.20216417  0.09829492]
 [-1.77093289 -0.15602456]
 [ 1.01975478 -0.39964896]
 [ 1.22467352 -1.45537741]
 [ 0.19895122  2.27694191]
 [ 0.27524467  0.58976897]
 [-0.9843448   1.29546509]
 [ 0.51038341 -1.09925804]
 [ 0.88796385 -1.22007405]
 [ 0.15940018 -1.05506347]
 [-0.31251963  1.85361272]
 [-1.72018154 -0.11323539]
 [-2.05161935  0.3295589 ]
 [-0.81445041  0.03881959]
 [-0.1636123  -1.53782

In [107]:
def preprocess(df_train):
    # print (df_train.head())
    print(df_train.info())
    print(df_train.shape)
    column_names = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']
    # changing the sex values 
    print(df_train['Sex'].unique())
    df_train['Sex'] = df_train['Sex'].replace(['male', 'female'], [0, 1])
    print(df_train['Sex'].unique())
    # SipSp
    print(df_train['SibSp'].unique())
    # Parch
    print(df_train['Parch'].unique())
    # Embarked
    print(df_train['Embarked'].unique(), len(df_train['Embarked'].unique()))
    df_train['Embarked'] = df_train['Embarked'].fillna(0)
    df_train['Embarked'] = df_train['Embarked'].replace(['S', 'C', 'Q'], [1, 2, 3])
    print(df_train['Embarked'].unique(), len(df_train['Embarked'].unique()))
    # dropping columns
    df_train = df_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    print(df_train.info())
    # changing nan from Age
    df_train['Age'] = df_train['Age'].fillna(0)
    print(df_train['Age'].unique())
    return df_train

In [116]:
# titanic dataset
train_file, test_file = os.path.join('datasets', 'train.csv'), os.path.join('datasets', 'test.csv')
df_train = pd.read_csv (train_file)   #read the csv file (put 'r' before the path string to address any special characters in the path, such as '\'). Don't forget to put the file name at the end of the path + ".csv"
df_test = pd.read_csv (test_file)
print(df_train.info(), df_test.info())
# y remove: 
y = df_train['Survived']
Y = y.to_numpy()
df_train = df_train.drop(columns=['Survived'])
# preprocess
df_train = preprocess(df_train)
# converting to numpy 
X = df_train.to_numpy()
print(X, Y)
# training
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.3)
print(len(X_train), len(X_test))
# print(X_train, type(X_train))

# iterate over classifiers
trained_classifiers = {}
for name, clf in zip(names, classifiers):
    print(clf.get_params())
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(score, name)
    trained_classifiers[name] = clf
print(df_train.info(), df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  



0.7835820895522388 Gaussian Process
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
0.8022388059701493 Decision Tree
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 1, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
0.7611940298507462 Random Forest
{'activation': 'relu', 'alpha': 1, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100, 100), 'learning_rate':

In [118]:
print(df_test.info())
pid = df_test['PassengerId']
print(pid)
df_test = preprocess(df_test)
X = df_test.to_numpy()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None
0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):