# Getting familiar with Pandas and NumPy

### 1/ Importing modules (or library)

In [281]:
import numpy as np
import pandas as pd

### 2/ Import dataset

In [282]:
path = "./dataset/train.csv"
separator = ","     # This means the figures are separated by the ',' symbol like csv

df = pd.read_csv(path, sep=separator)
test = pd.read_csv("./dataset/test.csv")

After importing dataset, it is vital to looking for null value, shape, etc.

In [283]:
df.shape

(891, 12)

In [284]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 3/ Change "Sex" and "Embarked" columns to numerical values

In [285]:
df.loc[df["Sex"] == "male", "Sex"] = 0
df.loc[df["Sex"] == "female", "Sex"] = 1
    
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1

In [286]:
df.loc[df["Embarked"] == "S", "Embarked"] = 0
df.loc[df["Embarked"] == "C", "Embarked"] = 1
df.loc[df["Embarked"] == "Q", "Embarked"] = 2

test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2

### 4/ Add SimpleImputer

In [287]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

df["Age"] = imputer.fit_transform(df[["Age"]]).ravel()
df["Embarked"] = imputer.fit_transform(df[["Embarked"]]).ravel()

test["Age"] = imputer.fit_transform(test[["Age"]]).ravel()
test["Embarked"] = imputer.fit_transform(test[["Embarked"]]).ravel()
test["Fare"] = imputer.fit_transform(test[["Fare"]]).ravel()

# Building a model predictor

### 1/ Choosing features

In [288]:
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

X = df[features]
y = df["Survived"]

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    float64
dtypes: float64(3), int64(3), object(1)
memory usage: 48.9+ KB


### 2/ Split test & build model


In [289]:
# from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train = X
X_test = test[features]
y_train = y

model = RandomForestClassifier()

model.fit(X_train, y_train)

predictions = model.predict(X_test)

pd_pred = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": predictions})
pd_pred.to_csv("./dataset/submission.csv", index=False)