In [None]:
import pandas as pd

# Import the dataset

In [None]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/titanic/train.csv')
# Display the loaded frame.
df

# Exploratory Data Analysis

In [None]:
import plotly.express as px

# Scatter plot with Fare on the x-axis and Age on the y-axis,
# each point colored by the value of the Survived column.
fig = px.scatter(df, x='Fare',y = 'Age',color ='Survived' )
fig.show()

In [None]:
# Pie chart of how many rows fall into each Survived value
# (the column is passed explicitly as the sector names).
px.pie(df, names='Survived')

# Preprocessing

## Handling unwanted and missing values

In [None]:
# Fraction of missing values in each column.
df.isna().mean()

Embarked is useless

In [None]:
# Remove the Embarked column from the training frame (modifies df in place).
df.drop(columns=['Embarked'], inplace=True)

Cabin is useless

In [None]:
# Remove the Cabin column from the training frame (modifies df in place).
df.drop(columns=['Cabin'], inplace=True)

Replace Age N/A with mean

In [None]:
# Number of missing values per column, checked before Age is imputed.
df.isna().sum()

In [None]:
# Mean of the Age column, rounded with Python's built-in round();
# used in the next cell as the fill value for missing ages.
age_mean = round(df['Age'].mean())
# Display the computed fill value.
age_mean

In [None]:
# Fill missing ages with the rounded mean age computed above.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and leaves df unmodified when Copy-on-Write is
# enabled, which would let NaN ages reach the models.
df['Age'] = df['Age'].fillna(age_mean)

In [None]:
# Re-count missing values per column to check that Age no longer has any.
df.isna().sum()

Passenger ID is useless

In [None]:
# Remove the PassengerId column from the training frame (modifies df in place).
df.drop(columns='PassengerId', inplace=True)

Name is useless

In [None]:
# Remove the Name column from the training frame (modifies df in place).
df.drop(columns='Name', inplace=True)

Ticket is useless

In [None]:
# Remove the Ticket column from the training frame (modifies df in place).
df.drop(columns='Ticket', inplace=True)

In [None]:
# Show the training frame after the column drops above.
df

In [None]:
# Fraction of missing values in each remaining column.
df.isna().mean()

## Handling categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training Sex column.
# fit() returns the encoder itself, so lb_make holds the fitted encoder.
lb_make = LabelEncoder().fit(df['Sex'])

# Replace the string labels with their integer codes.
df['Sex'] = lb_make.transform(df['Sex'])

In [None]:
# Show the training frame with Sex now label-encoded.
df

## Data split

In [None]:
# Select the target by name rather than by position, so the feature/target
# split does not silently change if the column order of df ever changes.
y = df['Survived']

# Every remaining column is a feature.
X = df.drop(columns='Survived')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the shuffle deterministic, so the split -- and
# every accuracy printed in the Models section -- is the same on each re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Normalization

In [None]:
# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler

# learn the per-feature minimum and maximum from the training split only
norm = MinMaxScaler()
norm.fit(X_train)

# scale the training split with the fitted scaler
X_train_norm = norm.transform(X_train)

# scale the held-out split with the same fitted scaler
X_test_norm = norm.transform(X_test)

# Models

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score as acc

# Logistic regression with default settings, trained on the scaled training
# split. fit() returns the estimator, so creation and training are chained.
lr = LogisticRegression().fit(X_train_norm, y_train)

# Predict the held-out split and report accuracy against the true labels.
predictions_lr = lr.predict(X_test_norm)
print(acc(y_test, predictions_lr))

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score as acc

# Decision tree trained on the scaled training split.
# random_state fixes the estimator's internal randomness, so the same split
# always produces the same tree and the same accuracy.
drc = DecisionTreeClassifier(random_state=42)

drc.fit(X_train_norm, y_train)

# Predict the held-out split and report accuracy against the true labels.
predictions_drc = drc.predict(X_test_norm)

print(acc(y_test, predictions_drc))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 6, trained on the scaled training
# split. random_state fixes the bootstrap and feature sampling, so the
# reported accuracy is reproducible for a given split.
rfc = RandomForestClassifier(max_depth=6, random_state=42)

rfc.fit(X_train_norm, y_train)

# Predict the held-out split and report accuracy against the true labels
# (acc is the accuracy_score alias imported in the model cells above).
predictions_rfc = rfc.predict(X_test_norm)

print(acc(y_test, predictions_rfc))

## Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, trained on the scaled
# training split. fit() returns the estimator, so the calls are chained.
svc = SVC().fit(X_train_norm, y_train)

# Predict the held-out split and report accuracy against the true labels.
predictions_svc = svc.predict(X_test_norm)
print(acc(y_test, predictions_svc))

## K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over 5 neighbours, trained on the
# scaled training split. fit() returns the estimator, so the calls are chained.
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train_norm, y_train)

# Predict the held-out split and report accuracy against the true labels.
predictions_neigh = neigh.predict(X_test_norm)
print(acc(y_test, predictions_neigh))

# Final training

Since Random Forest performed best, I will now retrain it on the whole training dataset to try to slightly improve the final score.

In [None]:
# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit a second scaler on the complete feature matrix X
norm_final = MinMaxScaler()
norm_final.fit(X)

# scale the complete feature matrix for the final fit
# NOTE(review): this applies `norm` (the scaler fitted on X_train in the
# Normalization section), not the `norm_final` fitted just above; in the cells
# shown, `norm_final` is never used for a transform. The test set is later
# scaled with `norm` as well, so training and prediction inputs currently share
# one scaling. Switching to `norm_final` has to be done here and in the
# test-set scaling cell at the same time, otherwise the two would diverge.
X_norm_final = norm.transform(X)

In [None]:
# Refit the random forest created in the Random Forest section on all
# labelled rows (X_norm_final, y) instead of only the 80% training split.
rfc.fit(X_norm_final, y)

# Preparing test set for model usage

In [None]:
# Load the test CSV; the path is relative to the notebook's working directory.
test = pd.read_csv('../input/titanic/test.csv')
# Display its (rows, columns) shape.
test.shape

## Unwanted columns

In [None]:
# Keep the passenger ids before the column is dropped from `test`;
# they are attached to the predictions again when the export frame is built.
PassengerId = test['PassengerId']

In [None]:
# Drop, in a single call, the same five columns that were removed from the
# training frame (modifies test in place).
test.drop(
    columns=['Cabin', 'PassengerId', 'Embarked', 'Name', 'Ticket'],
    inplace=True,
)

# Display the shape after the drops.
test.shape

In [None]:
# Shape of the training frame, for comparison with test.shape above.
df.shape

In [None]:
# Show the test frame after the column drops.
test

In [None]:
# Show the training frame so its columns can be compared with the test frame.
df

## Missing values

In [None]:
# Fraction of missing values in each test column.
test.isna().mean()

In [None]:
# Number of missing values in each test column, checked before imputing.
test.isna().sum()

* Every missing value in `Age` is replaced with the mean age of the passengers whose age is present in the dataset.
* `Fare` is handled the same way.

In [None]:
# Fill missing Age and Fare values in the test frame with that column's own mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas 2.x and leaves `test` unmodified when Copy-on-Write is
# enabled.
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Re-count missing values per test column to check that none are left.
test.isna().sum()

In [None]:
# Show the test frame after the missing values were filled.
test

## Categorical values

Categorical values are encoded the same way as in the "Handling categorical data" section above.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the test Sex column with `lb_make`, the encoder already fitted on the
# training Sex column in the "Handling categorical data" section.
# Re-fitting a fresh encoder on the test set could assign different integer
# codes than the ones the model was trained with; transform() applies the
# training mapping and raises if the test set contains an unseen label.
test['Sex'] = lb_make.transform(test['Sex'])

In [None]:
# Show the test frame with Sex now label-encoded.
test

In [None]:
# Show the training feature matrix to compare its columns with the test frame.
X

# Final predictions

In [None]:
# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit a scaler on the test features
# NOTE(review): `norm_final_test` is not used for any transform in the cells
# shown; the statement below scales with `norm` instead.
norm_final_test = MinMaxScaler()
norm_final_test.fit(test)

# scale the test features with `norm`, the scaler fitted on X_train in the
# Normalization section -- the same scaler that produced X_norm_final, the
# data the final model was trained on
X_norm_final_test = norm.transform(test)

In [None]:
# Predict on the scaled test features with the refitted random forest.
predictions_final = rfc.predict(X_norm_final_test)
# Display the raw prediction array.
predictions_final

## Validating the row count

In [None]:
# Wrap the prediction array in a DataFrame. Its single column gets the default
# label 0, which is renamed to 'Survived' further down.
predictions_final = pd.DataFrame(predictions_final)
predictions_final

In [None]:
# Number of prediction rows.
len(predictions_final)

In [None]:
# Number of test rows; this should equal the prediction count above.
test.shape[0]

## Exporting the predictions

In [None]:
# Attach the passenger ids saved before PassengerId was dropped from `test`.
# Column assignment aligns on the index; both objects carry the default
# 0..n-1 index (read_csv without index_col, DataFrame built from an array).
predictions_final['PassengerId'] = PassengerId

In [None]:
# Rename the default column label 0 to 'Survived' (modifies the frame in place).
predictions_final.rename(columns = {0:'Survived'}, inplace = True)

In [None]:
# Reorder the columns so PassengerId comes first and Survived second.
predictions_final = predictions_final[['PassengerId','Survived']]
predictions_final

In [None]:
# Pie chart of how many predictions fall into each Survived value
# (the column is passed explicitly as the sector names).
px.pie(predictions_final, names='Survived')

In [None]:
# Write the predictions as CSV text, with a header row and without the index.
# NOTE(review): the output name has no ".csv" extension -- confirm that
# whatever consumes this file accepts it as-is.
predictions_final.to_csv(
    path_or_buf='gender_submission',
    index=False,
    header=True,
)