# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Importing the dataset

In [None]:
# Read the Titanic train and test CSV files into DataFrames.
train_csv_path = "../input/titanic/train.csv"
test_csv_path = "../input/titanic/test.csv"
train_dataset = pd.read_csv(train_csv_path)
test_dataset = pd.read_csv(test_csv_path)

# EDA
## Having a look at the training dataset

In [None]:
# Preview the first rows of the training data.
train_dataset.head()

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
train_dataset.info()

## Let's know how many of the passengers survived

In [None]:
# Class balance of the target: absolute counts and percentages.
# value_counts() is indexed by the label itself (0 = deceased, 1 = survived).
survived_tally = train_dataset.Survived.value_counts()
passenger_total = train_dataset.Survived.count()

print(survived_tally)
print(f"Number of people that did not survive is {survived_tally[0]}")
print(f"Number of people that survived is {survived_tally[1]}")
print(f"Percentage of people that did not survive is {(survived_tally[0] / passenger_total) * 100}")
print(f"Percentage of people that survived is {(survived_tally[1] / passenger_total) * 100}")

### Plot the number of survived and deceased

In [None]:
# Bar chart comparing the number of survivors with the number of deceased.
plt.figure(figsize=(8, 9))
survival_labels = ["Survived", "Deceased"]
counts_by_outcome = train_dataset.Survived.value_counts()  # indexed by label: 0 / 1
survival_counts = [counts_by_outcome[1], counts_by_outcome[0]]
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=survival_labels, y=survival_counts)
plt.show()

- Number of deceased is greater than the number of survived

### Of the 342 passengers that survived, let's see how many were male and how many were female

In [None]:
# Count survivors separately for each sex.
survivors = train_dataset[train_dataset.Survived == 1]
male_survivor_count = len(survivors[survivors.Sex == 'male'])
female_survivor_count = len(survivors[survivors.Sex == 'female'])
print(f"Male Survival Count: {male_survivor_count}")
print(f"Female Survival Count: {female_survivor_count}")

### Plot the male and female survival visualization

In [None]:
# Bar chart of the number of survivors per sex.
gender_labels = ["Male", "Female"]
survived_mask = train_dataset.Survived == 1
gender_survival_counts = [
    len(train_dataset[survived_mask & (train_dataset.Sex == 'male')]),
    len(train_dataset[survived_mask & (train_dataset.Sex == 'female')]),
]
plt.figure(figsize=(12, 14))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=gender_labels, y=gender_survival_counts)
plt.show()

- The important discovery here is that more females than males survived. What could be the reason for this?

In [None]:
# Default pandas plot of the whole training frame on a 16x10 figure.
train_dataset.plot(figsize=(16, 10))
plt.show()

- Plotting the whole train_dataset does not tell us much. It's better to visualize the correlations using a heatmap

### What is the correlation between the features? Plot a heatmap to find out

In [None]:
# Correlation heatmap of the numeric features (PassengerId excluded).
train_dataset_without_passenger_id = train_dataset.drop(["PassengerId"], axis=1)
# Restrict to numeric columns explicitly: pandas >= 2.0 raises when .corr() meets
# string columns (Name, Sex, ...) instead of silently skipping them.
corr = train_dataset_without_passenger_id.select_dtypes(include="number").corr()
plt.figure(figsize=(16, 12))
sns.heatmap(corr)
plt.show()

## Plotting the Pairplot

In [None]:
# Seaborn pairplot of the training frame.
sns.pairplot(train_dataset)
plt.show()

In [None]:
# Passenger counts per survival outcome, split by sex.
plt.figure(figsize=(16, 12))
sns.countplot(data=train_dataset, x="Survived", hue="Sex")
plt.show()

### Now observing each column what it means and how much impact it has on survived column

In [None]:
# Re-check the column list and dtypes before describing each column below.
train_dataset.info()

In [None]:
# (rows, columns) of the training frame.
train_dataset.shape

- `PassengerId` - Id of the passenger. It's integer type and does not affect the survived column
- `Survived` - target variable denoted by 0 - Deceased and 1 - Survived
- `Pclass` - Class of the passenger 1- First class 2-Second class and 3- third class
- `Name` - Name of the passenger
- `Sex` - Sex of the passenger
- `Age` - Age of the passenger
- `SibSp` - Number of siblings/spouses
- `Parch` - Number of parents / children
- `Ticket` - Ticket number
- `Fare` - Fare of the passenger
- `Cabin` - Cabin number
- `Embarked` - Embarkation port (C - Cherbourg, Q - Queenstown, S - Southampton)

### There are 891 rows and 12 columns
- Plot barplot for numerical columns and pivot table for categorical columns
- PassengerId will not affect the Survived column so we can skip it 

In [None]:
# PassengerId is only an identifier, so remove it from the training frame.
train_dataset = train_dataset.drop(columns=["PassengerId"])

In [None]:
# Split the column names by dtype: everything that is not object-typed is treated
# as numerical, the object-typed columns as categorical.
numerical_cols = train_dataset.select_dtypes(exclude="object").columns
print(numerical_cols)

categorical_cols = train_dataset.select_dtypes(include="object").columns
print(categorical_cols)

#### Plotting histograms for all numerical columns

In [None]:
# One histogram per numerical column, each shown as its own figure.
for col in numerical_cols:
    column_values = train_dataset[col]
    plt.hist(column_values)
    plt.title(col)
    plt.show()

In [None]:
# One bar chart of value frequencies per categorical column.
for col in categorical_cols:
    # Compute the frequencies once instead of twice per column.
    value_counts = train_dataset[col].value_counts()
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.barplot(x=value_counts.index, y=value_counts).set_title(col)
    plt.show()

In [None]:
# For every column except Ticket and Survived, print a Survived-by-value count table.
# Ticket is the counted value column and Survived is the row index, so both are skipped.
for col in train_dataset.columns:
    if col in ("Ticket", "Survived"):
        continue
    survival_pivot = pd.pivot_table(
        train_dataset, index="Survived", columns=col, values="Ticket", aggfunc='count'
    )
    print(f"Pivot table for {col} is ")
    print("--------------------------")
    print(survival_pivot)
    print("\n")

- We note that name of the passenger does not affect the Survival of the passenger

# Taking care of missing data

In [None]:
# Per-column count of missing values, largest first, as a one-column "Total" table.
nans = train_dataset.isnull().sum().sort_values(ascending=False)
missing_data = nans.to_frame(name="Total")

In [None]:
# Display the per-column missing-value counts (sorted in descending order).
missing_data

- From this we can see that Cabin and Age have a lot of missing data
- We will delete the Cabin column, delete the 2 rows with a missing Embarked value and, since Age plays some role, replace its NaNs with the mean value


In [None]:
# Drop every column that has more missing values than the threshold.
# 177 is the cut-off used by this notebook; a column with exactly 177 gaps is kept.
MISSING_COUNT_THRESHOLD = 177
sparse_columns = missing_data[missing_data["Total"] > MISSING_COUNT_THRESHOLD].index
# axis must be passed by keyword: pandas >= 2.0 removed the positional form drop(labels, 1).
train_dataset = train_dataset.drop(sparse_columns, axis=1)
print(train_dataset.shape)
train_dataset.head()

In [None]:
# Shape and first rows of the raw test data.
print(test_dataset.shape)
test_dataset.head()

- To give the train and test feature sets the same columns, we have to drop `PassengerId` and `Cabin` from the test dataset and separate the `Survived` column from the train dataset

In [None]:
# Test features: the raw test frame without the identifier and the Cabin column.
X_test = test_dataset.drop(columns=['PassengerId', 'Cabin'])

In [None]:
# Shape of the test features, followed by a display of the frame itself.
print(X_test.shape)
X_test


In [None]:
# Remove the Name feature from both the train and the test data.
train_dataset = train_dataset.drop(columns=["Name"])
X_test = X_test.drop(columns=["Name"])

In [None]:
# Compare the shapes of the train frame and the test features.
print(train_dataset.shape, X_test.shape, sep="\n")


- The train dataset has one more column than the test dataset because `Survived` is not present in the test dataset.

### Deleting the rows with a missing Embarked value and replacing missing Age values with the mean

In [None]:
# Remove the rows whose Embarked value is missing, then fill missing ages with the mean age
# of the remaining rows.
train_dataset = train_dataset.dropna(subset=["Embarked"])
mean_train_age = train_dataset["Age"].mean()
train_dataset["Age"] = train_dataset["Age"].fillna(mean_train_age)

In [None]:
# Largest per-column NaN count in the train dataset; 0 means no missing data is left.
train_dataset.isna().sum().max()

# Taking care of missing data for test dataset

In [None]:
# Per-column NaN counts of the test features, largest first.
X_test.isna().sum().sort_values(ascending=False)

In [None]:
# Fill missing Age and Fare values in the test features with that column's own mean
# (means are taken from the test features themselves).
test_fill_values = {
    "Age": X_test["Age"].mean(),
    "Fare": X_test["Fare"].mean(),
}
for column_name, fill_value in test_fill_values.items():
    X_test[column_name] = X_test[column_name].fillna(fill_value)

In [None]:
# Largest per-column NaN count in the test features; 0 means no missing data is left.
X_test.isna().sum().max()

# Encoding the Categorical data

## First Feature Engineering the `Ticket` Parameter

- The problem with encoding `Ticket` is that nearly every ticket is different and some contain letters, so one-hot encoding it would create far too many columns

#### - Taking care of Tickets data
- You may observe that if a ticket contains letters, there is a space between the letters and the number
- Let's split each ticket value on whitespace and keep the numeric part

In [None]:
def _ticket_number(tickets):
    """Return the trailing numeric part of every ticket as a float Series.

    The ticket is split on whitespace and its last token is parsed as a number,
    so "A/5 21171" becomes 21171.0 and "113803" becomes 113803.0. Tickets whose
    last token is not numeric, or that are empty, become 0.0.

    The previous implementation mapped every ticket without a space to 0, which
    threw away the number of all purely numeric tickets. It also failed when the
    cell was re-run, because the values were floats by then; casting to str first
    keeps this version re-runnable.
    """
    last_token = tickets.astype(str).str.split().str[-1]
    return pd.to_numeric(last_token, errors="coerce").fillna(0).astype(float)


train_dataset["Ticket"] = _ticket_number(train_dataset["Ticket"])
X_test["Ticket"] = _ticket_number(X_test["Ticket"])

## Now taking all the categorical columns and applying OneHotEncoder

In [None]:
# Build a transformer that one-hot encodes every object-dtype column of the train data
# and passes all other columns through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_cols = train_dataset.select_dtypes(include="object").columns
print(categorical_cols)

one_hot_step = ('encoder', OneHotEncoder(), categorical_cols)
ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')

In [None]:
# Training features: every column of the train frame except the target.
X = train_dataset.drop(columns=["Survived"])

In [None]:
# Fit the column transformer on the training features and replace X with the encoded result.
X = ct.fit_transform(X)

In [None]:
# Shape of the encoded training features.
X.shape

In [None]:
# Encode the test features with the transformer fitted on the training features.
# NOTE(review): this overwrites X_test with the transformed output, so re-running this
# cell on its own presumably fails; re-run from the cell that builds X_test instead.
X_test = ct.transform(X_test)

In [None]:
# Shape of the encoded test features.
X_test.shape

## Assigning the target variable

In [None]:
# Target vector: the Survived column of the cleaned train frame.
y = train_dataset["Survived"]

In [None]:
# Number of target values; should match the number of rows in X.
y.shape

In [None]:
# Shape of the encoded training features, for comparison with y.
X.shape

# Splitting the training data into train and validation datasets

In [None]:
from sklearn.model_selection import train_test_split
# Keep 80% of the rows for training and hold out the rest for validation; fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

# Building the model base with different classification techniques

## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library defaults and a fixed random_state.
classifier = LogisticRegression(random_state=42)

### Training the model

In [None]:
# Fit the classifier on the training split.
classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the validation split.
y_val_predict = classifier.predict(X_val)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the validation predictions against the true validation labels.
accuracy_score(y_val, y_val_predict)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true validation labels vs. predicted labels.
con_mat = confusion_matrix(y_val, y_val_predict)
con_mat

In [None]:
# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(16, 12))
sns.heatmap(con_mat, annot=True)
plt.show()

In [None]:
# Predict labels for the encoded test features with the logistic regression model.
y_preds = classifier.predict(X_test)

# Another way to apply the model is through cross val score method given in scikit learn
- Thanks to [this notebook](https://www.kaggle.com/kenjee/titanic-project-example), I found out about cross_val_score.

In [None]:
# 5-fold cross-validation of a fresh logistic regression on the training split:
# prints the per-fold scores, then their mean.
from sklearn.model_selection import cross_val_score
logistic_clf_model = LogisticRegression(random_state=42)
cv_score = cross_val_score(estimator=logistic_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

## 2. Support Vector Machine 

In [None]:
# 5-fold cross-validation of a support vector classifier on the training split:
# prints the per-fold scores, then their mean.
from sklearn.svm import SVC
svc_clf_model = SVC(gamma="auto", random_state=42)
cv_score = cross_val_score(estimator=svc_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

- This is an improvement on Logistic Regression

## 3. Naive Bayes (GaussianNB, MultinomialNB)

### 3.1 GaussianNB

In [None]:
# 5-fold cross-validation of Gaussian naive Bayes on the training split:
# prints the per-fold scores, then their mean.
from sklearn.naive_bayes import GaussianNB
gnb_clf_model = GaussianNB()
cv_score = cross_val_score(estimator=gnb_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

### 3.2 MultinomialNB

In [None]:
# 5-fold cross-validation of multinomial naive Bayes on the training split:
# prints the per-fold scores, then their mean.
from sklearn.naive_bayes import MultinomialNB
mnb_clf_model = MultinomialNB()
cv_score = cross_val_score(estimator=mnb_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

- Gaussian NB performed worse than the SVM
- Multinomial NB performed poorly compared to all the models above

## 4. Stochastic Gradient Descent Classifier

In [None]:
# 5-fold cross-validation of an SGD classifier on the training split:
# prints the per-fold scores, then their mean.
from sklearn.linear_model import SGDClassifier
# Seed the classifier like the other seeded models in this notebook; without a
# random_state the reported score changes from run to run.
sgd_clf_model = SGDClassifier(random_state=42)
cv_score = cross_val_score(sgd_clf_model, X_train, y_train, cv=5)
print(cv_score)
print(cv_score.mean())

- Performed better than Naive Bayes and Logistic Regression but not better than SVM

## 5. KNN (K-Nearest Neighbors)

In [None]:
# 5-fold cross-validation of a 5-neighbour KNN classifier on the training split:
# prints the per-fold scores, then their mean.
from sklearn.neighbors import KNeighborsClassifier
neigh_clf_model = KNeighborsClassifier(n_neighbors=5)
cv_score = cross_val_score(estimator=neigh_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

**Interesting**
- KNN performed better than all the above models 

## 6. Decision Tree 

In [None]:
# 5-fold cross-validation of a decision tree on the training split:
# prints the per-fold scores, then their mean.
from sklearn.tree import DecisionTreeClassifier
dt_clf_model = DecisionTreeClassifier(random_state=42)
cv_score = cross_val_score(estimator=dt_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

- DecisionTreeClassifier Model performed best till now

## 7. Random Forest

In [None]:
# 5-fold cross-validation of a random forest (150 trees, depth limit 7) on the training split:
# prints the per-fold scores, then their mean.
from sklearn.ensemble import RandomForestClassifier
rf_clf_model = RandomForestClassifier(n_estimators=150, max_depth=7, random_state=42)
cv_score = cross_val_score(estimator=rf_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

- Random Forest performed best

## 8. Gradient Boosting Classifier

In [None]:
# 5-fold cross-validation of gradient boosting on the training split:
# prints the per-fold scores, then their mean.
from sklearn.ensemble import GradientBoostingClassifier
gb_clf_model = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)
cv_score = cross_val_score(estimator=gb_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

- Gradient Boosting did not perform better than Random Forest

## 9. LGBM Classifier

In [None]:
# 5-fold cross-validation of a LightGBM classifier on the training split:
# prints the per-fold scores, then their mean.
from lightgbm import LGBMClassifier
lgbm_clf_model = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=7,
    random_state=42,
)
cv_score = cross_val_score(estimator=lgbm_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

## 10. XGBoost Classifier

In [None]:
# 5-fold cross-validation of an XGBoost classifier on the training split:
# prints the per-fold scores, then their mean.
import xgboost as xgb
xgb_clf_model = xgb.XGBClassifier(random_state=42)
cv_score = cross_val_score(estimator=xgb_clf_model, X=X_train, y=y_train, cv=5)
print(cv_score, cv_score.mean(), sep="\n")

# Now training the dataset with the best model
**RandomForestClassifier**

In [None]:
# Fit the random forest on the training split (the validation rows are not included).
rf_clf_model.fit(X_train, y_train)

In [None]:
# Predict labels for the encoded test features; replaces the earlier logistic regression y_preds.
y_preds = rf_clf_model.predict(X_test)

In [None]:
# Pair each test PassengerId with its predicted label and write the submission file.
submission_columns = {
    'PassengerId': test_dataset["PassengerId"],
    'Survived': y_preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)