# <center>Beginner Friendly Tutorial</center>
> Start your Kaggle journey with this notebook and make your first Kaggle submission.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

## Exploring data

In [None]:
train = pd.read_csv('../input/titanic/train.csv')
train.head()

In [None]:
train.info()

## Data Descriptions

Survived: Survival, 0 = No, 1 = Yes

Pclass (Ticket class): 1 = 1st, 2 = 2nd, 3 = 3rd

Sex: Sex of the passenger

Age: Age in years

SibSp: number of siblings/spouses aboard the Titanic

Parch: number of parents/children aboard the Titanic

Ticket: Ticket number

Fare: Passenger fare

Cabin: Cabin number

Embarked: Port of Embarkation, C = Cherbourg, Q = Queenstown, S = Southampton

In [None]:
train.describe(include='all')

In [None]:
train.dtypes

In [None]:
train.shape

### Check for missing data

In [None]:
train.isna().sum()

In [None]:
sb.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

- We can clearly see missing values in the 'Age' and 'Cabin' columns.


In [None]:
train['Age'].isna().sum()

- Missing values in the 'Age' column make up roughly one fifth of our data, so instead of dropping those rows we will fill them with an average age.

#### Handling missing values

- A simple option is to fill the missing values in the 'Age' column with the overall average age.
- However, we can be smarter about this and use the typical age within each passenger class.

In [None]:
plt.figure(figsize=(12, 7))
sb.boxplot(x='Pclass',y='Age',data=train,palette='winter')

We can fill the missing values with the typical age of each passenger class, read off the box plot above (the line inside each box is the median):
- Class 1 has a typical age of around 35.
- Class 2 has a typical age of around 30.
- Class 3 has a typical age of around 24.

In [None]:
def impute_age(cols):
    """Return the passenger's age, substituting a class-based default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``df[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 35 for class 1,
    30 for class 2 and 24 for any other class.
    """
    # Read by position explicitly: a bare ``cols[0]`` on a label-indexed Series
    # depends on pandas' deprecated positional fallback for integer keys.
    if hasattr(cols, 'iloc'):
        Age, Pclass = cols.iloc[0], cols.iloc[1]
    else:
        Age, Pclass = cols[0], cols[1]

    # Known ages are passed through untouched
    if not pd.isnull(Age):
        return Age

    if Pclass == 1:
        return 35
    if Pclass == 2:
        return 30
    return 24

In [None]:
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)

In [None]:
train['Age'].isna().sum()

In [None]:
train['Cabin'].isna().sum()

- Most of the values in 'Cabin' are missing, so we won't use this feature for modelling.

In [None]:
train['Embarked'].isna().sum()

- Since only 2 values are missing in the 'Embarked' column, we can simply drop those two rows.

In [None]:
train = train.dropna(subset=['Embarked'])

In [None]:
train['Embarked'].isna().sum()

In [None]:
#Check if any duplicate rows in dataset
train.duplicated().sum()

## Feature Selection

In [None]:
train.columns

In [None]:
# Keep the target plus the columns that will be used as model features
train_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train = train[train_columns]

## Visualizing data and finding relationships

In [None]:
sb.set_style('whitegrid')
sb.countplot(x='Survived',data=train,palette='RdBu')

In [None]:
sb.set_style('whitegrid')
sb.countplot(x='Survived',hue='Sex',data=train,palette='cool_r')

In [None]:
sb.set_style('whitegrid')
sb.countplot(x='Survived',hue='Pclass',data=train,palette='rainbow')

In [None]:
# Distribution of each numeric feature, drawn side by side on the top row of a 3x3 grid
plt.figure(figsize=(25, 20))
for position, column in enumerate(['Age', 'Fare', 'Pclass'], start=1):
    plt.subplot(3, 3, position)
    sb.distplot(train[column], color='darkred', bins=20)

plt.show()

In [None]:
sb.countplot(x='SibSp',data=train)

In [None]:
# Bar height is seaborn's default estimate of Age within each passenger class
age_by_class_ax = sb.barplot(x='Pclass', y='Age', data=train)
age_by_class_ax.set_xlabel('P class')
age_by_class_ax.set_ylabel('Age')

In [None]:
sb.countplot(x='Embarked',hue='Survived',data=train,palette='hot')

## Feature Encoding

###  Converting Categorical data

In [None]:
train.head()

In [None]:
# Expand each categorical column into one indicator column per category
df_embarked_one_hot = pd.get_dummies(train['Embarked'], prefix='embarked')
df_sex_one_hot = pd.get_dummies(train['Sex'], prefix='sex')
df_plcass_one_hot = pd.get_dummies(train['Pclass'], prefix='pclass')

In [None]:
# Append the one hot encoded columns to the training columns (column-wise concat);
# the original categorical columns are still present at this point.
train_one_hot = pd.concat([train, 
                        df_embarked_one_hot, 
                        df_sex_one_hot, 
                        df_plcass_one_hot], axis=1)
train_one_hot.head()



In [None]:
# The indicator columns now carry this information, so remove the raw categorical columns
train = train_one_hot.drop(columns=['Pclass', 'Sex', 'Embarked'])
train.head()

In [None]:
train.shape

## Building Machine Learning Models

### Split the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target label from the feature matrix
Y = train['Survived']
X = train.drop(columns='Survived')

In [None]:
# Fix the seed so the hold-out split, and every accuracy score computed from it,
# is the same on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

In [None]:
X_train.shape, Y_train.shape

## Train Models

## Gradient boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Seeded so the fitted model, and the predictions later written to the submission,
# are reproducible between runs.
gbc = GradientBoostingClassifier(random_state=0)

In [None]:
gbc.fit(X_train,Y_train)

In [None]:
Y_pred = gbc.predict(X_test)

In [None]:
# Share of hold-out rows predicted correctly; arguments in (y_true, y_pred) order
gbc_acc = accuracy_score(Y_test, Y_pred)
print("Accuracy for Gradient boosting Classifier : ", gbc_acc)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lgr = LogisticRegression()

In [None]:
lgr.fit(X_train,Y_train)

In [None]:
Y_pred = lgr.predict(X_test)

In [None]:
# Share of hold-out rows predicted correctly; arguments in (y_true, y_pred) order
lgr_acc = accuracy_score(Y_test, Y_pred)
print('Accuracy score for Logistic Regression: ', lgr_acc)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB() 
gnb.fit(X_train, Y_train) 

In [None]:
Y_pred = gnb.predict(X_test)

In [None]:
# Share of hold-out rows predicted correctly; arguments in (y_true, y_pred) order
gnb_acc = accuracy_score(Y_test, Y_pred)
print('Accuracy score for Naive Bayes: ', gnb_acc)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc= DecisionTreeClassifier(criterion='entropy', random_state=0)  
dtc.fit(X_train,Y_train)

In [None]:
Y_pred = dtc.predict(X_test)

In [None]:
# Share of hold-out rows predicted correctly; arguments in (y_true, y_pred) order
dtc_acc = accuracy_score(Y_test, Y_pred)
print('Accuracy score for Decision Tree Classifier: ', dtc_acc)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded (same value as the decision tree) so the forest's score in the
# comparison table is reproducible between runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train,Y_train)

In [None]:
Y_pred = rfc.predict(X_test)

In [None]:
# Share of hold-out rows predicted correctly; arguments in (y_true, y_pred) order
rfc_acc = accuracy_score(Y_test, Y_pred)
print('Accuracy score for Random Forest Classifier: ', rfc_acc)

## SVM 

In [None]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,Y_train)

In [None]:
Y_pred = svc.predict(X_test)

In [None]:
# Share of hold-out rows predicted correctly; arguments in (y_true, y_pred) order
svc_acc = accuracy_score(Y_test, Y_pred)
print('Accuracy score for SVM: ', svc_acc)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,Y_train)

In [None]:
Y_pred = knn.predict(X_test)

In [None]:
# Share of hold-out rows predicted correctly; arguments in (y_true, y_pred) order
knn_acc = accuracy_score(Y_test, Y_pred)
print('Accuracy score for KNN: ', knn_acc)

#### Make a dataframe for comparing our Machine Learning Models

In [None]:
# One (model name, hold-out accuracy) pair per classifier trained above
accuracy_by_model = [
    ('KNN', knn_acc),
    ('Logistic Regression', lgr_acc),
    ('Naive Bayes', gnb_acc),
    ('SVC', svc_acc),
    ('Random Forest', rfc_acc),
    ('Decision Tree', dtc_acc),
    ('Gradient Boosting Classifier', gbc_acc),
]
models = pd.DataFrame(accuracy_by_model, columns=['Model', 'Score'])
print("--- Accuracy Scores---")
# Best-scoring model first
models.sort_values(by='Score', ascending=False)

- In this run the highest accuracy score came from the "Gradient Boosting Classifier", so we will use it to make the final predictions on the test data.

## Test Model on test data

In [None]:
# Load the competition's test file; reading train.csv here would make the model
# "predict" rows it was trained on and give the submission the wrong passengers.
test = pd.read_csv('../input/titanic/test.csv')
test.head()

In [None]:
test.isna().sum()

In [None]:
test['Age'] = test[['Age','Pclass']].apply(impute_age,axis=1)

In [None]:
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
test.isna().sum()

### Feature selection in test data

In [None]:
test.columns

In [None]:
# Same feature columns as the training frame, plus PassengerId in place of the target
test_columns = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
test = test[test_columns]
test.head()

#### Convert categorical test data into numerical data

In [None]:
# One hot encode the categorical columns
df_embarked_one_hot_t = pd.get_dummies(test['Embarked'], 
                                     prefix='embarked')

df_sex_one_hot_t = pd.get_dummies(test['Sex'], 
                                prefix='sex')

df_plcass_one_hot_t = pd.get_dummies(test['Pclass'], 
                                   prefix='pclass')

In [None]:
# Append the one hot encoded columns to the test columns (column-wise concat);
# the original categorical columns are still present at this point.
test_one_hot = pd.concat([test, 
                        df_embarked_one_hot_t, 
                        df_sex_one_hot_t, 
                        df_plcass_one_hot_t], axis=1)


In [None]:
# The indicator columns now carry this information, so remove the raw categorical columns
test = test_one_hot.drop(columns=['Pclass', 'Sex', 'Embarked'])
test.head()

In [None]:
test.columns

In [None]:
test.shape

### Assign train and test data

In [None]:
# Use every labelled row for the final fit
X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
# The test frame was one-hot encoded on its own, so line its columns up with the
# training features (same names, same order); an indicator column that never
# occurred in the test data is filled with 0.
X_test  = test.drop("PassengerId",axis=1).reindex(columns=X_train.columns, fill_value=0)
X_train.shape, Y_train.shape, X_test.shape

## Gradient Boosting Classifier

In [None]:
gbc.fit(X_train, Y_train)
Y_pred_test = gbc.predict(X_test)

In [None]:
# The score is computed on the same rows the model was just fitted on, so it is a
# training accuracy; the test features carry no 'Survived' labels to score against.
acc = round(gbc.score(X_train, Y_train) * 100, 2)
print("Accuracy Score for Gradient Boosting Classifier on training data: ",acc)

In [None]:
#Check for final test result columns
Y_pred_test.shape

## Submission

In [None]:
# Build the submission frame: passenger ids alongside the model's predictions on the test dataset
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': Y_pred_test,
})
submission.head()

In [None]:
submission.shape

### Convert dataframe into csv file for submission

In [None]:
submission.to_csv(r'C:\Users\hp\Desktop\Kaggle\submission.csv',index=False)

In [None]:
# Check if we file is converted in csv and read it.
check = pd.read_csv(r'C:\Users\hp\Desktop\Kaggle\submission.csv')
check.head()