In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the two Kaggle Titanic CSV files.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
# Stack the test rows under the training rows so both get identical cleaning
# and encoding. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent. The original row labels are kept on purpose (no
# ignore_index): later cells line the test rows up with freshly built
# prediction frames by index.
train = pd.concat([train, test])

In [None]:
# Show the combined frame (training rows followed by the appended test rows).
train

# Exploratory Analysis

In [None]:
# Column dtypes and non-null counts for the combined frame.
train.info()

In [None]:
import missingno as msno
# missingno bar chart of per-column data completeness for the combined frame.
msno.bar(train, color='darkblue')

* Three columns need engineering: Cabin, Embarked and Age
* We will try to fix them in the best possible way we can 

# Cabin Column 

In [None]:
# Frequency of each recorded cabin label (value_counts skips missing entries by default).
train['Cabin'].value_counts()

In [None]:
# Same counts, but with missing cabins replaced by 0 so they appear as their own bucket.
train['Cabin'].fillna(0).value_counts()

1. The Cabin column has many missing values (687).
2. We will drop this column, as it appears to have no relevance for predicting survivors.

In [None]:
# Discard the Cabin column from the combined frame.
train = train.drop(columns=['Cabin'])

# Embarked Column

In [None]:
# Inspect the rows whose Embarked value is missing.
train[train['Embarked'].isnull()]

In [None]:
# Label missing ports as their own 'Unknown' category.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object, is
# deprecated in recent pandas, and leaves `train` untouched under Copy-on-Write.
train['Embarked'] = train['Embarked'].fillna('Unknown')

* Filling Embarked column with unknown as there are only two rows. 
* We will not drop the column as there are sufficient rows to keep them in our analysis. 

# Age Column

In [None]:
# Inspect the rows whose Age is missing.
train[train['Age'].isnull()]

In [None]:
# Per (Pclass, Sex) averages; the Age column of this table is the source of the
# fill-in ages used by impude_age.
# numeric_only=True is required because the frame still holds text columns at
# this point; since pandas 2.0, mean() raises on them instead of silently
# dropping them.
train.groupby(['Pclass', 'Sex']).mean(numeric_only=True)

In [None]:
def impude_age(cols):
    """Return the passenger's age, filling a missing value from a class/sex table.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Three values in the order (Age, Pclass, Sex). A Series row produced by
        ``DataFrame.apply(axis=1)`` and a plain list/tuple are both accepted.

    Returns
    -------
    The original age when it is present; otherwise the hard-coded age for the
    (Pclass, Sex) pair, falling back to 22 for any pair not in the table.
    """
    # Rows coming from DataFrame.apply(axis=1) are indexed by column label, so
    # cols[0] would be a label lookup falling back to position, which pandas
    # has deprecated. Read Series by position explicitly.
    if hasattr(cols, 'iloc'):
        age, pclass, sex = cols.iloc[0], cols.iloc[1], cols.iloc[2]
    else:
        age, pclass, sex = cols[0], cols[1], cols[2]

    if not pd.isnull(age):
        return age

    # Fill-in ages per (Pclass, Sex); every other combination gets 22.
    fill_ages = {
        (1, 'male'): 41,
        (1, 'female'): 34,
        (2, 'male'): 31,
        (2, 'female'): 29,
        (3, 'male'): 27,
    }
    return fill_ages.get((pclass, sex), 22)

In [None]:
# Row-wise fill: each (Age, Pclass, Sex) triple goes through impude_age,
# which leaves known ages as they are.
age_inputs = train[['Age', 'Pclass', 'Sex']]
train['Age'] = age_inputs.apply(impude_age, axis=1)

* Age had some of the missing values.
* These were filled in using a special formula
* This formula took into account the PClass and Sex Column to define the missing age
* For example, a person with a PClass of 1 and Sex 'male' will be assigned the group's average age of 41
* 41 came from the groupby that we performed earlier right before the formula above

In [None]:
# Re-draw the missingno bar chart after the Cabin drop and the Embarked/Age fills.
msno.bar(train, color='green')

* Checking to see if all the columns are now equal
* Seems like everything is fine here

In [None]:
# Remove the free-text Name and Ticket columns from the combined frame.
train = train.drop(columns=['Name', 'Ticket'])

* Dropping these columns as they do not carry any significance.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each text category with an integer code, one column at a time.
# `le` ends up fitted on the last column handled (Embarked).
for column in ('Sex', 'Embarked'):
    le = LabelEncoder()
    train[column] = le.fit_transform(train[column])

* Changing the datatypes to numeric for Embarked and Sex

In [None]:
# The first 891 rows of the combined frame are the training rows; everything
# after that position is the appended test set.
N_TRAIN_ROWS = 891
test = train.iloc[N_TRAIN_ROWS:].drop(columns='Survived')
train = train.iloc[:N_TRAIN_ROWS]

* Now that the feature engineering is done
* We will split the two sets back again.
* Train and Test sets

In [None]:
# Dtypes and non-null counts of the recovered test rows (Survived removed).
test.info()

In [None]:
# Dtypes and non-null counts of the recovered training rows.
train.info()

# Data Analyzing

In [None]:
# Pairwise correlations between the training columns.
corr_matrix = train.corr()
plt.figure(figsize=(14,8))
# Plot the matrix directly: calling .corr() on it a second time would show how
# the correlation columns correlate with each other, not how the features do.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

# Analyzing Train Set for Outliers

In [None]:
# 2x2 grid of scatter plots against PassengerId, one panel per column,
# filled in row-major order.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
panels = [
    ('Fare', 'darkred'),
    ('Age', 'darkgreen'),
    ('Embarked', 'darkred'),
    ('Parch', 'darkred'),
]
for axis, (column, colour) in zip(ax.flat, panels):
    panel = sns.scatterplot(x='PassengerId', y=column, data=train, ax=axis, color=colour, s=100)
    panel.set_title(f'{column} Outliers', fontsize=20)

plt.tight_layout()

* The outlier analysis shows that the Fare, Embarked and Parch columns have some outliers.
* We will remove the outlier rows in each of these columns so that our analysis is accurate.
* Outliers can strongly influence the final model.
* They tend to drag averages up or down and can change the predictive capabilities of the model.

In [None]:
# Keep rows whose Fare lies in [0th quantile, 95th quantile): the top of the
# range is excluded, the minimum is kept.
fare = train['Fare']
q_low_f, q_hi_f = fare.quantile(0), fare.quantile(0.95)
keep_fare = (fare >= q_low_f) & (fare < q_hi_f)
train_1 = train[keep_fare]

In [None]:
# Keep rows whose Embarked code lies in [0th quantile, 99.99th quantile).
# NOTE(review): Embarked holds LabelEncoder category codes rather than a
# measured quantity, so this filters on code order. Confirm that is the intent.
embarked_codes = train_1['Embarked']
q_low_e, q_hi_e = embarked_codes.quantile(0), embarked_codes.quantile(0.9999)
keep_embarked = (embarked_codes >= q_low_e) & (embarked_codes < q_hi_e)
train_2 = train_1[keep_embarked]

In [None]:
# Keep rows whose Parch lies in [0th quantile, 97th quantile]. Unlike the Fare
# and Embarked filters, the upper bound is inclusive here.
parch = train_2['Parch']
q_low_p, q_hi_p = parch.quantile(0), parch.quantile(0.97)
keep_parch = (parch >= q_low_p) & (parch <= q_hi_p)
train_3 = train_2[keep_parch]

In [None]:
# Same 2x2 scatter grid, now drawn from the filtered frame so the effect of
# the quantile cuts is visible.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
for axis, column in zip(ax.flat, ['Fare', 'Age', 'Embarked', 'Parch']):
    panel = sns.scatterplot(x='PassengerId', y=column, data=train_3, ax=axis, color='darkgreen', s=100)
    panel.set_title(f'{column} Outliers', fontsize=20)

plt.tight_layout()

* As the visuals above show, the outliers have now been removed.
* Each column is dealt with separately to cut out the outliers, which might otherwise create noise in the model.
* The data is now cleaned of extreme values!

In [None]:
# PassengerId is only a row identifier, so strip it from both feature frames.
# `test` itself keeps the column for building the output files later.
train_3 = train_3.drop(columns='PassengerId')
test_1 = test.drop(columns=['PassengerId'])

* We will delete the passengerId column now since we do not need it anymore. 
* We required it before to do the outlier analysis

# Oversampling Analysis for Train Set

In [None]:
# Number of rows per Survived label in the filtered training frame.
train_3['Survived'].value_counts()

* The classes in the Survived column are imbalanced.
* This class imbalance can bias the model.
* This is because we have more labels of one type than of the other.
* To fix this we need to balance the dataset.
* This can be done by oversampling the minority class.

In [None]:
# List the columns left in the filtered training frame.
train_3.columns

In [None]:
from imblearn.over_sampling import SMOTE

# Feature columns fed to the resampler; Survived is the label.
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Fixed seed so the synthetic rows, and everything trained on them, are the
# same on every run. The sampler is named `smote` rather than `os` so it
# cannot clash with the standard-library module of that name.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(train_3[FEATURE_COLUMNS], train_3['Survived'])

In [None]:
# Label counts after resampling.
y_train.value_counts()

* The class imbalance is now resolved through oversampling.
* Both the labels have now equal count of values that will be inserted in the model

# Scaling the data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the [0, 1] scaling on the resampled training features only, then reuse
# the same fitted scaler for the test features.
mn = MinMaxScaler()
X_train_scaled = mn.fit_transform(X_train)
# The test features can still contain a missing Fare at this point.
# MinMaxScaler passes NaN through unchanged, and a NaN input yields a
# meaningless network prediction. Fill it here, before scaling, with the
# training median so that every downstream model sees complete rows.
test_1_filled = test_1.fillna({'Fare': X_train['Fare'].median()})
X_test_scaled = mn.transform(test_1_filled)

* The data is now scaled 
* We are now ready to apply this data to a model

In [None]:
# Wrap the scaled array back into a frame with the original feature names.
# Pass the Index itself: wrapping it in a list makes pandas build a one-level
# MultiIndex, so the columns would be tuples such as ('Pclass',) and no longer
# match the plain string labels used for the scaled test frame.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_scaled

In [None]:
# Wrap the scaled test array back into a frame, reusing the column labels of test_1.
X_test_scaled = pd.DataFrame(X_test_scaled, columns=test_1.columns)
# Display the scaled test features.
X_test_scaled

In [None]:
# Hold out the first 300 rows (by position, no shuffling) for validation and
# train the network on the remainder.
N_VALIDATION_ROWS = 300
val_test_X = X_train_scaled.iloc[:N_VALIDATION_ROWS]
X_train_scaled_2 = X_train_scaled.iloc[N_VALIDATION_ROWS:]
val_test_y = y_train.iloc[:N_VALIDATION_ROWS]
y_train_scaled_2 = y_train.iloc[N_VALIDATION_ROWS:]

#  Predictive Model..Working on it right now

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Small feed-forward binary classifier: 7 -> 3 -> 1 units.
# Dropout layers, a 14-unit layer and an EarlyStopping callback were tried
# earlier and are currently disabled.
hidden_wide = Dense(units=7, activation='relu')
hidden_narrow = Dense(units=3, activation='relu')
# Single sigmoid unit; its output is read as the survival probability.
output_layer = Dense(units=1, kernel_initializer='uniform', activation='sigmoid')
model = Sequential([hidden_wide, hidden_narrow, output_layer])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for a fixed 1000 epochs in batches of 200, evaluating on the held-out
# validation slice after every epoch.
# NOTE(review): no callback is active, so training always runs all epochs. If the
# commented-out early stopping is re-enabled, pass it as a list
# (callbacks=[early_stopping]) and confirm against the Keras fit() documentation.
model.fit(x=X_train_scaled_2, y=y_train_scaled_2, epochs=1000, batch_size=200,
          validation_data=(val_test_X, val_test_y)) #callbacks=early_stopping)

In [None]:
# Plot every metric stored on model.history, one line per metric.
pd.DataFrame(model.history.history).plot()

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability
# at 0.5.
probabilities = model.predict(X_test_scaled)
p = (probabilities > 0.5).astype('int32')
# One unnamed column (label 0) holding the 0/1 class for each test row.
p = pd.DataFrame(p)

In [None]:
# Put the network's class column next to the test rows (aligned on index),
# name it Survived, and keep only the two output columns.
predictions = (
    pd.concat([test, p], axis=1)
    .rename(columns={0: 'Survived'})
    .loc[:, ['PassengerId', 'Survived']]
)
predictions

# Predictive Analysis Machine Learning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

* We can see that the random forest classifier, decision tree and XGBoost have performed nearly the same.
* We will choose one of these, as they have shown good results.

In [None]:
# Replace any missing scaled Fare with the mean of the scaled test fares, then
# confirm via info() that no nulls remain.
scaled_fare = X_test_scaled['Fare']
X_test_scaled['Fare'] = scaled_fare.fillna(scaled_fare.mean())
X_test_scaled.info()

In [None]:
# Feature order used for both fitting and predicting.
FEATURES = ['Pclass', 'Sex', 'Fare', 'Age', 'Parch', 'Embarked', 'SibSp']
X = X_train_scaled[FEATURES]
# Use a 1-D label vector (a one-column DataFrame triggers sklearn's
# column-vector warning) and cast it to int: Survived turns into float once the
# label-less test rows are stacked under the training rows, and float labels
# would give float predictions (0.0 / 1.0) in the output file.
y = y_train.astype(int)
# Fixed seed so the forest, and therefore the output file, is reproducible.
rfc = RandomForestClassifier(n_estimators=500, max_depth=7, random_state=42)
# Pass plain arrays to both calls so fit and predict see the same unnamed
# features, however each frame's column labels were built.
rfc.fit(X.to_numpy(), y)
predict = rfc.predict(X_test_scaled[FEATURES].to_numpy())

In [None]:
# Pair each test PassengerId with the forest's prediction. The two frames are
# joined on index, so the test rows must carry labels 0..n-1.
passenger_ids = test[['PassengerId']]
survived = pd.DataFrame({'Survived': predict})
prediction = pd.concat([passenger_ids, survived], axis=1)
prediction

In [None]:
# Save the PassengerId/Survived frame to Predictions.csv, omitting the index.
prediction.to_csv('Predictions.csv', index=False)