In [None]:
''' Various Imports'''
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense

from sklearn import metrics
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold,GridSearchCV,KFold

import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
import seaborn as sns

warnings.filterwarnings(action="ignore")

In [None]:
''' Importing Datasets'''
# Both CSVs are re-ordered to the same column layout; only the training file
# carries the 'Survived' target.
feature_columns = ['PassengerId', 'Name', 'Sex', 'Age', 'Pclass', 'Cabin', 'Ticket', 'Fare',
                   'SibSp', 'Parch', 'Embarked']

train_data = pd.read_csv('train.csv')[feature_columns + ['Survived']]
test_data = pd.read_csv('test.csv')[feature_columns]

In [None]:
# Sanity check after loading: shape of the training frame, row counts of both
# frames, and the first training rows.
print(train_data.shape)
print(len(train_data),len(test_data))
train_data.head()

In [None]:
# Column order of the training frame after the re-ordering done at load time.
train_data.columns

In [None]:
# Shape and first rows of the test frame (it has no 'Survived' column).
print(test_data.shape)
test_data.head()

In [None]:
''' Combining train and test data sets for data processing steps '''

# Stack the shared feature columns ('Sex' through 'Embarked') of both sets so the
# imputation, binning and encoding steps are applied identically to train and test.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it the
# test rows reuse the training labels, which makes label-based (.loc) access ambiguous.
titanic_data = pd.concat(
    (train_data.loc[:, 'Sex':'Embarked'], test_data.loc[:, 'Sex':'Embarked']),
    ignore_index=True,
)
# 'Ticket' and 'Cabin' are not used by any later step in this notebook.
titanic_data = titanic_data.drop(columns=['Ticket', 'Cabin'])
print(titanic_data.shape)
titanic_data.head()

In [None]:
# Summary statistics of the combined frame.
titanic_data.describe()

In [None]:
# Dtypes and non-null counts per column of the combined frame.
titanic_data.info()

In [None]:
print("Number of unique values in each column")
# Cardinality per column, keyed by column name.
{column: titanic_data[column].nunique() for column in titanic_data.columns}

In [None]:
print("Percentage of missing values in each column")
# Share of null entries per column, expressed in percent of all rows.
total_rows = len(titanic_data)
{column: 100*titanic_data[column].isnull().sum()/total_rows for column in titanic_data.columns}

In [None]:
# First ten rows before imputation, for comparison with the printout after it.
print(titanic_data.head(10))

In [None]:
''' Imputing missing values '''
# Mean-impute 'Age' and 'Fare' (the columns at positions 1 and 3 of titanic_data),
# selected by name so the step does not depend on column order.
mean_imputed_columns = ['Age', 'Fare']
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(titanic_data[mean_imputed_columns])
titanic_data[mean_imputed_columns] = imp.transform(titanic_data[mean_imputed_columns])
print(titanic_data[:11])

In [None]:
''' Splitting Age feature values into different groups '''
# Right-closed bands: (0,16] -> 1, (16,32] -> 2, (32,48] -> 3, (48,64] -> 4, (64,100] -> 5
bins = [0, 16, 32, 48, 64, 100]
labels = list(range(1, len(bins)))

age_bands = pd.cut(titanic_data['Age'], bins=bins, labels=labels)
titanic_data['age_group'] = age_bands.astype('int64')
print(titanic_data['age_group'].value_counts())

In [None]:
''' creating new features '''
# Family size = siblings/spouses + parents/children + the passenger themselves.
titanic_data['family_size'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1

In [None]:
# Flag passengers with a family size of one: 1 when alone, 0 otherwise.
travels_alone = titanic_data['family_size'] == 1
titanic_data['solo'] = travels_alone.astype('int64')
print(titanic_data['solo'])

In [None]:
''' Filling Null values '''
# Missing embarkation ports are replaced with the constant 'S'.
titanic_data['Embarked'] = titanic_data['Embarked'].fillna('S')

In [None]:
# 'Age' is kept in titanic_data here; it is dropped later, when the dummy-encoded
# frame is built.
# titanic_data.drop(columns='Age',inplace=True)
titanic_data.head()

In [None]:
sns.set()
# Correlate the numeric columns only. 'Sex' and 'Embarked' are still un-encoded
# here, and pandas >= 2.0 raises on DataFrame.corr() when non-numeric columns are
# present instead of silently dropping them as older versions did.
corr = titanic_data.select_dtypes(include='number').corr()
# 0/1 mask of strongly correlated pairs (correlation >= 0.80).
high_corr = (corr >= 0.80).astype('uint8')
plt.figure(figsize=(5,5))
sns.heatmap(corr,cmap='RdBu_r',annot=True,center=0.0)
plt.show()

In [None]:
print("Percentage of missing values in each column :")
# Same per-column null percentage as before, re-checked after imputation and filling.
total_rows = len(titanic_data)
{column: 100*titanic_data[column].isnull().sum()/total_rows for column in titanic_data.columns}

In [None]:
''' Checking for skewness in features '''
# Every column whose dtype is not 'object' is treated as numeric.
numeric_features = titanic_data.dtypes[titanic_data.dtypes != 'object'].index
# Skewness per numeric column (nulls ignored), largest first.
skewness = (
    titanic_data[numeric_features]
    .apply(lambda column: stats.skew(column.dropna()))
    .to_frame('skewness')
    .sort_values('skewness', ascending=False)
)
skewness.head()

In [None]:
''' Creating new dataframe with dummy vars and required features only '''
# Raw 'Age' is left out of the encoded frame ('age_group' stays); 'Sex' and
# 'Embarked' are one-hot encoded with the first level dropped.
features_without_age = titanic_data.drop(columns='Age')
titanic_data_dummy = pd.get_dummies(features_without_age, columns=['Sex', 'Embarked'], drop_first=True)

In [None]:
# Dtypes and first rows of the encoded frame.
titanic_data_dummy.info()
print(titanic_data_dummy.head(10))

In [None]:
''' Handling of skewed data  '''
# log1p is applied to every column of the encoded frame (not only the skewed ones),
# then 'SibSp' and 'Parch' are removed; 'family_size' was derived from them.
# NOTE(review): this cell is not idempotent - re-running it applies log1p a second
# time and then fails because the dropped columns are already gone.
titanic_data_dummy = np.log1p(titanic_data_dummy).drop(columns=['SibSp', 'Parch'])
print(titanic_data_dummy.head())

In [None]:
# Final dimensions of the feature frame (train and test rows combined).
titanic_data_dummy.shape

In [None]:
# Distribution of the target with a fitted normal curve, plus a Q-Q plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; the
# deprecation warning is hidden by the global warnings filter - consider migrating.
survived = train_data['Survived']
sns.set()
sns.distplot(survived, fit=stats.norm)
qqplot(survived)
plt.show()

In [None]:
# sns.pairplot(titanic_data)

In [None]:
# sns.pairplot(titanic_data_dummy)

In [None]:
''' Creating train and test data sets '''
# The combined frame holds the training rows first, followed by the test rows,
# so it is split back at the training row count.
n_train = len(train_data)
x_train = titanic_data_dummy.iloc[:n_train].values
x_test = titanic_data_dummy.iloc[n_train:].values
y_train = train_data['Survived'].values

In [None]:
# The feature and target row counts for training must match; also show the test row count.
print(len(x_train),len(y_train))
print(len(x_test))

In [None]:
# Parameter values here were updated after a randomized search for the best estimator.
# max_features='sqrt' replaces the former 'auto': for classifiers the two select the
# same setting, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it raises an invalid-parameter error.
classifier_rf = RandomForestClassifier(criterion='gini', 
                                           n_estimators=1100,
                                           max_depth=5,
                                           min_samples_split=4,
                                           min_samples_leaf=5,
                                           max_features='sqrt',
                                           oob_score=True,
                                           random_state=42,
                                           n_jobs=-1,
                                           verbose=1)
classifier_rf.fit(x_train,y_train)

In [None]:
# Search space for tuning the random forest.
# 'auto' is left out of max_features: for classifiers it was an alias of 'sqrt'
# (so it only duplicated a candidate) and scikit-learn >= 1.3 rejects it.
param_grid = {'n_estimators':[100,300,500,1000],'max_features':['sqrt','log2'],'max_depth':[2,4,6,8],
               'min_samples_leaf':[1,2,3,4],'criterion':['gini','entropy']}
cv = KFold(n_splits=10,shuffle=True,random_state=42)
# random_state fixes which parameter combinations are sampled, so re-running the
# notebook reproduces the same search.
param_search = RandomizedSearchCV(estimator=classifier_rf,param_distributions=param_grid,cv=cv,
                                  random_state=42)

In [None]:
# Run the randomized search on the training data.
param_search.fit(x_train,y_train)

In [None]:
# Best estimator found by the search and its mean cross-validated score.
print(param_search.best_estimator_)
param_search.best_score_

In [None]:
# Predictions of the fitted random forest (classifier_rf, not the search result)
# on the training and test features.
y_train_pred, y_test_pred = (classifier_rf.predict(features) for features in (x_train, x_test))
print(y_test_pred[:10])

In [None]:
# Gradient boosting classifier; constructed only, not fitted in this cell.
classifier_gbc = GradientBoostingClassifier(
    n_estimators=240,
    max_depth=4,
    learning_rate=0.08,
    random_state=42,
)

In [None]:
classifier_NB = GaussianNB()
# The model must be fitted here: the next cell calls classifier_NB.predict(), which
# raises NotFittedError on an unfitted estimator and broke Restart & Run All.
classifier_NB.fit(x_train,y_train)

In [None]:
# Naive Bayes predictions on the training and test features.
y_train_pred, y_test_pred = (classifier_NB.predict(features) for features in (x_train, x_test))
print(y_test_pred[:10])

In [None]:
# XGBoost classifier; constructed only. It is not fitted on its own here - it is
# passed to the voting ensemble further down, which is fitted as a whole.
classifier_xgb = XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=3000,
    objective='binary:logistic',
    random_state=42,
)

In [None]:
# LightGBM classifier; constructed only. It is not fitted on its own here - it is
# passed to the voting ensemble further down, which is fitted as a whole.
classifier_lgbm = LGBMClassifier(
    boosting_type='gbdt',
    learning_rate=0.1,
    n_estimators=1000,
    objective='binary',
    random_state=42,
)

In [None]:
# Hard-voting (majority vote) ensemble over the five base classifiers.
base_estimators = [
    ('rf', classifier_rf),
    ('gbc', classifier_gbc),
    ('gnb', classifier_NB),
    ('xgb', classifier_xgb),
    ('lgbm', classifier_lgbm),
]
stack_classifier = VotingClassifier(estimators=base_estimators, voting='hard')
stack_classifier.fit(x_train, y_train)

In [None]:
# Ensemble predictions; these overwrite the y_train_pred / y_test_pred set by the
# earlier single-model cells.
y_train_pred = stack_classifier.predict(x_train)
y_test_pred = stack_classifier.predict(x_test)

In [None]:
# Initialising the Neural Network
model = Sequential()

# Take the input width from the training matrix instead of hard-coding it: the
# former value of 10 no longer matches x_train once 'SibSp' and 'Parch' have been
# dropped from the feature frame, which makes fit() fail on a shape mismatch.
n_features = x_train.shape[1]

# layers: three small ReLU hidden layers, then one sigmoid unit for the binary target
model.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = n_features))
model.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 3, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Train the ANN
model.fit(x_train, y_train, batch_size = 32, epochs = 200)

In [None]:
''' Predicting results '''
# The network outputs one sigmoid value per row; threshold at 0.5 and flatten to a
# 1-D array of 0/1 labels.
y_test_nn = model.predict(x_test)

y_test_pred = (y_test_nn>0.50).astype('int8').reshape(x_test.shape[0])

# Refresh the training predictions as well, as every other model cell does.
# Otherwise the metric cells below score the stale voting-classifier predictions
# while the exported submission comes from the network.
y_train_pred = (model.predict(x_train)>0.50).astype('int8').reshape(x_train.shape[0])

In [None]:
# Show the current test-set predictions.
print(y_test_pred)

In [None]:
''' Checking accuracy '''
# Confusion matrix of the current y_train_pred against the training labels.
cm = metrics.confusion_matrix(y_train, y_train_pred)
print(cm)
# Accuracy = correctly classified samples (main diagonal) over all samples.
print(np.trace(cm)/cm.sum())

In [None]:
# Training-set scores for the current y_train_pred, one line per metric.
training_scorers = (
    ('Accuracy score :', metrics.accuracy_score),
    ('Precision score :', metrics.precision_score),
    ('Recall score :', metrics.recall_score),
    ('F1 score :', metrics.f1_score),
)
for score_label, scorer in training_scorers:
    print(score_label, scorer(y_train, y_train_pred))


In [None]:
''' Exporting results for submission'''
submission_path = "submission_files/sachin_solution_nn.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# One row per test passenger: its id and the current y_test_pred label.
solution = pd.DataFrame({"PassengerId":test_data.PassengerId, "Survived":y_test_pred})
solution.to_csv(submission_path, index = False)
print(solution.head(10))