In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## 1. Loading data

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'../input/tabular-playground-series-apr-2021/train.csv',
        r'../input/tabular-playground-series-apr-2021/test.csv',
    )
)

In [None]:
# Show the (rows, columns) shape of the train and test frames.
train.shape, test.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

## 2. Combining Data

In [None]:
# Tag each frame with its origin so the rows can be separated again later,
# then stack both frames into a single one with a fresh 0..n-1 index.
for frame, label in ((train, 'train'), (test, 'test')):
    frame['source'] = label
data = pd.concat((train, test), ignore_index=True)

In [None]:
# Shape of the combined (train + test) frame.
data.shape

In [None]:
# Summarise column dtypes and non-null counts of the combined frame.
data.info()

In [None]:
# Data type of every column in the combined frame.
data.dtypes

In [None]:
# Number of distinct values per column.
data.nunique()

## 3. Data Visualization

In [None]:
# Three side-by-side count plots: overall Survived counts, then Pclass and
# Sex counts split by Survived.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
sns.countplot(x='Survived', data=data, ax=axes[0])
for ax, column in zip(axes[1:], ('Pclass', 'Sex')):
    sns.countplot(x=column, hue='Survived', data=data, ax=ax)
plt.show()

In [None]:
# Three side-by-side count plots of Embarked, SibSp and Parch, each split by
# Survived.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
for ax, column in zip(axes, ('Embarked', 'SibSp', 'Parch')):
    sns.countplot(x=column, hue='Survived', data=data, ax=ax)
plt.show()

## 4. Missing values

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Remove the Name and Ticket columns from the combined frame.
data = data.drop(columns=['Name', 'Ticket'])

In [None]:
# Re-count missing values per column after dropping Name and Ticket.
data.isnull().sum()

In [None]:
# Summary statistics of the Age column.
data['Age'].describe()

In [None]:
# Most frequent Age value(s).
data['Age'].mode()

In [None]:
# Fill missing ages with 23.0 (hard-coded; presumably the mode shown by the
# previous cell — confirm). Assign the result back instead of calling an
# inplace method on `data['Age']`: that is chained assignment and leaves
# `data` unchanged when pandas Copy-on-Write is active.
data['Age'] = data['Age'].fillna(23.0)

In [None]:
# Ages below 1 are multiplied by 100.
# NOTE(review): confirm this is the intended treatment of fractional ages.
fractional_age = data['Age'] < 1
# Round the scaled values so float artefacts (e.g. 0.58 * 100 ->
# 57.99999999999999) do not truncate to the wrong integer in the cast below.
data.loc[fractional_age, 'Age'] = (data.loc[fractional_age, 'Age'] * 100).round()
# `astype` returns a new Series; the original discarded it, so the cast never
# took effect. Assign it back.
data['Age'] = data['Age'].astype(int)

In [None]:
# Most frequent Embarked value(s).
data['Embarked'].mode()

In [None]:
# Fill missing Embarked values with 'S' (hard-coded; presumably the mode shown
# by the previous cell — confirm). Assigning back avoids the chained inplace
# call, which does not modify `data` under pandas Copy-on-Write.
data['Embarked'] = data['Embarked'].fillna('S')

In [None]:
# Fill missing fares with the column mean. Assigning back avoids the chained
# inplace call, which does not modify `data` under pandas Copy-on-Write.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [None]:
def _log_fare(fare):
    """Return the natural log of a strictly positive fare, else 0."""
    if fare > 0:
        return np.log(fare)
    return 0


# Replace every fare by its log-transformed value.
data['Fare'] = data['Fare'].map(_log_fare)

In [None]:
# Most frequent Cabin value(s).
data['Cabin'].mode()

In [None]:
# Fill missing cabins with 'C11139' (hard-coded; presumably the mode shown by
# the previous cell — confirm). Assigning back avoids the chained inplace
# call, which does not modify `data` under pandas Copy-on-Write.
data['Cabin'] = data['Cabin'].fillna('C11139')

In [None]:
# Count the missing values that remain in each column.
data.isna().sum()

In [None]:
# Keep only the first character of each cabin code. The vectorised `.str`
# accessor replaces the per-row lambda and leaves missing values as missing
# instead of raising on non-string entries.
data['Cabin'] = data['Cabin'].str[:1]
data['Cabin'].value_counts()

In [None]:
# Count plot of the Cabin values, split by Survived.
fig, ax = plt.subplots()
sns.countplot(data=data, x='Cabin', hue='Survived', ax=ax)
plt.show()

In [None]:
# Family size = SibSp + Parch + 1.
data['family_size'] = data['SibSp'].add(data['Parch']).add(1)

In [None]:
# Preview the combined frame after the transformations above.
data.head()

In [None]:
# Split the combined frame back into its train and test parts using the
# 'source' tag, dropping the helper column (and, for test, the Survived
# target). `drop` returns new frames, so no `.loc` slice of `data` is mutated
# in place (the original triggered SettingWithCopy on both drops).
train_modified = data.loc[data['source'] == 'train'].drop(columns='source')
test_modified = data.loc[data['source'] == 'test'].drop(columns=['source', 'Survived'])

## 5. Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit ONE encoder on the cabin letters of both frames and reuse it for each.
# Re-fitting on test (as the original did) can map the same letter to a
# different integer whenever the two frames do not hold the same set of letters.
le = LabelEncoder()
le.fit(pd.concat([train_modified['Cabin'], test_modified['Cabin']]))
train_modified['Cabin'] = le.transform(train_modified['Cabin'])
test_modified['Cabin'] = le.transform(test_modified['Cabin'])

In [None]:
# One-hot encode Sex and Embarked in both frames.
train_modified = pd.get_dummies(train_modified, columns=['Sex', 'Embarked'])
test_modified = pd.get_dummies(test_modified, columns=['Sex', 'Embarked'])

# get_dummies only creates columns for categories present in the frame it is
# given, so align test to the training feature columns (everything except the
# target): missing dummies are added as 0, and order matches train.
feature_columns = train_modified.columns.drop('Survived')
test_modified = test_modified.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Separate the target column from the feature matrix.
y = train_modified['Survived']
X = train_modified.drop(columns='Survived')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the labelled rows, stratified on the target. A fixed
# random_state makes the split (and every score computed on it) reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.40,
    random_state=42,
)

## 6. Feature Selection

#### 6.1 Using Mutual Information (MI)

In [None]:
#from sklearn.feature_selection import mutual_info_classif
# determine the mutual information
#mutual_info = mutual_info_classif(X_train, y_train)
#mutual_info

In [None]:

#mutual_info = pd.Series(mutual_info)
#mutual_info.index = X_train.columns
#mutual_info.sort_values(ascending=False)

In [None]:
#let's plot the ordered mutual_info values per feature
#mutual_info.sort_values(ascending=False).plot.bar(figsize=(14, 7))

In [None]:
#from sklearn.feature_selection import SelectKBest
#we Will select the  top 6 important features
#sel_five_cols = SelectKBest(mutual_info_classif, k=6)
#sel_five_cols.fit(X_train, y_train)
#X_train.columns[sel_five_cols.get_support()]

#### 6.2 Using Pearson Correlation

In [None]:
# Pairwise correlation matrix of the training features.
X_train.corr()

In [None]:
# Annotated heatmap of the pairwise feature correlations.
cor = X_train.corr()
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r, ax=ax)
plt.show()

In [None]:
# The following function selects highly correlated features: for every pair
# of columns whose correlation exceeds the threshold, it flags the column
# that appears later in the frame.

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations (``dataset.corr()``) are inspected.
    threshold : float
        A column is flagged when the absolute correlation with any column
        positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns; empty when no pair exceeds the threshold.
    """
    # Use absolute values so strong negative correlations are caught as well
    # (the original compared the signed coefficient and missed them).
    abs_corr = dataset.corr().abs()
    # Keep only the strictly lower triangle: each pair is looked at once and
    # the later column of the pair is the one flagged. NaN compares False.
    exceeds = np.tril(abs_corr.to_numpy() > threshold, k=-1)
    return set(abs_corr.columns[exceeds.any(axis=1)])

In [None]:
# Flag the training features whose correlation exceeds 0.6 and report how
# many were flagged.
corr_features = correlation(dataset=X_train, threshold=0.6)
len(corr_features)

In [None]:
# Display the names of the flagged columns.
corr_features

In [None]:
# Remove the PassengerId column and the highly correlated columns flagged
# above. `DataFrame.drop` returns a new frame, so the result must be assigned
# back — the original discarded it and no column was ever removed.
# `test_modified` gets the same treatment so the frame passed to `predict`
# has exactly the columns the models are trained on.
# errors='ignore' keeps the cell safe to re-run.
cols_to_drop = sorted({'PassengerId', *corr_features})
X_train = X_train.drop(columns=cols_to_drop, errors='ignore')
X_test = X_test.drop(columns=cols_to_drop, errors='ignore')
test_modified = test_modified.drop(columns=cols_to_drop, errors='ignore')
X_train.shape, X_test.shape, test_modified.shape
#X_train.drop(['Parch','SibSp'],axis=1)
#X_test.drop(['Parch','SibSp'],axis=1)

## 7. Model Building and Evaluation

#### 7.1 XGBOOST

In [None]:
from xgboost import XGBClassifier

# Two arguments of the original constructor call were removed:
#   * `nround` is not a parameter of the scikit-learn wrapper (the number of
#     boosting rounds is `n_estimators`), so it was ignored with a warning.
#   * `early_stopping_rounds` needs a validation set; `fit` is called without
#     `eval_set`, which raises on current xgboost releases and was ignored on
#     older ones.
# All `n_estimators` trees are therefore trained.
xgb = XGBClassifier(max_depth=6,
                    n_estimators=441,
                    n_jobs=-1,
                    learning_rate=0.01,
                    objective="binary:logistic",
                    gamma=1)
xgb.fit(X_train, y_train)

In [None]:
# Predict labels for the hold-out split with the XGBoost model.
pred = xgb.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the current predictions on the hold-out split.
accuracy_score(y_test,pred)

In [None]:
# Predict the test set with XGBoost, store the integer labels on `test`, and
# write the PassengerId/Survived pair to 'xgb.csv'.
test_pred = xgb.predict(test_modified)
test['Survived'] = test_pred.astype(int)
submission_1 = test.filter(items=['PassengerId', 'Survived'])
submission_1.to_csv('xgb.csv', index=False)

#### 7.2 LIGHTGBM

In [None]:
from lightgbm import LGBMClassifier

# Train a LightGBM classifier on the training split and predict the hold-out.
lgb_params = dict(
    max_depth=8,
    n_estimators=541,
    n_jobs=1,
    num_leaves=8,
    learning_rate=0.01,
)
lgb = LGBMClassifier(**lgb_params)
lgb.fit(X_train, y_train)
pred = lgb.predict(X_test)

In [None]:
# Accuracy of the current predictions on the hold-out split.
accuracy_score(y_test,pred)

In [None]:
# Predict the test set with LightGBM, store the integer labels on `test`, and
# write the PassengerId/Survived pair to 'lgb.csv'.
test_pred = lgb.predict(test_modified)
test['Survived'] = test_pred.astype(int)
submission_1 = test.filter(items=['PassengerId', 'Survived'])
submission_1.to_csv('lgb.csv', index=False)

#### 7.3 CATBOOST

In [None]:
from catboost import CatBoostClassifier

# Train a CatBoost classifier on the training split.
cat_params = dict(n_estimators=350, learning_rate=0.01, max_depth=6)
cat = CatBoostClassifier(**cat_params)
cat.fit(X_train, y_train)

In [None]:
# Predict the hold-out split with CatBoost and report the accuracy.
pred = cat.predict(X_test)
accuracy_score(y_test,pred)

In [None]:
# Predict the test set with CatBoost and store the raw predictions on `test`.
# The submission frame carries the predictions thresholded at 0.5 and is
# written to 'cat.csv'.
test_pred = cat.predict(test_modified)
test['Survived'] = test_pred
submission_1 = test[['PassengerId', 'Survived']].assign(
    Survived=np.where(test_pred > 0.5, 1, 0)
)
submission_1.to_csv('cat.csv', index=False)

#### 7.4 RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests draw bootstrap samples and feature subsets at random; a
# fixed random_state makes the fitted model and its score reproducible
# across kernel restarts.
rf = RandomForestClassifier(n_estimators=1000,
                            max_leaf_nodes=30,
                            random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Predict the hold-out split with the random forest and report the accuracy.
pred = rf.predict(X_test)
accuracy_score(y_test,pred)

In [None]:
# Predict the test set with the random forest, store the integer labels on
# `test`, and write the PassengerId/Survived pair to 'random_forest.csv'.
test_pred = rf.predict(test_modified)
test['Survived'] = test_pred.astype(int)
submission_1 = test.filter(items=['PassengerId', 'Survived'])
submission_1.to_csv('random_forest.csv', index=False)

#### 7.5 EXTRA TREE CLASSIFIER

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees pick split thresholds at random; a fixed random_state makes the
# fitted model and its score reproducible across kernel restarts.
tree = ExtraTreesClassifier(n_estimators=900,
                            max_depth=14,
                            min_samples_leaf=40,
                            n_jobs=1,
                            random_state=42)
tree.fit(X_train, y_train)

In [None]:
# Predict the hold-out split with the extra-trees model and report the accuracy.
pred = tree.predict(X_test)
accuracy_score(y_test,pred)

In [None]:
# Predict the test set with the extra-trees model, store the integer labels
# on `test`, and write the PassengerId/Survived pair to 'extra_tree.csv'.
test_pred = tree.predict(test_modified)
test['Survived'] = test_pred.astype(int)
submission_1 = test.filter(items=['PassengerId', 'Survived'])
submission_1.to_csv('extra_tree.csv', index=False)