In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Importing dataset
# Read the competition's train and test CSV files into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
    )
)

In [None]:
# Analyzing the dataset: list the column labels of the training frame.
train_data.columns

In [None]:
# Preview the first five rows of the raw training frame.
train_data.head(n=5)

In [None]:
# Print the summary of the training frame; used below to spot missing values.
train_data.info()

In [None]:
# Missing values in the training set:
# Age has 3000+ null values. We can deal with them by imputing the mean or the median.
# Ticket has 4000+ null values.
# Fare has 100+ null values.
# Cabin has more than 60,000 null values.
# Embarked has 200+ null values.

In [None]:
# Print the summary of the test frame; used below to spot missing values.
test_data.info()

In [None]:
# Missing values in the test set:
# Age has 3000+ null values. We can deal with them by imputing the mean or the median.
# Ticket has 5000+ null values.
# Fare has 100+ null values.
# Cabin has more than 70,000 null values.
# Embarked has 200+ null values.

In [None]:
# Summary statistics of the training frame.
train_data.describe()
# Observation recorded by the author from the output: the mean Age is about 38.

In [None]:
# Check uniqueness in the categorical data: describe only the object-dtype
# columns (count, number of unique values, most frequent value and its frequency).
train_data.describe(include=['O'])

In [None]:
# Name contains duplicates (thousands of them).
# Sex has 2 possible values; male accounts for 56.1% of the passengers.
# Ticket also contains duplicates.
# Cabin contains duplicates because passengers shared cabins.
# Embarked has 3 possible values.

In [None]:
# Look at duplicate names: count how many times each Name value occurs.
train_data['Name'].value_counts()

In [None]:
# The pivots below are only done for columns that have no null values.

# Mean of Survived (i.e. the survival rate) for each Pclass value.
train_data.groupby(['Pclass'])[['Survived']].mean()

In [None]:
# Mean of Survived (i.e. the survival rate) for each SibSp value.
train_data.groupby(['SibSp'])[['Survived']].mean()

In [None]:
# Mean of Survived (i.e. the survival rate) for each Parch value.
train_data.groupby(['Parch'])[['Survived']].mean()

In [None]:
# Combine SibSp and Parch into a single Relatives feature on both frames.
train_data['Relatives'] = train_data['SibSp'].add(train_data['Parch'])
test_data['Relatives'] = test_data['SibSp'].add(test_data['Parch'])

In [None]:
# Preview the training frame, which now includes the Relatives column.
train_data.head(n=5)

In [None]:
# Mean of Survived (i.e. the survival rate) for each Relatives value.
train_data.groupby(['Relatives'])[['Survived']].mean()

In [None]:
# Age histograms (30 bins), one panel per Survived value.
# FacetGrid.map returns the grid itself, so `g` is the same object as before.
g = sns.FacetGrid(data=train_data, col='Survived').map(plt.hist, 'Age', bins=30)
g

In [None]:
# The survival rate is high in the 30-50 age group.
# The death rate is high in the 20-30 age group.
# For infants the picture is unclear: the plot shows both survivors and
# non-survivors, with no obvious difference between the two.

In [None]:
# Age histograms (30 bins) on a grid: one row per Pclass, one column per Survived value.
# FacetGrid.map returns the grid itself, so `f` is the same object as before.
f = sns.FacetGrid(
    data=train_data, row='Pclass', col='Survived', height=3.4
).map(plt.hist, 'Age', bins=30)
f

In [None]:
# This plot shows that in Pclass=3 a large number of passengers aged
# 0-10, 18-30 and 30-50 did not survive.

In [None]:
# Survival rate by Pclass, split by Sex, with one row of plots per Embarked value.
#
# FacetGrid.map calls the plotting function once per facet on that facet's
# subset of rows. Without an explicit `order` / `hue_order`, each facet infers
# its own category order from the rows it sees, so the x positions and the
# Sex colours can differ between facets and no longer match the legend.
# Both orders are therefore fixed once from the full training frame.
pclass_order = sorted(train_data['Pclass'].dropna().unique())
sex_order = sorted(train_data['Sex'].dropna().unique())

k = sns.FacetGrid(train_data, row='Embarked', aspect=1.5)
k.map(sns.pointplot, 'Pclass', 'Survived', 'Sex',
      order=pclass_order, hue_order=sex_order)
k.add_legend()
# Author's observation from the plot: females have the higher survival rate
# at every port of embarkation.

In [None]:
# Frequency of each category in Sex and in Embarked, shown as a 2-tuple.
tuple(train_data[column].value_counts() for column in ('Sex', 'Embarked'))

In [None]:
# Encode the categorical columns as integers and fill missing values in both
# the training and the test frame (the frames are modified in place).
# NOTE: this cell is not re-runnable; mapping an already-encoded column
# turns every value into NaN. Reload the data before running it again.
combined_data = [train_data, test_data]
sex_mapping = {'male': 0, 'female': 1}
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}

# Imputation values are computed on the training frame only and reused for
# the test frame, so both frames are preprocessed identically and the model
# sees test rows filled the same way as the rows it was trained on.
fare_fill = train_data['Fare'].mean()
age_fill = train_data['Age'].mean()

for data in combined_data:
    data['Sex'] = data['Sex'].map(sex_mapping)
    # Missing ports become code 0, which is 'S' in embarked_mapping.
    data['Embarked'] = data['Embarked'].map(embarked_mapping).fillna(0)
    data['Fare'] = data['Fare'].fillna(fare_fill)
    data['Age'] = data['Age'].fillna(age_fill)

In [None]:
# Preview the training frame after encoding and imputation.
train_data.head(n=5)

In [None]:
# Feature matrix: the training frame without the id, the target, the text
# columns, and SibSp/Parch (already combined into Relatives).
non_feature_cols = ['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
X = train_data.drop(columns=non_feature_cols)
# Target vector, copied so it is independent of train_data.
y = train_data['Survived'].copy()

In [None]:
# Preview the first five rows of the feature matrix.
X.iloc[:5]

In [None]:
# Interaction feature: Fare multiplied by the number of relatives aboard,
# added to both the training features and the test frame.
X['Fare_family'] = X['Fare'].mul(X['Relatives'])
test_data['Fare_family'] = test_data['Fare'].mul(test_data['Relatives'])

In [None]:
# Interaction feature: Age multiplied by Pclass, added to both the training
# features and the test frame.
X['Age*Pclass'] = X['Age'].mul(X['Pclass'])
test_data['Age*Pclass'] = test_data['Pclass'].mul(test_data['Age'])

In [None]:
# Preview the feature matrix with the two interaction features added.
X.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic regression baseline, fitted on the training split.
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=500)
# fit() returns the estimator itself, so the rebinding keeps the same object.
log_reg = log_reg.fit(X_train, y_train)

In [None]:
# Logistic regression score on the training split.
log_reg.score(X=X_train, y=y_train)

In [None]:
# Logistic regression score on the held-out split.
log_reg.score(X=X_test, y=y_test)

In [None]:
# Random forest classifier, fitted on the training split.
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's randomness so the scores below are the same
# on every "Restart & Run All"; 42 matches the seed used for the split and
# for the LightGBM model.
rfr = RandomForestClassifier(max_depth=10, random_state=42)
rfr.fit(X_train, y_train)

In [None]:
# Random forest score on the training split.
rfr.score(X=X_train, y=y_train)

In [None]:
# Random forest score on the held-out split.
rfr.score(X=X_test, y=y_test)

In [None]:
# Gradient-boosted trees with LightGBM, fitted on the training split.
import lightgbm as lgb

# NOTE(review): a binary tree of depth 3 has at most 2**3 = 8 leaves, so
# num_leaves=40 presumably never takes effect together with max_depth=3;
# confirm against the LightGBM parameter documentation.
params = dict(
    metric='auc',
    n_estimators=15000,
    objective='binary',
    learning_rate=0.001,
    boosting='gbdt',
    num_leaves=40,
)
lgb_class = lgb.LGBMClassifier(max_depth=3, random_state=42, **params)
lgb_class.fit(X_train, y_train)

In [None]:
# LightGBM score on the training split.
lgb_class.score(X=X_train, y=y_train)

In [None]:
# LightGBM score on the held-out split.
lgb_class.score(X=X_test, y=y_test)

In [None]:
# Preview the competition test frame before preparing it for prediction.
test_data.head(n=5)

In [None]:
# Drop from the test frame the same non-feature columns that were removed
# from the training features (there is no Survived column to drop here).
test_non_feature_cols = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
test_data = test_data.drop(columns=test_non_feature_cols)

In [None]:
# Print the summary of the test frame after the non-feature columns were dropped.
test_data.info()

In [None]:
# Preview the test frame that will be passed to the model.
test_data.head(n=5)

In [None]:
# Predict survival for the test rows with the logistic-regression model.
# The columns are selected by the names (and order) of the training features
# instead of passing the whole frame: this does not depend on test_data
# happening to have the same column layout, and the cell still works when
# re-run after extra columns (e.g. Survived, PassengerId) have been added.
test_pred = log_reg.predict(test_data[X_train.columns])

In [None]:
# Store the predicted labels next to the test rows they belong to.
test_data['Survived'] = test_pred

In [None]:
# Restore the real PassengerId values from the raw test file. The column was
# dropped from test_data earlier, and hard-coding the range 100000..199999
# would fail or mislabel rows if the file had a different size or id range.
test_ids = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
    usecols=['PassengerId'],
)['PassengerId']
# Guard against test_data having been filtered or reordered in length.
assert len(test_ids) == len(test_data), 'test_data no longer matches test.csv row count'
# to_numpy() assigns by position, independent of the frame's index labels.
test_data['PassengerId'] = test_ids.to_numpy()

In [None]:
# Preview the test frame with the Survived and PassengerId columns attached.
test_data.head(n=5)

In [None]:
# Remove the base feature columns, which are not part of the submission.
base_feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked', 'Fare', 'Relatives']
test_data = test_data.drop(columns=base_feature_cols)

In [None]:
# Keep only the two submission columns, with the id first.
cols = ['PassengerId', 'Survived']
test_data = test_data.loc[:, cols]

In [None]:
# Preview the final submission frame.
test_data.head(n=5)

In [None]:
# Write the submission file to the working directory, without the index column.
test_data.to_csv(path_or_buf='tabular_sol4.csv', index=False)