In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform as sp_uniform

## Reading Training and Testing data

In [None]:
# Location of the competition's training split.
TRAIN_CSV = '../input/tabular-playground-series-apr-2021/train.csv'

# Load the training rows and preview the first few.
train = pd.read_csv(TRAIN_CSV)
train.head()

## Making a copy of training dataset

In [None]:
# Keep an untouched deep copy of the training frame; `train` itself is
# modified further down (its 'Survived' column is removed).
train_copy = train.copy(deep=True)

In [None]:
# Location of the competition's test split.
TEST_CSV = '../input/tabular-playground-series-apr-2021/test.csv'

# Load the test rows and preview the first few.
test = pd.read_csv(TEST_CSV)
test.head()

## Dropping Survived feature from train dataset

In [None]:
# Build the feature-only training frame from the pristine copy instead of
# dropping in place: re-running this cell no longer raises a KeyError because
# 'Survived' is always still present in `train_copy`.
train = train_copy.drop(columns='Survived')

## Combining the train and test datasets

In [None]:
# Stack the train rows on top of the test rows with a fresh 0..n-1 index so
# that both splits go through the same preprocessing steps.
data = pd.concat((train, test), axis=0, ignore_index=True)
data.head()

## Checking the shape of the dataset

In [None]:
# (number of rows, number of columns) of the combined train + test frame.
data.shape

## Checking the null values from the dataset

In [None]:
# Count the missing entries in every column of the combined frame.
data.isna().sum()

## Imputing values in features (Age, Ticket, Fare, Cabin and Embarked)

In [None]:
# Average age over all non-missing entries of the combined frame.
age_column = data['Age']
mean_age = age_column.mean()
mean_age

In [None]:
# Replace missing ages with the mean age computed above.
data = data.fillna({'Age': mean_age})

In [None]:
# Average fare over all non-missing entries of the combined frame.
fare_column = data['Fare']
mean_fare = fare_column.mean()
mean_fare

In [None]:
# Replace missing fares with the mean fare computed above.
data = data.fillna({'Fare': mean_fare})

In [None]:
# Missing ticket and cabin values both get the placeholder string 'X'.
data = data.fillna({'Ticket': 'X', 'Cabin': 'X'})

In [None]:
# Most frequent embarkation value; `.mode()` may return several tied values,
# so take the first one.
embarked_modes = data['Embarked'].mode()
mode_embarked = embarked_modes.iloc[0]
mode_embarked

In [None]:
# Replace missing embarkation values with the most frequent one.
data = data.fillna({'Embarked': mode_embarked})

## Splitting the Name column into two with First Name and Last Name

In [None]:
# Split each name at its first comma only (n=1) so the result always has at
# most two parts; an unbounded split fails with a column-count mismatch as
# soon as any name contains a second comma.
# NOTE(review): if names are formatted "Surname, Given", the part stored as
# 'First Name' is actually the surname and 'Last Name' the given name — confirm.
data[['First Name', 'Last Name']] = data['Name'].str.split(",", n=1, expand=True)

## Dropping the First Name and Name columns

In [None]:
# The raw name and the first split part are not used as features.
unused_name_columns = ['Name', 'First Name']
data = data.drop(columns=unused_name_columns)

In [None]:
# Preview the frame after 'Name' and 'First Name' have been removed.
data.head()

## Dropping Passenger ID feature

In [None]:
# Remove the 'PassengerId' column from the feature frame.
data = data.drop(columns='PassengerId')

In [None]:
# Preview the frame after 'PassengerId' has been removed.
data.head()

## Changing Pclass feature type from integer to object

In [None]:
# Store 'Pclass' with object dtype before it is encoded below.
data = data.astype({'Pclass': 'object'})

## Transforming the features

In [None]:
# Encoder instances shared by the transformation cells below.
ordinal, label = OrdinalEncoder(), LabelEncoder()

In [None]:
# OrdinalEncoder expects a 2-D input and returns a 2-D array with one column;
# flatten it before writing the codes back into 'Pclass'.
pclass_codes = ordinal.fit_transform(data[['Pclass']])
data['Pclass'] = pclass_codes.ravel()

In [None]:
# Replace each string column with integer codes. The single `label` encoder is
# refit for every column, so after the loop it holds the classes of 'Ticket'.
for column in ('Cabin', 'Last Name', 'Ticket'):
    data[column] = label.fit_transform(data[column])

In [None]:
# One-hot encode the two remaining categorical columns.
categorical_columns = ['Sex', 'Embarked']
data_dummies = pd.get_dummies(data[categorical_columns])

In [None]:
# Append the indicator columns to the right of the existing features.
data = pd.concat((data, data_dummies), axis='columns')
data.head()

In [None]:
# Remove the original categorical columns from the frame.
data = data.drop(columns=['Sex', 'Embarked'])

## Separating training and testing dataset

In [None]:
# `data` was built as train rows followed by test rows, so the split point is
# the number of training rows. Deriving it from `train` avoids a hard-coded
# row count that silently mis-splits if the input size ever differs.
n_train = len(train)
train_data = data.iloc[:n_train]
test_data = data.iloc[n_train:]

## Appending Survived feature to training dataset

In [None]:
# Attach the target by column name. Unlike `pd.concat(..., axis=1)`, `assign`
# overwrites an existing 'Survived' column, so re-running this cell does not
# add a duplicate target column. Rows are matched on the index.
train_data = train_data.assign(Survived=train_copy['Survived'])
train_data.head()

## Separating independent and dependent features

In [None]:
# Select features and target by name rather than by hard-coded column
# position, so the split stays correct if the number of feature columns
# changes. `y` is a 1-D Series: scikit-learn estimators expect a 1-D target
# and warn when given a single-column DataFrame.
X = train_data.drop(columns='Survived')
y = train_data['Survived']

## Splitting the data into train and test from training dataset

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split the same across runs.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

## Creating a pipeline for Logistic regression 

In [None]:
# Standardise the features, then fit a logistic regression.
pipe_log = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ('log', LogisticRegression()),
    ]
)

In [None]:
# Fit the scaler and the logistic regression on the training split.
pipe_log.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the logistic-regression pipeline on the held-out split.
predicted_log = pipe_log.predict(X_test)
score_log = accuracy_score(y_test, predicted_log)

## Creating a pipeline for AdaBoost Classifier

In [None]:
# Standardise the features, then fit an AdaBoost classifier. The seed is
# pinned (same value as the train/validation split) so repeated runs of the
# notebook produce the same ensemble and the same score.
pipe_ada = Pipeline([
    ('scalar', StandardScaler()),
    ('ada', AdaBoostClassifier(random_state=42)),
])

In [None]:
# Fit the scaler and the AdaBoost classifier on the training split.
pipe_ada.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the AdaBoost pipeline on the held-out split.
predicted_ada = pipe_ada.predict(X_test)
score_ada = accuracy_score(y_test, predicted_ada)

## Creating a pipeline for XGBoost Classifier

In [None]:
# Standardise the features, then fit an XGBoost classifier.
pipe_xgb = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ('xgb', XGBClassifier()),
    ]
)

In [None]:
# Fit the scaler and the XGBoost classifier on the training split.
pipe_xgb.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the XGBoost pipeline on the held-out split.
predicted_xgb = pipe_xgb.predict(X_test)
score_xgb = accuracy_score(y_test, predicted_xgb)

## Creating a pipeline for LightGBM Classifier (LGBMClassifier)

In [None]:
# Standardise the features, then fit a LightGBM classifier.
pipe_lgm = Pipeline(
    steps=[
        ('scalar', StandardScaler()),
        ('lgm', LGBMClassifier()),
    ]
)

In [None]:
# Fit the scaler and the LightGBM classifier on the training split.
pipe_lgm.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the LightGBM pipeline on the held-out split.
predicted_lgm = pipe_lgm.predict(X_test)
score_lgm = accuracy_score(y_test, predicted_lgm)

In [None]:
# Collect the validation accuracies in one table. Each row is labelled with
# its model so the scores can be read without knowing the list order; the
# 'Score' column and its row order are unchanged.
score_df = pd.DataFrame(
    {'Score': [score_log, score_ada, score_xgb, score_lgm]},
    index=pd.Index(
        ['LogisticRegression', 'AdaBoost', 'XGBoost', 'LightGBM'],
        name='Model',
    ),
)
score_df

In [None]:
# Predict labels for the competition test rows with the LightGBM pipeline.
prediction_lgm = pipe_lgm.predict(X=test_data)

In [None]:
# Wrap the predictions in a frame that identifies each row: the column is
# named 'Survived' and the index carries the matching 'PassengerId' from the
# test file. Previously the frame had an anonymous `0` column and a bare row
# counter, so the saved file could not be matched back to passengers.
# `np.asarray(...).ravel()` keeps the cell re-runnable after `prediction_lgm`
# has already been turned into a single-column frame.
prediction_lgm = pd.DataFrame(
    {'Survived': np.asarray(prediction_lgm).ravel()},
    index=pd.Index(test['PassengerId'], name='PassengerId'),
)

In [None]:
# Write the predictions to the working directory. `index=True` (the default)
# stores the frame's index as the first CSV column.
prediction_lgm.to_csv('prediction_lgm.csv', index=True)