In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Relative paths to the Titanic train/test CSV files. Despite the `_dir`
# suffix, both names hold file paths, not directories.
train_dir = '../input/titanic/train.csv'
test_dir = '../input/titanic/test.csv'

# EDA

In [None]:
# Load the training data and preview the first rows.
df=pd.read_csv(train_dir)
df.head()

In [None]:
# Remove the identifier-like columns before analysis.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()


In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Absolute pairwise correlations. Only numeric columns are used: df still
# contains string columns here, and correlating those raises on current pandas.
corr_matrix = df.select_dtypes(include='number').corr().abs()
sns.heatmap(corr_matrix)

In [None]:
# Number of distinct values in the target column (NaN would count as one).
df['Survived'].nunique(dropna=False)

## Number of Numerical Features

In [None]:
# Every column whose dtype is not `object` is treated as numerical.
numerical_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']
len(numerical_features)

In [None]:
# Show the names of the numerical columns.
numerical_features

## Discrete Features 

In [None]:
# Numerical columns with fewer than 25 distinct values (NaN counted as a
# value) are treated as discrete.
discrete_feature = [
    column
    for column in numerical_features
    if df[column].nunique(dropna=False) < 25
]
print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
# The remaining numerical columns are treated as continuous.
continuous_feature = list(
    filter(lambda column: column not in discrete_feature, numerical_features)
)
print("Continuous feature Count {}".format(len(continuous_feature)))

In [None]:
# One 25-bin histogram per continuous feature. The column is read straight
# from df; nothing is modified, so no per-iteration copy of the frame is needed.
for feature in continuous_feature:
    df[feature].hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.title(feature)
    plt.show()

In [None]:
# Age histogram coloured by survival outcome; alpha keeps overlapping bars visible.
sns.displot(x='Age', hue='Survived', data=df, alpha=0.6)
plt.show()

In [None]:
# Rows of passengers with Survived == 1, and the density of their ages.
survived = df.loc[df['Survived'] == 1]
sns.displot(survived['Age'], kind='kde')
plt.show()

In [None]:
# Empirical cumulative distribution of age among the `survived` rows,
# with a grid to make quantiles easier to read off.
sns.displot(survived.Age, kind='ecdf')
plt.grid(True)
plt.show()

In [None]:
# Age band edges and their labels (pd.cut intervals are right-inclusive).
ranges = [0, 30, 40, 50, 60, 70, np.inf]
labels = ['0-30', '30-40', '40-50', '50-60', '60-70', '70+']

# Rebuild `survived` from df with Age replaced by its band. Deriving a new
# frame with .assign avoids writing into a slice of df (SettingWithCopyWarning)
# and keeps this cell re-runnable, since it always starts from numeric ages.
survived = df.loc[df['Survived'] == 1].assign(
    Age=lambda frame: pd.cut(frame['Age'], bins=ranges, labels=labels)
)
survived['Age'].head()

In [None]:
# Count of `survived` rows per Age value. The vector is passed by keyword:
# seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(x=survived['Age'])

In [None]:
# Preview the `survived` subset.
survived.head()

In [None]:
# Fare density for the `survived` rows, then for all passengers (one figure
# each). Written as two statements rather than a tuple expression so the cell
# does not echo a tuple of FacetGrid objects.
sns.displot(survived.Fare, kind='kde')
sns.displot(df.Fare, kind='kde')
plt.show()

In [None]:
# Box plot of each continuous feature on a natural-log scale.
# The log is undefined for zero and negative values, so any feature containing
# such a value is skipped entirely. Missing values do not trigger the skip.
for feature in continuous_feature:
    values = df[feature]
    if (values <= 0).any():
        continue
    # Only the single column is transformed; df itself is left untouched.
    log_values = np.log(values).to_frame()
    log_values.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

## Handling Outliers

In [None]:
# Snapshot of the current column names of df, in order.
features = list(df.columns)

In [None]:
# D holds the rows with Survived != 0, H the rows with Survived == 0.
# Both are read as globals by plot_distribution.
D = df.loc[df['Survived'] != 0]
H = df.loc[df['Survived'] == 0]

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
from plotly.subplots import make_subplots
import plotly.express as px

%matplotlib inline

In [None]:
# Columns containing at least one missing value (a column with exactly one
# NaN must be reported too, hence `> 0`).
features_with_na = [column for column in df.columns if df[column].isnull().sum() > 0]
# Print each such column with its share of missing values as a percentage.
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean() * 100, 4), ' % missing values')
features_with_na

**Since about 77% of the Cabin values are NaN, we drop the column.**

In [None]:
# Remove the Cabin column.
df = df.drop(columns=['Cabin'])

### Imputing NaN values with the most frequent category

In [None]:
def impute_nan_most_frequent_category(DataFrame, ColName):
    """Add ``<ColName>_Imputed`` with missing values replaced by the mode.

    The frame is modified in place: a new column named ``ColName + "_Imputed"``
    is added and the original column is left untouched. When several values
    tie for most frequent, the first one returned by ``Series.mode`` is used.
    If the column has no non-missing value at all there is nothing to impute
    with, and the new column is a plain copy of the original.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to add the imputed column to.
    ColName : str
        Name of the column to impute.

    Returns
    -------
    None
    """
    imputed_name = ColName + "_Imputed"
    modes = DataFrame[ColName].mode()

    if modes.empty:
        # Every value is missing: no category to fill with.
        DataFrame[imputed_name] = DataFrame[ColName]
        return

    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # a column selection, which may operate on a temporary copy and leave the
    # frame unchanged.
    DataFrame[imputed_name] = DataFrame[ColName].fillna(modes.iloc[0])
# 2. Impute the most frequent category for each listed column.
for Columns in ['Embarked']:
    impute_nan_most_frequent_category(df, Columns)

# Show original vs. imputed values. display() is required because this is not
# the last expression of the cell, so a bare expression would be discarded.
display(df[['Embarked', 'Embarked_Imputed']].head(10))

# 3. Drop the original column now that the imputed copy exists.
df = df.drop(['Embarked'], axis = 1)

## Visualising Outliers 

In [None]:
def plot_distribution(data_select, size_bin):
    """Plot histogram + KDE of one feature for the two survival groups.

    Reads the module-level frames ``D`` (rows with Survived != 0) and ``H``
    (rows with Survived == 0) and renders a Plotly distplot inline.

    Parameters
    ----------
    data_select : str
        Column name to plot; also used as the figure title.
    size_bin : float
        Histogram bin width passed to ``create_distplot``.
    """
    # Missing values are removed because the KDE cannot be estimated with NaN
    # in the sample.
    hist_data = [D[data_select].dropna(), H[data_select].dropna()]

    # create_distplot requires exactly one label per series in hist_data.
    group_labels = ['Survived', 'Did not survive']
    colors = ['#00FA9A', '#FF69B4']

    fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True,
                             bin_size = size_bin, curve_type='kde')

    fig['layout'].update(title = data_select)

    py.iplot(fig)

In [None]:
def plot_outliers(df, feat):
    """Render four Plotly box plots of ``df[feat]``, one per ``boxpoints`` mode.

    The traces are, in order: all points, whiskers only, suspected outliers
    (highlighted in pink) and whiskers plus outliers. The figure is titled
    ``"<feat> Outliers"`` and shown inline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    feat : str
        Column name.
    """
    column = df[feat]

    def _same_colour(rgb):
        # Marker and line share one colour in three of the four traces.
        return dict(marker=dict(color=rgb), line=dict(color=rgb))

    boxes = [
        go.Box(
            y=column,
            name="All Points",
            jitter=0.3,
            pointpos=-1.8,
            boxpoints='all',
            **_same_colour('rgb(32,178,170)')
        ),
        go.Box(
            y=column,
            name="Only Whiskers",
            boxpoints=False,
            **_same_colour('rgb(0,128,128)')
        ),
        go.Box(
            y=column,
            name="Suspected Outliers",
            boxpoints='suspectedoutliers',
            marker=dict(
                color='rgb(0,250,154)',
                outliercolor='#FF69B4',
                line=dict(outliercolor='#FF69B4', outlierwidth=2),
            ),
            line=dict(color='rgb(0,250,154)'),
        ),
        go.Box(
            y=column,
            name="Whiskers and Outliers",
            boxpoints='outliers',
            **_same_colour('rgb(47,79,79)')
        ),
    ]

    figure = go.Figure(
        data=boxes,
        layout=go.Layout(title="{} Outliers".format(feat)),
    )
    py.iplot(figure)

In [None]:
def plot_all_feature():
    """Plot the distribution and outlier boxes for the first entry of ``features``.

    Uses the module-level ``features`` list and ``df`` frame.

    NOTE(review): despite the name, only the first feature is visited, the
    distribution is requested with a bin size of 0, and the outlier plot for
    that first feature is drawn a second time at the end. Confirm whether
    this was intentional.
    """
    leading = features[:1]
    if leading:
        (first,) = leading
        plot_distribution(first, 0)
        plot_outliers(df, first)
    plot_outliers(df, features[0])

In [None]:
def removeOutliers(df_out, feature, drop=False):
    """Report, and optionally remove, IQR outliers of one column.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile. Quartiles ignore missing values,
    and a missing value is never classified as an outlier.

    Parameters
    ----------
    df_out : pandas.DataFrame
        Input frame; it is never modified.
    feature : str
        Name of the numeric column to inspect.
    drop : bool, default False
        When True, return a new frame without the outlier rows and with a
        fresh 0..n-1 index. When False, return ``df_out`` itself.

    Returns
    -------
    pandas.DataFrame
    """
    valueOfFeature = df_out[feature]

    # nan-aware percentiles: with plain np.percentile a single NaN makes both
    # quartiles NaN, which would flag every row as an outlier.
    Q1 = np.nanpercentile(valueOfFeature, 25.)
    Q3 = np.nanpercentile(valueOfFeature, 75.)

    step = 1.5*(Q3-Q1)

    # Computed once and reused for the report and for the drop.
    is_outlier = (valueOfFeature < Q1 - step) | (valueOfFeature > Q3 + step)
    outliers = valueOfFeature.index[is_outlier].tolist()
    feature_outliers = valueOfFeature[is_outlier].values

    print ("Number of outliers (inc duplicates): {} and outliers: {}".format(len(outliers), feature_outliers))
    if drop:
        # Filter with the boolean mask rather than re-indexing by label, so the
        # result is correct for any index (non-default, non-unique, ...).
        good_data = df_out.loc[~is_outlier].reset_index(drop = True)
        print ("New dataset with removed outliers has {} samples with {} features each.".format(*good_data.shape))
        return good_data
    else:
        print ("Nothing happens, df.shape = ",df_out.shape)
        return df_out

In [None]:
# Drop IQR outliers of the first column recorded in `features`, then plot it.
# NOTE(review): features[0] is presumably the target column 'Survived', and
# df_clean is not read by any later cell visible here (modelling uses `df`).
# Confirm whether outlier removal was meant to feed the model.
df_clean = removeOutliers(df, features[0], True)
plot_outliers(df_clean, features[0])

In [None]:
# Same treatment for the second column recorded in `features`, applied on top
# of the previous result.
df_clean = removeOutliers(df_clean, features[1], True)
plot_outliers(df_clean, features[1])

In [None]:
# Replace missing ages with the median age. The filled column is assigned back:
# fillna(inplace=True) on a column selection may act on a temporary copy and
# leave df unchanged.
df['Age'] = df['Age'].fillna(df['Age'].median())

# MODEL BUILDING

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import  BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Recompute the discrete numerical columns (fewer than 25 distinct values).
discrete_feature = []
for column in numerical_features:
    if len(df[column].unique()) < 25:
        discrete_feature.append(column)
print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
# Recompute the continuous numerical columns (those not classed as discrete).
continuous_feature = []
for column in numerical_features:
    if column not in discrete_feature:
        continuous_feature.append(column)
print("Continuous feature Count {}".format(len(continuous_feature)))

In [None]:
# Refresh both lists from the current df: non-object columns are numerical,
# and numerical columns with fewer than 25 distinct values are discrete.
numerical_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']
discrete_feature = [
    name for name in numerical_features if df[name].nunique(dropna=False) < 25
]
print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
# Numerical columns that were not classed as discrete.
continuous_feature = list(
    filter(lambda name: name not in discrete_feature, numerical_features)
)
print("Continuous feature Count {}".format(len(continuous_feature)))

## Getting Categorical Variables

In [None]:
# Columns stored with the `object` dtype are treated as categorical.
categorical_feature = [name for name, dtype in df.dtypes.items() if dtype == 'O']
categorical_feature

In [None]:
scaler = StandardScaler()

# One-hot encode the categorical columns, dropping the first level of each.
data = pd.get_dummies(df, columns = categorical_feature, drop_first = True)

# Standardise the continuous columns.
# NOTE(review): the scaler is fitted on every row before the train/test split
# in the next cell, so hold-out rows influence the scaling statistics.
data[continuous_feature] = scaler.fit_transform(data[continuous_feature])

# Features and target. The target is kept one-dimensional (a Series), which is
# the shape scikit-learn estimators expect for y.
X = data.drop(['Survived'],axis=1)
y = data['Survived']

In [None]:
# 70/30 hold-out split. A fixed seed makes the split, and every score derived
# from it, reproducible; stratifying keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42, stratify = y
)

In [None]:
# Baseline linear model.
lr = LogisticRegression(random_state=42)

# k-NN with a candidate grid of k = 1..49.
knn = KNeighborsClassifier()
para_knn = dict(n_neighbors=np.arange(1, 50))
grid_knn = GridSearchCV(estimator=knn, param_grid=para_knn, cv=5)

# Decision tree with a grid over split criterion, depth and leaf size.
dt = DecisionTreeClassifier()
para_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 100),
    min_samples_leaf=[1, 2, 4, 5, 10, 20, 30, 40, 80, 100],
)
grid_dt = GridSearchCV(estimator=dt, param_grid=para_dt, cv=5)

# Random forest with a grid over ensemble size and leaf size.
rf = RandomForestClassifier()
params_rf = dict(
    n_estimators=[100, 350, 500],
    min_samples_leaf=[2, 10, 30],
)
grid_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv=5)

# NOTE(review): none of the grid_* searches is fitted in the cells visible here.

In [None]:

# Rebind dt, knn and rf to estimators with fixed hyper-parameters; these
# replace the unconfigured instances created in the previous cell.
# NOTE(review): presumably these values came from a grid search run elsewhere; confirm.
dt = DecisionTreeClassifier(criterion='gini', max_depth=20, min_samples_leaf=5, random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=2, random_state=42)

In [None]:
# (display name, estimator) pairs, evaluated in this order.
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
    ('Random Forest', rf),
]

## Model Performances 

In [None]:
# Hold-out accuracy of each classifier, collected for the comparison chart.
accuracy_list = []
model_name = []
for clf_name, clf in classifiers:

    # Fit on the training split. np.ravel guarantees a 1-D target even when
    # y_train is a single-column frame.
    clf.fit(X_train, np.ravel(y_train))

    # Predict the hold-out split
    y_pred = clf.predict(X_test)

    # accuracy_score takes (y_true, y_pred) in that order
    accuracy = accuracy_score(np.ravel(y_test), y_pred)
    accuracy_list.append(accuracy)
    model_name.append(clf_name)

    print('{:s} : {:.3f}'.format(clf_name, accuracy))

### Adaboost Classifier 

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boost the random forest. The base estimator is passed positionally because
# its keyword name differs between scikit-learn releases (`base_estimator`
# in older ones, `estimator` in newer ones); the first positional slot is the
# same in both.
ada = AdaBoostClassifier(rf, n_estimators=100, random_state=1)

ada.fit(X_train, np.ravel(y_train))

y_pred = ada.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order
accuracy = accuracy_score(np.ravel(y_test), y_pred)
accuracy_list.append(accuracy)
model_name.append('Adaboost')
accuracy

### Finding out Feature Importances

In [None]:
# Feature importances of the fitted random forest, indexed by feature name.
importances = pd.Series(data=rf.feature_importances_,
                        index= X_train.columns)

# Sort ascending so the most important feature ends up at the top of the chart
importances_sorted = importances.sort_values()

# Draw a horizontal bar plot of importances_sorted ('barh'; 'bar' is vertical)
plt.figure(figsize=(10, 10))
importances_sorted.plot(kind='barh',color='orange')
plt.title('Features Importances')
plt.show()

### Light GBM

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

def cross_val(X, y, model, params, folds=5, fit_params=None):
    """Stratified k-fold evaluation of ``model(**params)``.

    For each fold a fresh estimator is built, fitted on the training part with
    the validation part passed as ``eval_set``, and scored by accuracy on that
    same validation part. Per-fold and mean accuracy are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series or single-column pandas.DataFrame
        Target (indexed positionally with ``.iloc``).
    model : callable
        Estimator class or factory, called as ``model(**params)``.
    params : dict
        Keyword arguments for ``model``.
    folds : int, default 5
        Number of stratified folds.
    fit_params : dict, optional
        Extra keyword arguments forwarded to ``fit`` besides ``eval_set``.
        Defaults to ``{'early_stopping_rounds': 100, 'verbose': 400}``.
        NOTE(review): whether ``fit`` accepts these two keywords depends on
        the installed library version; pass e.g. ``{'callbacks': [...]}``
        here if it does not. Verify against the installed release.

    Returns
    -------
    The estimator fitted on the last fold, or None if no fold was produced.
    """
    if fit_params is None:
        fit_params = {'early_stopping_rounds': 100, 'verbose': 400}

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)
    fold_accuracies = []
    alg = None
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        # np.ravel keeps the target 1-D even if y is a single-column frame
        x_train, y_train = X.iloc[train_idx], np.ravel(y.iloc[train_idx])
        x_test, y_test = X.iloc[test_idx], np.ravel(y.iloc[test_idx])

        alg = model(**params)
        # NOTE(review): the fold used as eval_set is also the fold that is
        # scored below, so the reported accuracy may be optimistic.
        alg.fit(x_train, y_train,
                eval_set=[(x_test, y_test)],
                **fit_params)

        pred = alg.predict(x_test)
        accuracy = accuracy_score(y_test, pred)
        fold_accuracies.append(accuracy)
        print(f" accuracy : {accuracy}")
        print("-"*50)

    if fold_accuracies:
        print(f"Mean accuracy over {len(fold_accuracies)} folds: {np.mean(fold_accuracies)}")
    return alg

In [None]:
# Keyword arguments used to construct the LightGBM classifier in cross_val.
# NOTE(review): no seed is set here, and learning_rate is very small relative
# to n_estimators. Presumably tuned elsewhere; confirm the provenance.
lgb_params= {'learning_rate': 0.0001, 
             'n_estimators': 20000, 
             'max_bin': 94,
             'num_leaves': 5, 
             'max_depth': 30, 
             'reg_alpha': 8.457, 
             'reg_lambda': 6.853, 
             'subsample': 0.749}

In [None]:
from lightgbm import LGBMClassifier
# Cross-validate LightGBM on the full (X, y); cross_val returns the estimator
# fitted on the last fold only.
lgb_model = cross_val(X, y, LGBMClassifier, lgb_params)

### XG Boost 

In [None]:
from xgboost import XGBClassifier
# NOTE(review): 'gpu_predictor' / 'gpu_hist' require a GPU runtime and are
# version-dependent spellings; verify against the installed xgboost release.
# eval_metric is 'logloss' because the target is binary ('mlogloss' is the
# multiclass variant).
classifier = XGBClassifier(n_estimators = 10000,predictor = 'gpu_predictor',tree_method = 'gpu_hist',learning_rate = 0.01,max_depth=29,max_leaves = 31,eval_metric = 'logloss',verbosity = 3)
# Fit on the training split only: the next cell scores this model on X_test,
# which is a subset of X, so fitting on all of X would leak the hold-out rows
# into training and inflate the reported accuracy.
classifier.fit(X_train, y_train)


In [None]:
# Hold-out accuracy of the XGBoost model.
# NOTE(review): this is only a genuine hold-out estimate if `classifier` was
# fitted without the X_test rows.
y_pred = classifier.predict(X_test)
# y_test is flattened locally instead of being rebound, so later cells still
# see the original object; accuracy_score takes (y_true, y_pred).
accuracy = accuracy_score(np.ravel(y_test), y_pred)
print("accuracy_score_XGBOOST: ",accuracy)
accuracy_list.append(accuracy)
model_name.append('XGboost')


In [None]:
# Accuracy comparison across models, labelled so the figure stands alone.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(model_name, accuracy_list, color='green', width=0.1)
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Hold-out accuracy by model')
ax.tick_params(axis='x', rotation=30)
plt.show()

# So we see that XGBoost performs the best among all of them

# Submission

In [None]:
# Load the competition test set from the path configured at the top of the
# notebook (test_dir) instead of repeating the literal.
test_df = pd.read_csv(test_dir)
test_df.head()

## Finding the amount of Null Values 

In [None]:
# Test-set columns containing at least one missing value (`> 0`, so that a
# column with a single NaN is reported as well).
features_with_na = [column for column in test_df.columns if test_df[column].isnull().sum() > 0]
# Print each such column with its share of missing values as a percentage.
for feature in features_with_na:
    print(feature, np.round(test_df[feature].isnull().mean() * 100, 4), ' % missing values')
features_with_na

In [None]:
# Remove the Cabin column from the test set as well.
test_df = test_df.drop(columns=['Cabin'])
test_df.head()

In [None]:
# Median-impute the numeric columns that feed the scaler and the model.
# The result is assigned back because fillna(inplace=True) on a column
# selection may act on a temporary copy. Fare is included so that a missing
# fare cannot reach the model as NaN; fillna is a no-op for complete columns.
for column in ['Age', 'Fare']:
    test_df[column] = test_df[column].fillna(test_df[column].median())
test_df.head()

In [None]:
# Keep the ids before the column is dropped; they are needed for the submission file.
passenger_id = test_df['PassengerId']

In [None]:
# Drop the same identifier-like columns that were removed from the training data.
test_df = test_df.drop(columns=['PassengerId', 'Name', 'Ticket'])
# Preview only; echoing the whole frame floods the notebook output.
test_df.head()

In [None]:
# Object-dtype columns of the test set are treated as categorical.
categorical_feature = [name for name, dtype in test_df.dtypes.items() if dtype == 'O']
categorical_feature

In [None]:
# Numerical (non-object) columns of the test set, and the discrete subset
# with fewer than 25 distinct values.
numerical_features = [name for name, dtype in test_df.dtypes.items() if dtype != 'O']
discrete_feature = [
    name for name in numerical_features if test_df[name].nunique(dropna=False) < 25
]
print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
# Numerical test-set columns that were not classed as discrete.
continuous_feature = list(
    filter(lambda name: name not in discrete_feature, numerical_features)
)
print("Continuous feature Count {}".format(len(continuous_feature)))

In [None]:
# Show which test-set columns will be scaled.
continuous_feature

In [None]:
# Preview only; echoing the whole frame floods the notebook output.
test_df.head()

# Preprocessing the Data

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
test_df= pd.get_dummies(test_df, columns = categorical_feature, drop_first = True)

# Scale with the scaler that was already fitted on the training data. Fitting
# a new scaler on the test set would standardise it with different mean and
# variance than the values the model was trained on.
test_df[continuous_feature] = scaler.transform(test_df[continuous_feature])

test_df.head()

In [None]:
# Give the Embarked dummies the names used in training.
test_df = test_df.rename(columns = {'Embarked_Q': 'Embarked_Imputed_Q', 'Embarked_S': 'Embarked_Imputed_S'})
# Select exactly the training features, in training order. This raises a
# KeyError if any expected column is missing, instead of silently predicting
# on misaligned columns.
test_df = test_df[X.columns]
test_df.head()

In [None]:
# Predict survival for the preprocessed test rows with the XGBoost model.
values = classifier.predict(test_df)

In [None]:
# Build the submission in one step. The predictions are a plain array, so they
# are paired with the ids by position rather than through index alignment of
# two separately built frames.
test_submission = pd.DataFrame({'PassengerId': passenger_id, 'Survived': values})
test_submission.head()

In [None]:
# Write the submission to the working directory without the index column.
test_submission.to_csv('submission.csv',index=False)
