In [None]:
import numpy             as np 
import pandas            as pd 
import matplotlib.pyplot as plt
import seaborn           as sns 
%matplotlib inline 

from sklearn.preprocessing   import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics         import roc_curve
from sklearn.ensemble        import RandomForestClassifier
import os
# Print the full path of every file found below the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

* ##### If you are at the intermediate level & trying to hone your skills in algorithms such as Random Forest & Logistic Regression, this notebook is for you.
* #####  I will be using Logistic Regression and Random Forest first, and then use ensembles to improve the accuracy further.

**Topics that we will cover in this notebook are:-**
1. Data cleaning & preprocessing by creating dummies and scaling.
2. Model building using Logistic Regression, Random Forest & ensembles.
3. Using RFE (Recursive Feature Elimination) for feature elimination.
4. VIF (Variance Inflation Factor) to detect multicollinearity.
5. Automatic hyperparameter tuning using scikit-learn.
6. Model evaluation using accuracy, F1 score, recall and precision.

#### 1. DATA IMPORTING & ANALYZING

In [None]:
# Load the competition train/test splits from the Kaggle input directory
df_train = pd.read_csv(r'../input/tabular-playground-series-apr-2021/train.csv')
df_test  = pd.read_csv(r'../input/tabular-playground-series-apr-2021/test.csv')
# Keep an untouched copy of the raw test frame: df_test is cleaned and loses its
# 'PassengerId' column later on, but the submission files still need the ids.
# Copying the frame already in memory avoids parsing the same CSV a second time.
test_id  = df_test.copy()

In [None]:
# Inspect column dtypes and non-null counts of the train & test frames.
# DataFrame.info() writes its report itself and returns None, so wrapping the
# call in print() only appended a stray "None" line after each report.
df_train.info()
df_test.info()

#### 2. DATA CLEANING

In [None]:
# Percentage of missing values per column, first for train and then for test
for frame in (df_train, df_test):
    print((frame.isnull().sum() / len(frame)) * 100)

In [None]:
# More than 50% of the 'Cabin' values are missing, so the column is removed
# from both frames instead of being imputed
for frame in (df_train, df_test):
    frame.drop(columns='Cabin', inplace=True)

In [None]:
# The remaining columns with nulls are each missing in less than 5% of the rows,
# so the affected training rows are dropped outright (axis=0 -> row-wise)
df_train.dropna(axis=0,inplace=True)

In [None]:
#Function to impute the null values with most frequent values 
def impute(df):
    """Fill every missing value with the most frequent value of its column.

    The previous implementation rebuilt the frame from the array returned by
    ``SimpleImputer``, which silently replaced the index with a fresh
    ``RangeIndex`` and turned every column into ``object`` dtype. This version
    fills the gaps directly in pandas, so the index and the column dtypes of
    ``df`` survive the imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df`` in which each
        missing entry is replaced by the mode of its column. When several
        values tie for most frequent, the smallest one is used. A column
        without any observed value is returned unchanged.
    """
    modes = df.mode(dropna=True)
    if modes.empty:
        # No rows/columns, or nothing but missing values: nothing to fill with.
        return df.copy()
    # DataFrame.mode() lists tied modes in sorted order, one per row; the first
    # row therefore holds the smallest most-frequent value of every column.
    most_frequent = modes.iloc[0]
    return df.fillna(most_frequent)

In [None]:
# Fill the gaps in the test frame with impute() instead of dropping rows
# (presumably so that every test passenger keeps a row for the submission)
df_test = impute(df_test)

In [None]:
# Re-check the share of missing values per column after cleaning both frames
for frame in (df_train, df_test):
    print((frame.isnull().sum() / len(frame)) * 100)

#### 3. DATA PREPROCESSING 

In [None]:
# Preview the first rows of the cleaned training frame before encoding
df_train.head()

In [None]:
# Binary-encode 'Sex': 1 for 'male', 0 for every other value
df_train['Sex'] = (df_train['Sex'] == 'male').astype('int64')
df_test['Sex']  = (df_test['Sex'] == 'male').astype('int64')

In [None]:
# Count how often each 'Embarked' category occurs in the training data
df_train['Embarked'].value_counts()

In [None]:
# One-hot encode 'Embarked' (it has more than two unique values), dropping the
# first level so the dummies are not redundant. dtype=int keeps the dummies
# numeric: recent pandas versions default to bool, and a frame mixing bool and
# float columns becomes an object array once it is handed to numpy/statsmodels.
train_dummy = pd.get_dummies(df_train['Embarked'], drop_first=True, dtype=int)
# Encode the test set against the training columns, so that both frames always
# end up with identical dummy columns even if a category is absent from (or
# only present in) the test data.
test_dummy  = (pd.get_dummies(df_test['Embarked'], dtype=int)
                 .reindex(columns=train_dummy.columns, fill_value=0))

In [None]:
# Append the dummy columns to each frame column-wise (axis=1);
# pd.concat matches the rows on the index
df_train = pd.concat([df_train,train_dummy],axis=1)
df_test  = pd.concat([df_test,test_dummy],axis=1)

In [None]:
# The original 'Embarked' column is now represented by its dummies - remove it
for frame in (df_train, df_test):
    frame.drop(columns='Embarked', inplace=True)

In [None]:
# 'PassengerId', 'Name' and 'Ticket' are dropped from both frames, as they are
# not expected to have any effect on survival
for frame in (df_train, df_test):
    frame.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [None]:
# Check 'Age' for outliers with a box plot.
# The column is passed as x= explicitly: newer seaborn releases interpret the
# first positional argument as `data` rather than as the variable to plot.
sns.boxplot(x=df_train['Age']);
# We can conclude that there is no outlier.
# Note: the test data is deliberately not analysed for outliers, as it has to be
# treated as hidden.

In [None]:
# Rescale 'Age' and 'Fare' with MinMaxScaler (default feature range)
from sklearn.preprocessing import MinMaxScaler
scaling                = MinMaxScaler()
scaling_col            = ['Age','Fare']
# Learn the per-column minimum/maximum from the training data only ...
df_train[scaling_col]  = scaling.fit_transform(df_train[scaling_col])
# ... and re-use those training statistics to transform the test data
df_test[scaling_col]   = scaling.transform(df_test[scaling_col])

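#### If you are curious to know why we only call `transform` on the test data set & `fit_transform` on the train data set,
visit the link below: [https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe](https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe)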

In [None]:
# Final look at the preprocessed training frame before the analysis
df_train.head()

#### 4. EXPLORATORY DATA ANALYSIS 

In [None]:
# Heatmap of the pairwise correlations between all columns of the training
# frame, with the coefficient annotated in every cell
sns.heatmap(df_train.corr(),annot=True)
# Features which are highly correlated with the target variable are
#  ------>>> Sex, Pclass, S, Fare, Age

In [None]:
# Check whether the target classes are balanced or imbalanced.
# The column is passed as x= explicitly: newer seaborn releases interpret the
# first positional argument as `data` rather than as the variable to count.
sns.countplot(x=df_train['Survived']);

In [None]:
# Share of each 'Survived' class in percent
(df_train['Survived'].value_counts(normalize=True))*100
# The dataset is not perfectly balanced, but we cannot deem it imbalanced either

#### 5. MODEL BUILDING & Evaluation

#### 1. We are ready for the endgame now. I will be using the approach below to build the best model.
#### *     a. Build the best 'Logistic Regression' model, submit the predictions & check the accuracy.
#### *     b. Build the best 'Random Forest' model, submit the predictions & check the accuracy.
#### *     c. Finally, build an 'Ensemble' to check if we can maximize the accuracy.

In [None]:
#Function to check the VIF of the df
def vif_validation(X_train):
    """Return the variance inflation factor (VIF) of every column of ``X_train``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; each column is scored against all the others.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features' (column name) and 'VIF' (rounded to two decimals),
        sorted by 'VIF' in descending order.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    n_features = X_train.shape[1]
    # One VIF per column position of the underlying array
    scores = [variance_inflation_factor(X_train.values, position)
              for position in range(n_features)]

    table = pd.DataFrame()
    table['Features'] = X_train.columns
    table['VIF'] = scores
    table['VIF'] = round(table['VIF'], 2)
    return table.sort_values(by="VIF", ascending=False)

In [None]:
#Function to create a table with pred values for logistic regression 
#Function will return a dataframe with predicted values
def prediction(model_name,x_test,y_test,thres):
    """Score ``x_test`` with a fitted model and tabulate the result next to the truth.

    Parameters
    ----------
    model_name : object
        Fitted model exposing ``predict(x_test)``. Whatever it returns is used
        as the score that is compared against ``thres``.
    x_test : object
        Features, forwarded unchanged to ``model_name.predict``.
    y_test : pandas.Series or sequence
        True labels, one per scored row.
    thres : float
        Cut-off: a row is labelled 1 only when its score is strictly greater
        than ``thres``, otherwise 0.

    Returns
    -------
    pandas.DataFrame
        Columns 'Prob' (model output), 'Survived' (``y_test``) and 'pred'
        (thresholded 0/1 label).
    """
    scores = model_name.predict(x_test)
    if not isinstance(scores, pd.Series):
        # A bare array/list carries no row labels. Give the scores the index of
        # the labels, otherwise the index-aligned 'Survived' assignment below
        # pairs them with the wrong rows (or NaN) whenever y_test does not have
        # a default 0..n-1 index, e.g. after rows were dropped.
        label_index = y_test.index if isinstance(y_test, (pd.Series, pd.DataFrame)) else None
        scores = pd.Series(np.ravel(scores), index=label_index)
    y_pred_final             = pd.DataFrame({'Prob': scores})
    y_pred_final['Survived'] = y_test
    y_pred_final['pred']     = y_pred_final['Prob'].apply(lambda x: 1 if x > thres else 0)
    return y_pred_final

In [None]:
#Function to create a table with pred values for logistic regression 
#Function will return a dataframe with predicted values
def test_prediction(model_name,x_test,thres):
    y_pred                        = model_name.predict(x_test)
    y_pred_final                  = pd.DataFrame({'Prob':y_pred})
    y_pred_final['pred']          = y_pred_final['Prob'].apply(lambda x:1 if x>thres else 0)
    return y_pred_final

In [None]:
#function to test the logistic Regression model 
def validating_lr(y_real,y_pred):
    """Print the confusion matrix and the metrics derived from it.

    Intended for binary labels where the second class (1) is the positive one.

    Parameters
    ----------
    y_real : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_real``.

    Returns
    -------
    None
        Everything is written to standard output: confusion matrix, accuracy,
        sensitivity, specificity, false positive rate and the positive and
        negative predictive values, all in percent.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score
    print('Confusion Matrix')
    # scikit-learn expects (y_true, y_pred): rows are the actual classes and
    # columns the predicted ones. With the arguments the other way round the
    # matrix is transposed, which swaps FP and FN in the indexing below.
    confusion = confusion_matrix(y_real,y_pred)
    print(confusion)
    print('\n')
    print('Accuracy Score',(accuracy_score(y_real,y_pred)*100))
    TP = confusion[1,1] # true positives:  actual 1, predicted 1
    TN = confusion[0,0] # true negatives:  actual 0, predicted 0
    FP = confusion[0,1] # false positives: actual 0, predicted 1
    FN = confusion[1,0] # false negatives: actual 1, predicted 0
    print('\n')
    print('Sensitivity:',(TP / float(TP+FN)*100))
    print('\n')
    print('specificity:',(TN / float(TN+FP)*100))
    print('\n')
    print('false postive rate - predicting 1 when its 0:',(FP/ float(TN+FP)*100))
    print('\n')
    print('Positive predictive value:',(TP / float(TP+FP)*100))
    print('\n')
    print('Negative predictive value:',(TN / float(TN+ FN)*100))

In [None]:
#Function to get the probablities for all possible threshold
def optimum_threshold(y_pred):
    """Add one 0/1 label column per cut-off 0.0, 0.1, ..., 0.9 to ``y_pred``.

    Parameters
    ----------
    y_pred : pandas.DataFrame
        Frame with a 'Prob' column. It is modified in place: for every cut-off
        a column named after the cut-off (a float) is added, holding 1 where
        'Prob' is strictly greater than the cut-off and 0 otherwise.

    Returns
    -------
    pandas.DataFrame
        The same ``y_pred`` object that was passed in.
    """
    for step in range(10):
        cutoff = float(step) / 10
        y_pred[cutoff] = y_pred.Prob.map(lambda prob, limit=cutoff: 1 if prob > limit else 0)
    return y_pred

In [None]:
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
def optimum_accuracy(df,op):
    """Tabulate accuracy, sensitivity and specificity for the cut-offs 0.0 ... 0.9.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the true labels in column ``op`` and one 0/1 prediction
        column per cut-off, named 0.0, 0.1, ..., 0.9.
    op : hashable
        Name of the column with the true labels.

    Returns
    -------
    pandas.DataFrame
        One row per cut-off (also used as the row label) with the columns
        'prob', 'accuracy', 'sensi' and 'speci'.
    """
    from sklearn.metrics import confusion_matrix

    cutoff_df = pd.DataFrame(columns=['prob','accuracy','sensi','speci'])

    # Layout of the confusion matrix used below (rows = actual, columns = predicted):
    #   [0,0] true negatives     [0,1] false positives
    #   [1,0] false negatives    [1,1] true positives
    for cutoff in (0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9):
        matrix     = confusion_matrix(df[op], df[cutoff])
        true_neg   = matrix[0,0]
        false_pos  = matrix[0,1]
        false_neg  = matrix[1,0]
        true_pos   = matrix[1,1]
        n_samples  = sum(sum(matrix))
        accuracy   = (true_neg + true_pos) / n_samples
        speci      = true_neg / (true_neg + false_pos)
        sensi      = true_pos / (false_neg + true_pos)
        cutoff_df.loc[cutoff] = [cutoff, accuracy, sensi, speci]
    return cutoff_df

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
def Roc_plotting(df):
    """Draw accuracy, sensitivity and specificity as lines over the cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'prob' (x axis), 'accuracy', 'sensi' and 'speci'.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    metric_columns = ['accuracy','sensi','speci']
    df.plot.line(x='prob', y=metric_columns)
    plt.show()

In [None]:
# Separate the features from the target: X_train is an independent copy of the
# training frame without 'Survived', y_train is the 'Survived' column itself
X_train = df_train.drop(columns='Survived',axis=1).copy()
y_train = df_train['Survived']

In [None]:
# Build a logistic regression model with statsmodels: a GLM with a binomial
# family. add_constant() adds the intercept column to the feature matrix.
import statsmodels.api as sm
lr    = sm.GLM(y_train,sm.add_constant(X_train),family=sm.families.Binomial())
lr_1  = lr.fit()
print(lr_1.summary())

In [None]:
# Check whether multicollinearity exists among the features (VIF per column)
vif_validation(X_train)

#### 1. We have two situations here: first, the column 'Pclass' has a high VIF & a low p-value.
#### 2. The second column is 'Age', with a high p-value & a low VIF.
#### 3. We will eliminate the column 'Pclass' as it has a high VIF & it is also redundant, since the 'Fare' column is already present in the training dataset.

In [None]:
# Drop 'Pclass' from the training features and from the test frame, so that
# both keep the same set of columns
X_train.drop(columns='Pclass',axis=1,inplace=True)
df_test.drop(columns='Pclass',axis=1,inplace=True)

In [None]:
# Refit the binomial GLM on the reduced feature set (without 'Pclass')
lr    = sm.GLM(y_train,sm.add_constant(X_train),family=sm.families.Binomial())
lr_2  = lr.fit()
print(lr_2.summary())

In [None]:
# Re-check the VIFs after removing 'Pclass'
vif_validation(X_train)

##### * 1. As we can observe, the multicollinearity has been reduced significantly & the p-values of all the features are low, so they are all significant.*

In [None]:
# Score the training data with lr_2 and label it at a cut-off of 0.5
y_pred = prediction(lr_2,sm.add_constant(X_train),y_train,0.5)
y_pred.head()

In [None]:
# Evaluate the thresholded predictions against the true training labels
validating_lr(y_train,y_pred['pred'])

In [None]:
# Add one 0/1 prediction column per cut-off (0.0 ... 0.9) to the table
y_pred = optimum_threshold(y_pred)
y_pred.head()

In [None]:
# Build the table of accuracy, sensitivity & specificity per cut-off, used to
# choose the threshold that maximizes the accuracy
cutoff_df = optimum_accuracy(y_pred,'Survived')
cutoff_df

In [None]:
# Plot accuracy, sensitivity & specificity against the cut-off
Roc_plotting(cutoff_df)

In [None]:
# Re-score the training data and re-evaluate the model.
# NOTE(review): the stated intent was to try a threshold somewhere around
# 0.45-0.48, but the call below still passes 0.5, so this cell repeats the
# earlier evaluation - confirm which threshold was meant.
y_pred = prediction(lr_2,sm.add_constant(X_train),y_train,0.5)
# A bare expression in the middle of a cell is not displayed; only the report
# printed by validating_lr() shows up as output.
y_pred.head()
# Evaluate the thresholded predictions against the true training labels
validating_lr(y_train,y_pred['pred'])

In [None]:
# Add the intercept column to the test features and preview the result
df_test_sm      = sm.add_constant(df_test)
df_test_sm.head()

In [None]:
# Make sure the 'Parch' and 'SibSp' columns are plain float64 before the test
# frame is handed to the model for prediction
df_test[['Parch','SibSp']] = df_test[['Parch','SibSp']].astype('float64')

In [None]:
# Predict the test set with lr_2 (cut-off 0.5) and write the passenger ids
# together with the predicted labels to submission_1.csv
df_test_sm      = sm.add_constant(df_test)
lr_2_test_pred  = test_prediction(lr_2,df_test_sm,0.5)
# NOTE(review): the output header is spelled 'PassengerID' while the source
# column is 'PassengerId' - confirm which spelling the competition expects.
submission_file = pd.DataFrame({'PassengerID':test_id['PassengerId'],'Survived':lr_2_test_pred['pred']})
submission_file.to_csv('submission_1.csv', index=False)

* #### As of now, lr_2 is the best model we have, with about 76% train accuracy. After submitting, the test accuracy is around 79.38%. I am not sure how the test accuracy can be higher than the train accuracy, but let's move on to build a Random Forest model to further improve the accuracy.

In [None]:
# Fit a single decision tree first, as a baseline before the random forest
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=5,min_samples_split=150,min_samples_leaf=150)
dt.fit(X_train, y_train)

In [None]:
# Accuracy and confusion matrix of the decision tree on the training data
from sklearn.metrics import confusion_matrix, accuracy_score
y_train_pred = dt.predict(X_train)
print(accuracy_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)

# The accuracy is almost the same as before, so we will do some hyperparameter tuning

In [None]:
# Tune the decision tree hyperparameters with an exhaustive grid search
from sklearn.model_selection import GridSearchCV
dt = DecisionTreeClassifier(random_state=42)
# Candidate values; every combination of them is evaluated
params = {
    'max_depth': [2, 3, 4, 5, 6, 8, 10],
    'min_samples_leaf': [30,50,100,150,200,250,300],
    'min_samples_split': [30,50,100,150,200,250,300],
    'criterion': ["gini", "entropy"]
}
# 4-fold cross-validation, scored by accuracy, using all available cores
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")
grid_search.fit(X_train, y_train)

In [None]:
# Show the five parameter combinations with the best mean cross-validated score
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.nlargest(5,"mean_test_score")

In [None]:
# Display the estimator refitted with the best parameter combination
grid_search.best_estimator_

In [None]:
# Retrain the decision tree with the chosen hyperparameters.
# NOTE(review): the values are hard-coded, presumably copied from an earlier
# grid-search run - verify that they still match grid_search.best_estimator_.
dt = DecisionTreeClassifier(max_depth=10,min_samples_split=250,min_samples_leaf=100,criterion='entropy',random_state=42)
dt.fit(X_train, y_train)
y_train_pred = dt.predict(X_train)
print(accuracy_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)
# Our accuracy has slightly increased

In [None]:
# Predict the test set with the decision tree and write the passenger ids
# together with the predicted labels to submission_2.csv
dt_test_pred    = dt.predict(df_test)
submission_file = pd.DataFrame({'PassengerID':test_id['PassengerId'],'Survived':dt_test_pred})
submission_file.to_csv('submission_2.csv', index=False)

#### The accuracy using the DecisionTree model is only 77.58%, so let's try to build a RandomForest model.

In [None]:
# Fit a first random forest; oob_score=True makes the out-of-bag accuracy
# estimate available as rf.oob_score_
rf = RandomForestClassifier(n_estimators=30, max_depth=10, max_features=5, random_state=100, oob_score=True)
rf.fit(X_train, y_train)
rf.oob_score_

In [None]:
# Hyperparameter tuning for the random forest: exhaustive grid search with
# 4-fold cross-validation, scored by accuracy
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
params = {
    'max_depth'        : [5,10,20],
    'min_samples_leaf' : [50,100,150,200,250,300],
    'min_samples_split': [100,150,200,250,300],
    'n_estimators'     : [10, 25, 50, 100]
}
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="accuracy")
grid_search.fit(X_train, y_train)

In [None]:
# Display the random forest refitted with the best parameter combination
grid_search.best_estimator_

In [None]:
# Refit the random forest with the chosen hyperparameters and report its
# out-of-bag score.
# NOTE(review): the values are hard-coded, presumably copied from an earlier
# grid-search run - verify that they still match grid_search.best_estimator_.
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=250,
                       n_estimators=50, n_jobs=-1, random_state=42,oob_score=True)
rf.fit(X_train, y_train)
rf.oob_score_

In [None]:
# Accuracy and confusion matrix of the random forest on the training data
y_train_pred = rf.predict(X_train)
print(accuracy_score(y_train, y_train_pred))
confusion_matrix(y_train, y_train_pred)
# We can observe a slight improvement in the overall accuracy of the model

In [None]:
# Predict the test set with the random forest and write the passenger ids
# together with the predicted labels to submission_3.csv (the test accuracy
# itself is only known after submitting the file)
rf_test_pred         = rf.predict(df_test)
submission_file      = pd.DataFrame({'PassengerID':test_id['PassengerId'],'Survived':rf_test_pred})
submission_file.to_csv('submission_3.csv', index=False)

* #### Now that we have used RandomForest as well and did not see any significant improvement in the model accuracy, let's use ensembles to predict the survival rate.

In [None]:
#We will create an ensemble using Logistic Regression & RandomForest (Decision Tree)
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree         import DecisionTreeClassifier
from sklearn.ensemble     import StackingClassifier
from sklearn.metrics      import r2_score,accuracy_score

In [None]:
# Base models of the stack: a logistic regression and a random forest.
# Note that the entry labelled 'dt' holds a RandomForestClassifier, not a
# single decision tree.
estimators = [
    ('lr', LogisticRegression()),
    ('dt', RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=250,
                       n_estimators=50, n_jobs=-1, random_state=42,oob_score=True))
]

In [None]:
# Stack the base models (StackingClassifier with its default final estimator)
# and fit the ensemble on the training data
stack_reg = StackingClassifier(estimators=estimators)
stack_reg.fit(X_train, y_train)

In [None]:
# Accuracy of the stacked model on the training data
y_train_pred = stack_reg.predict(X_train)
accuracy_score(y_train,y_train_pred)

In [None]:
# Predict the test set with the stacked model and write the passenger ids
# together with the predicted labels to submission_4.csv.
# Despite its name, rf_test_pred holds the predictions of the stack here.
rf_test_pred         = stack_reg.predict(df_test)
submission_file      = pd.DataFrame({'PassengerID':test_id['PassengerId'],'Survived':rf_test_pred})
submission_file.to_csv('submission_4.csv', index=False)

* #### Here we are getting a test accuracy of 77.69%.
* #### We will also try KNN for classification.

In [None]:
# Fit a k-nearest-neighbours classifier; the positional argument 4 is the
# number of neighbours
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(4)
Knn.fit(X_train,y_train)

In [None]:
# Accuracy of the KNN model on the training data
y_train_pred = Knn.predict(X_train)
accuracy_score(y_train,y_train_pred)

In [None]:
# Predict the test set with the KNN model and write the passenger ids
# together with the predicted labels to submission_5.csv.
# Despite its name, rf_test_pred holds the KNN predictions here.
rf_test_pred         = Knn.predict(df_test)
submission_file      = pd.DataFrame({'PassengerID':test_id['PassengerId'],'Survived':rf_test_pred})
submission_file.to_csv('submission_5.csv', index=False)

#### 1. Let's build an ensemble using LogisticRegression, KNN, RandomForest algorithm

In [None]:
# Base models of the second stack: logistic regression, random forest and KNN.
# Note that the entry labelled 'dt' holds a RandomForestClassifier, not a
# single decision tree.
estimators = [
    ('lr', LogisticRegression()),
    ('dt', RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=250,
                       n_estimators=50, n_jobs=-1, random_state=42,oob_score=True)),
    ('Knn',KNeighborsClassifier(4))
]

In [None]:
# Stack the three base models (StackingClassifier with its default final
# estimator) and fit the ensemble on the training data
stack_reg = StackingClassifier(estimators=estimators)
stack_reg.fit(X_train, y_train)

In [None]:
# Accuracy of the three-model stack on the training data
y_train_pred = stack_reg.predict(X_train)
accuracy_score(y_train,y_train_pred)

In [None]:
# Predict the test set with the three-model stack and write the passenger ids
# together with the predicted labels to submission_6.csv.
# Despite its name, rf_test_pred holds the predictions of the stack here.
rf_test_pred         = stack_reg.predict(df_test)
submission_file      = pd.DataFrame({'PassengerID':test_id['PassengerId'],'Survived':rf_test_pred})
submission_file.to_csv('submission_6.csv', index=False)