In [1]:
# Data Analysing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Metrics
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, precision_score, f1_score, plot_roc_curve

## 1. Importing Data

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
test = pd.read_csv("test.csv")
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
len(test)

418

In [7]:
train.drop("Name", axis = 1, inplace = True)


In [8]:
test.drop("Name", axis = 1, inplace = True)

## 2. Fill Missing Values :

In [9]:
def Fill(df):
    for label, content in df.items():
        if not pd.api.types.is_numeric_dtype(content):
     
            # Turn category to number
            df[label] = pd.Categorical(content).codes + 1
        if pd.api.types.is_numeric_dtype(content):
            if pd.isnull(content).sum():
            
                df[label] = content.fillna(content.mean())
    return df

In [10]:
Fill(train)
Fill(test)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,2,34.50000,0,0,153,7.8292,0,2
1,893,3,1,47.00000,1,0,222,7.0000,0,3
2,894,2,2,62.00000,0,0,74,9.6875,0,2
3,895,3,2,27.00000,0,0,148,8.6625,0,3
4,896,3,1,22.00000,1,1,139,12.2875,0,3
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,2,30.27259,0,0,268,8.0500,0,3
414,1306,1,1,39.00000,0,0,325,108.9000,23,1
415,1307,3,2,38.50000,0,0,347,7.2500,0,3
416,1308,3,2,30.27259,0,0,221,8.0500,0,3


In [11]:
test.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## 3. Splitting Data

In [12]:
x = train.drop("Survived", axis =1)
y = train["Survived"]

In [13]:
x_train,x_valid, y_train, y_valid = train_test_split(x, y, test_size = 0.2)

## 4. Model Fitting

In [14]:
models = {"Logistic Regression" : LogisticRegression(),
         "KNN" : KNeighborsClassifier(),
         "Random Forest" : RandomForestClassifier(),
         "Decision Tree" : DecisionTreeClassifier(),
         "SVC" : SVC(),
         "Discriminant Analysis" : LinearDiscriminantAnalysis()}

def fit_score(models, x_train, x_test, y_train, y_test):
    """
    Fits and evaluate given models 
    """
    np.random.seed(165)
    
    model_score = {}
    
    for name, model in models.items():
        model.fit(x_train, y_train)
        model_score[name]= model.score(x_test, y_test)
    return model_score

In [15]:
fit_score(models = models,
         x_train= x_train,
         x_test = x_valid,
         y_train = y_train,
         y_test = y_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': 0.8324022346368715,
 'KNN': 0.7039106145251397,
 'Random Forest': 0.8715083798882681,
 'Decision Tree': 0.8100558659217877,
 'SVC': 0.776536312849162,
 'Discriminant Analysis': 0.8603351955307262}

## 5. Hyperparameter Tuning
- Logistic Regression
- Random Forest
- Decision Tree

In [16]:
rf = RandomForestClassifier()

rf.fit(x,y)

rf_pre = rf.predict(test)


pred1 = pd.DataFrame()
pred1["PassengerID"] = test["PassengerId"]
pred1["Survived"] = rf_pre

pred1.to_csv("rf_pre.csv", index =False)

In [22]:
rf_grid = {"n_estimators" : np.arange(10,50,10),
          "max_depth" : np.arange(1,32,2),
          "min_samples_split" : np.arange(2,50,2),
          "min_samples_leaf" : np.linspace(0.1, 0.5, 5, endpoint=True),
          "max_features" : list(range(1,30))}

gf_grid = {"n_estimators" : np.arange(10,30,10),
          "max_depth" : np.arange(1,10,5),
          "min_samples_split" : np.arange(2,20,5),
          "min_samples_leaf" : np.linspace(0.1, 0.5, 5, endpoint=True),
          "max_features" : ["auto", "sqrt",10,100]}

In [32]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

rs_model = RandomizedSearchCV(RandomForestClassifier(n_jobs = -1),
                             param_distributions = rf_grid,
                             n_iter = 2,
                             cv = 10,
                             verbose = True)

gs_model = GridSearchCV(RandomForestClassifier(n_jobs = -1,
                                                   random_state = 165),
                             param_grid = gf_grid,
                             cv = 5,
                             verbose = True)

In [24]:
%%time
rs_model.fit(x,y)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
CPU times: total: 234 ms
Wall time: 1.64 s


In [25]:
rs_model.best_params_

{'n_estimators': 10,
 'min_samples_split': 26,
 'min_samples_leaf': 0.2,
 'max_features': 7,
 'max_depth': 5}

In [26]:
ideal_gs = RandomForestClassifier(n_estimators = 4770,
                                   min_samples_split = 328,
                                     min_samples_leaf = 0.1,
                                     max_features = 29,
                                     max_depth = 3,
                                   n_jobs = -1)

In [27]:
rs_model.score(x_valid,y_valid) 

0.8379888268156425

In [32]:
rs_model.best_params_

{'n_estimators': 10,
 'min_samples_split': 326,
 'min_samples_leaf': 0.2,
 'max_features': 16,
 'max_depth': 27}

In [34]:
ideal_gs = RandomForestClassifier(n_estimators = 4770,
                                   min_samples_split = 328,
                                     min_samples_leaf = 0.1,
                                     max_features = 29,
                                     max_depth = 3,
                                   n_jobs = -1)

In [35]:
ideal_gs.fit(x,y)

In [38]:
ideal_rs = RandomForestClassifier(n_estimators = 110,
                                   min_samples_split = 36,
                                     min_samples_leaf = 0.1,
                                     max_features = 3,
                                     max_depth = 30,
                                   n_jobs = -1,
                                   random_state = 165)

In [39]:
ideal_rs.fit(x_train,y_train)

In [40]:
ideal_rs.score(x_valid, y_valid)

0.8379888268156425

In [41]:
rs_model.best_params_

{'n_estimators': 10,
 'min_samples_split': 326,
 'min_samples_leaf': 0.2,
 'max_features': 16,
 'max_depth': 27}

In [42]:
ideal_rs2 = RandomForestClassifier(n_estimators = 960,
                                   min_samples_split = 112,
                                     min_samples_leaf = 0.2,
                                     max_features = 3,
                                     max_depth = 26,
                                   n_jobs = -1,
                                   random_state = 165)

In [43]:
ideal_rs2.fit(x_train,y_train)

In [44]:
ideal_rs2.score(x_valid, y_valid)

0.8379888268156425

In [45]:
pre = pd.read_csv("Predict_rs.csv")
pre.drop("PassengerID",axis=1, inplace =True)

# XGBoost

In [34]:
import xgboost

  from pandas import MultiIndex, Int64Index


In [36]:
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score


In [38]:
xgb = XGBClassifier()

In [39]:
xgb.fit(x,y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




In [55]:
xg_pred= xgb.predict(test)

In [43]:
test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,2,34.50000,0,0,153,7.8292,0,2
1,893,3,1,47.00000,1,0,222,7.0000,0,3
2,894,2,2,62.00000,0,0,74,9.6875,0,2
3,895,3,2,27.00000,0,0,148,8.6625,0,3
4,896,3,1,22.00000,1,1,139,12.2875,0,3
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,2,30.27259,0,0,268,8.0500,0,3
414,1306,1,1,39.00000,0,0,325,108.9000,23,1
415,1307,3,2,38.50000,0,0,347,7.2500,0,3
416,1308,3,2,30.27259,0,0,221,8.0500,0,3


In [56]:
xg_predict = pd.DataFrame()
xg_predict["PassengerID"] = test["PassengerId"]
xg_predict["Survived"] = xg_pred

xg_predict.to_csv("xg_pred.csv", index =False)

In [57]:
xg_predict

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
