In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read the Data

In [None]:
# Load the Titanic training CSV from the read-only Kaggle input directory.
train_csv_path = "/kaggle/input/titanic/train.csv"
df = pd.read_csv(train_csv_path)

# Understand the data

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

# Remove columns that are not needed for learning

In [None]:
# Columns left out of the feature set.
r = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# drop() already returns a new frame; the extra copy() keeps df1 fully
# independent of df for the in-place column assignments made later.
df1 = df.drop(columns=r).copy()
df1.head()

In [None]:
# Confirm the dropped columns are gone and check the remaining non-null counts.
df1.info()

# Deal with missing data

Age and Embarked still have missing values; Cabin does too, but it was already dropped above.

In [None]:
# Number of missing values in each remaining column.
df1.isnull().sum()

In [None]:
# Most frequent Embarked value; the next cell uses it to fill the gaps.
df1["Embarked"].mode()[0]

In [None]:
# Impute missing Embarked entries with the column's most frequent value.
embarked_mode = df1["Embarked"].mode()[0]
df1["Embarked"] = df1["Embarked"].fillna(embarked_mode)

In [None]:
# Impute missing ages with the mean of the observed ages.
mean_age = df1["Age"].mean()
df1["Age"] = df1["Age"].fillna(mean_age)

# Encode Categorical values

In [None]:
# Peek at two rows to spot the non-numeric columns.
df1.head(2)

In [None]:
# dtypes after imputation; the non-numeric columns are the ones that need encoding.
df1.info()

In [None]:
# Names of the categorical columns that get one-hot encoded below.
cc=['Sex','Embarked']

# Create Encoded Columns

In [None]:
# One-hot encode the categorical columns: every distinct value becomes its own
# indicator column. NOTE: this reads df1[cc], so it must run before the cell
# that drops those columns from df1.
dummy=pd.get_dummies(df1[cc])
dummy.head()

In [None]:
# get_dummies returned a new frame, so df1 still holds the original categorical columns here.
df1.head()

# Remove the categorical columns (`cc`) and replace them with the encoded data

In [None]:
# Remove the raw categorical columns now that their encoded versions exist.
# NOTE: df1 is rebound here, so running this cell a second time raises KeyError.
df1 = df1.drop(columns=cc)

In [None]:
# Put the numeric columns and the indicator columns side by side (row-aligned on the index).
frames_to_join = (df1, dummy)
df2 = pd.concat(frames_to_join, axis="columns")
df2.head()


# Divide into features and Target

In [None]:
# Positional split: column 0 of df2 is taken as the target, every other column as a feature.
# NOTE(review): this relies on the target being the first column of df2 - confirm if columns are reordered.
y, X = df2.iloc[:, 0], df2.iloc[:, 1:]

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target column.
y.head()

# Check whether target is balanced or not

In [None]:
# Number of rows per target class, to judge class balance.
y.value_counts()

# Run classification without scaling

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split

# Repeat a stratified 75/25 split 99 times on the unscaled features and record
# the test accuracy of a decision tree for each split.
sn = []
score = []
for i in range(1, 100):
    # Seeding each split with its run number keeps the 99 splits different from
    # one another while making every score repeatable after a kernel restart.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.25, random_state=i
    )
    dtc = tree.DecisionTreeClassifier(random_state=0)
    dtc.fit(X_train, y_train)
    sn.append(i)
    score.append(dtc.score(X_test, y_test))

# One row per run: run number and test accuracy.
result = pd.DataFrame({"SN": sn, "Score": score})

In [None]:
# Summary statistics of the per-run test accuracies (unscaled features).
result.describe()

In [None]:
# Save the unscaled-run scores to the working directory; the DataFrame index is
# written as an extra first column because index=True is the to_csv default.
result.to_csv("r1.csv")

# Use minmax scaler

In [None]:
from sklearn import preprocessing

# Rescale every feature linearly into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split, so the later test rows contribute to the min/max it learns.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_minmax = minmax_scale.fit(X).transform(X)

In [None]:
# Inspect the scaled values; the next cell wraps them back into a DataFrame with the column names.
X_minmax

In [None]:
# Re-attach the original feature names to the min-max scaled values.
X1 = pd.DataFrame(X_minmax, columns=X.columns)
X1

In [None]:
# Imported once up front instead of on every loop iteration.
from sklearn.model_selection import train_test_split

# Same experiment as before, this time on the min-max scaled features (X1):
# 99 stratified 75/25 splits, one decision tree per split.
sn = []
score = []
for i in range(1, 100):
    # Seeding each split with its run number makes the scores repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X1, y, stratify=y, test_size=0.25, random_state=i
    )
    dtc = tree.DecisionTreeClassifier(random_state=0)
    dtc.fit(X_train, y_train)
    sn.append(i)
    score.append(dtc.score(X_test, y_test))

# One row per run: run number and test accuracy.
result = pd.DataFrame({"SN": sn, "Score": score})

# Show a rendered preview rather than printing all 99 rows as plain text.
result.head()

In [None]:
# Summary statistics of the per-run test accuracies (min-max scaled features).
result.describe()

In [None]:
# Save the min-max-run scores to the working directory (index column included by default).
result.to_csv("r2.csv")

# Let's use the standard scaler and see what happens

In [None]:
from sklearn import preprocessing

# Standardise every feature (StandardScaler defaults: centre on the mean, scale to unit variance).
# NOTE(review): as with the min-max scaler, the statistics are learned from all
# rows before the train/test split.
scaler = preprocessing.StandardScaler()
X_stdscale = scaler.fit(X).transform(X)
X_stdscale

In [None]:
# Re-attach the original feature names to the standardised values.
X2 = pd.DataFrame(X_stdscale, columns=X.columns)
X2

In [None]:
from sklearn import metrics
# Imported once up front instead of on every loop iteration.
from sklearn.model_selection import train_test_split

# Same experiment on the standardised features (X2): 99 stratified 75/25
# splits, one decision tree per split.
sn = []
score = []
for i in range(1, 100):
    # Seeding each split with its run number makes the scores repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X2, y, stratify=y, test_size=0.25, random_state=i
    )
    dtc = tree.DecisionTreeClassifier(random_state=0)
    dtc.fit(X_train, y_train)
    sn.append(i)
    score.append(dtc.score(X_test, y_test))

# One row per run: run number and test accuracy.
result = pd.DataFrame({"SN": sn, "Score": score})

# Show a rendered preview rather than printing all 99 rows as plain text.
result.head()

In [None]:
# scikit-learn metrics module for the accuracy calculation
from sklearn import metrics

# Predict with the tree fitted in the last loop iteration above. Without this
# line y_pred is not defined at this point on a fresh kernel and the cell
# raises NameError.
y_pred = dtc.predict(X_test)

# Model accuracy: how often is the classifier correct on the last test split?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Summary statistics of the per-run test accuracies (standardised features).
result.describe()

In [None]:
# Save the standard-scaler-run scores to the working directory (index column included by default).
result.to_csv("r3.csv")

# Deep Learning

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Activation
# Dense is imported from keras.layers: the keras.layers.core submodule is not
# importable in current Keras releases, while this path works in old and new ones.
from keras.layers import Dense
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy




In [None]:
# (rows, n_features) of the most recent training split; n_features is the input width the network needs.
X_train.shape

In [None]:
# Input width and number of output classes are read from the data instead of
# being hard-coded, so the network stays valid if the feature set changes.
n_features = X_train.shape[1]
# Assumes the labels are the integers 0..n_classes-1, which is what
# sparse_categorical_crossentropy expects - confirm if the target is re-encoded.
n_classes = y.nunique()

# Small fully connected classifier: two hidden ReLU layers of 8 units each.
model = Sequential([
    Dense(units=8, input_shape=(n_features,), activation='relu'),
    Dense(units=8, activation='relu'),
    # One output unit per class with softmax, so the outputs form a probability
    # distribution over the classes, matching the sparse categorical
    # cross-entropy loss used at compile time.
    Dense(units=n_classes, activation='softmax')
])

In [None]:
# Training setup: Adam with a small learning rate, cross-entropy over integer
# class labels as the loss, and accuracy reported alongside it.
adam = Adam(learning_rate=0.0001)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train on the most recent training split: 30 epochs over shuffled
# mini-batches of 10 rows, with per-epoch logging (verbose=2).
training_options = {"batch_size": 10, "epochs": 30, "shuffle": True, "verbose": 2}
model.fit(X_train, y_train, **training_options)

# Predict with model

In [None]:
# True labels of the current test split as a NumPy array.
y_test.values

In [None]:
# Sequential.predict_classes() was removed from Keras, so the predicted label
# is taken explicitly as the index of the largest network output per row.
predictions = np.argmax(model.predict(X_test), axis=-1)

# Actual vs. predicted labels, one row per test sample.
# NOTE: this rebinds `df`, which held the raw training data until now; re-run
# the read_csv cell before re-running any earlier cell that uses `df`.
df = pd.DataFrame({"Actual": y_test.values, "Predicted": predictions})

# Positions of the test rows the network got right / wrong.
correct_predictions = np.nonzero(predictions == y_test.values)[0]
incorrect_predictions = np.nonzero(predictions != y_test.values)[0]
print(len(correct_predictions)," classified correctly")
print(len(incorrect_predictions)," classified incorrectly")

In [None]:
# Actual vs. predicted labels built in the previous cell.
df

# Random Forest

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=30)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

clf.score(X_test,y_test)

In [None]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
y_pred=clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Naive Bayes

In [None]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
nb = GaussianNB()

# Train the model using the training sets
nb.fit(X_train,y_train)

#Predict Output
nb.score(X_test,y_test)

# Support Vector Machine

In [None]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
sv = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
sv.fit(X_train, y_train)

#Predict the response for test dataset
# y_pred = clf.predict(X_test)
nb.score(X_test,y_test)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Bound to `knn` rather than `model`, so the Keras network stored in `model`
# is not overwritten and its cells can still be re-run afterwards.
knn = KNeighborsClassifier(n_neighbors=2)

# Fit on the current training split.
knn.fit(X_train, y_train)

# Mean accuracy on the held-out split.
knn.score(X_test, y_test)

# Boosting 

In [None]:
#Import Gradient Boosting Classifier model
from sklearn.ensemble import GradientBoostingClassifier

#Create Gradient Boosting Classifier
gb = GradientBoostingClassifier()

#Train the model using the training sets
gb.fit(X_train, y_train)

#Predict the response for test dataset
gb.score(X_test,y_test)