In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
train = pd.read_csv('../input/train.csv')

In [None]:
train.head()

# Exploratory Data Analysis


## Missing Data


In [None]:
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',data=train,palette='RdBu_r')

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Sex',data=train,palette='RdBu_r')

In [None]:
sns.set_style('whitegrid')
sns.countplot(x='Survived',hue='Pclass',data=train,palette='rainbow')

In [None]:
# Histogram of the known ages (missing values dropped), 30 bins, no KDE curve.
# NOTE(review): `sns.distplot` is deprecated in seaborn >= 0.11 in favour of
# `sns.histplot` / `sns.displot` — consider migrating when the environment allows.
sns.distplot(train['Age'].dropna(),kde=False,color='darkred',bins=30)

In [None]:
train['Age'].hist(bins=30,color='darkred',alpha=0.7)

In [None]:
sns.countplot(x='SibSp',data=train)

In [None]:
train['Fare'].hist(color='green',bins=40,figsize=(8,4))

In [None]:
import cufflinks as cf
cf.go_offline()

In [None]:
train['Fare'].iplot(kind='hist',bins=30,color='green')

___
# Data Cleaning


In [None]:
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=train,palette='winter')

In [None]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value from the ticket class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order ``(Age, Pclass)``, e.g. one row of
        ``df[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class.
    """
    # Read by position explicitly: plain ``cols[0]`` on a label-indexed Series
    # relies on pandas' deprecated positional fallback, while lists/tuples have
    # no ``.iloc`` and are indexed directly.
    values = getattr(cols, 'iloc', cols)
    Age = values[0]
    Pclass = values[1]

    # Known ages pass through untouched.
    if not pd.isnull(Age):
        return Age

    # Fixed per-class fill values; presumably read off the Age-by-Pclass
    # boxplot — verify against the data.
    if Pclass == 1:
        return 37
    elif Pclass == 2:
        return 29
    else:
        return 24

In [None]:
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)

Now let's check that heat map again!

In [None]:
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
train.drop('Cabin',axis=1,inplace=True)

In [None]:
train.head()

In [None]:
train.dropna(inplace=True)

## Converting Categorical Features 


In [None]:
train.info()

In [None]:
sex = pd.get_dummies(train['Sex'],drop_first=True)
embark = pd.get_dummies(train['Embarked'],drop_first=True)

In [None]:
train.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)

In [None]:
train = pd.concat([train,sex,embark],axis=1)

In [None]:
train.head()



# Building a Logistic Regression model



In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the cleaned training rows for evaluation; the fixed
# random_state makes the split repeatable.
# NOTE(review): only 'Survived' is dropped here, so every remaining column
# (presumably including PassengerId, which is never dropped above) is used as
# a feature — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived',axis=1), train['Survived'], test_size=0.30, random_state=101)

## Training and Predicting

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

In [None]:
test = pd.read_csv('../input/test.csv')

In [None]:
test.drop('Cabin',axis=1,inplace=True)

In [None]:
test['Age'] = test[['Age','Pclass']].apply(impute_age,axis=1)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `test['Fare']` selection: under pandas copy-on-write the in-place call only
# updates a temporary object and leaves `test` unchanged.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
sex = pd.get_dummies(test['Sex'],drop_first=True)
embark = pd.get_dummies(test['Embarked'],drop_first=True)

In [None]:
test.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)

In [None]:
test= pd.concat([test,sex,embark],axis=1)

In [None]:
predictions = logmodel.predict(X_test)

Let's move on to evaluate our model!

## Evaluation

We can check precision, recall, and F1-score using the classification report!

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,predictions))

In [None]:
test_pred = logmodel.predict(test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fix the seed so the forest (bootstrap samples and feature sub-sampling) is
# reproducible across runs, in line with the seeded train/test split above.
rfmodel = RandomForestClassifier(n_estimators=300, random_state=101)

In [None]:
rfmodel.fit(X_train, y_train)

In [None]:
pred_rf = rfmodel.predict(X_test)

In [None]:
# classification_report expects (y_true, y_pred); passing the predictions first
# swaps the roles, so the per-class precision and recall come out exchanged.
print(classification_report(y_test, pred_rf))

In [None]:
rfmodel.score(X_test, y_test)

In [None]:
pred_results = rfmodel.predict(test)

In [None]:
test['Survived'] = pred_results

In [None]:
submission = test[['PassengerId', 'Survived']]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
knn.fit(X_train, y_train)

In [None]:
pred = knn.predict(X_test)


In [None]:
print(classification_report(y_test, pred))


In [None]:
# Fit one KNN classifier per candidate K (1..39) and record, for each, the
# fraction of X_test rows whose prediction differs from y_test.
error_rate = []

for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    misclassified = pred_i != y_test
    error_rate.append(np.mean(misclassified))

In [None]:
# Plot the error rate collected for each K in 1..39.
plt.figure(figsize=(10, 6))
plt.plot(
    range(1, 40),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:

knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

In [None]:
print(classification_report(y_test, pred))


In [None]:
#catboost
train = pd.read_csv('../input/train.csv')
test =  pd.read_csv('../input/test.csv')

In [None]:
train.fillna(-999,inplace=True)
test.fillna(-999,inplace=True)

In [None]:
x = train.drop('Survived',axis=1)
y = train.Survived

In [None]:
x.dtypes

In [None]:
# Positional indices of every column of `x` whose dtype is not float; these
# are the columns later passed to CatBoost as `cat_features`.
non_float_mask = x.dtypes != float
(cat_features_index,) = np.where(non_float_mask)

In [None]:
import hyperopt
from catboost import Pool, CatBoostClassifier, cv

In [None]:
model = CatBoostClassifier(eval_metric='Accuracy',use_best_model=True,random_seed=42)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=101)

In [None]:
model.fit(X_train,y_train,cat_features=cat_features_index,eval_set=(X_test,y_test))

In [None]:
from sklearn.metrics import accuracy_score
print('the test accuracy is :{:.6f}'.format(accuracy_score(y_test,model.predict(X_test))))

In [None]:
# Predict on the Kaggle test frame and build the two-column submission table.
pred = model.predict(test)
# `np.int` was only an alias of the builtin and has been removed from NumPy
# (1.24+); the builtin `int` yields the same integer labels on every version.
pred = pred.astype(int)
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})

In [None]:
submission.to_csv('catboost.csv',index=False)
