In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#importing the data and overview
# Reads the train and test CSVs from the titanic input directory.
# NOTE(review): test_df is loaded here but is not referenced again in the
# visible part of the notebook.

train_df= pd.read_csv('../input/titanic/train.csv')
test_df= pd.read_csv('../input/titanic/test.csv')

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Summary statistics for train_df.
train_df.describe()

In [None]:
# Number of rows in each class of the target column.
train_df['Survived'].value_counts()

# Survived encoding:
# survived = 1
# didn't survive = 0

In [None]:
#EDA
# Bar chart of the row count per Survived class.

sns.countplot(data=train_df, x='Survived')

# survived = 1
# didn't survive = 0

In [None]:
# Age distribution for each Survived class.
sns.boxplot(data=train_df, x='Survived', y='Age')

# survived = 1
# didn't survive = 0

In [None]:
# Age plotted per Sex category, with points coloured by the Survived class.
sns.scatterplot(data=train_df, x='Sex', y='Age', hue='Survived')

In [None]:
# Correlation heatmap of the numeric columns only. At this point train_df
# still contains text columns (e.g. the 'Sex' strings are only encoded further
# down), and calling corr() on the whole frame raises on pandas >= 2.0 because
# non-numeric columns are no longer dropped silently.
sns.heatmap(train_df.select_dtypes(include='number').corr(), annot=True)

In [None]:
#Data preparation
#looking for null values and taking care of them
# Number of null entries in each column of train_df.

train_df.isnull().sum()

In [None]:
# Fill missing ages with the column mean. The result is assigned back rather
# than calling fillna(inplace=True) on the train_df["Age"] selection: that
# chained form operates on an intermediate object, emits a FutureWarning on
# pandas 2.x and leaves train_df unchanged once Copy-on-Write is active.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean())

In [None]:
# Encode Sex as integers in one pass: male -> 0, female -> 1.
train_df['Sex'] = train_df['Sex'].replace({'male': 0, 'female': 1})

In [None]:
# Columns that are not used as model inputs; removed from train_df in place.
unused_columns = ['Name', 'PassengerId', 'Fare', 'Ticket', 'Embarked', 'Cabin']
train_df.drop(columns=unused_columns, inplace=True)

In [None]:
# Null counts per column for the frame as it stands at this point.
train_df.isnull().sum()

In [None]:
# Null counts per column, largest first, keeping only the columns that still
# contain at least one null.
null_counts = train_df.isnull().sum()
missing = null_counts.sort_values(ascending=False)
missing = missing[missing != 0]
missing

In [None]:
#defining features and the label
# y is the Survived column; X is every remaining column of train_df.

y = train_df['Survived']
X = train_df.drop(columns='Survived')

In [None]:
#splitting the dataset into a training part and a held-out test part
# 30% of the rows are held out; random_state fixes the shuffle.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
#feature scaling
# The scaler is fitted on the training split only.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
scaler

In [None]:
# Apply the training-fitted scaler to both splits.
scaled_X_train, scaled_X_test = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
#training the model

from sklearn.neighbors import KNeighborsClassifier

# Baseline classifier that looks at a single nearest neighbour.
knn_model = KNeighborsClassifier(n_neighbors=1).fit(scaled_X_train, y_train)
knn_model

In [None]:
#predicting test
# Class predictions of knn_model for the scaled held-out split.

y_pred= knn_model.predict(scaled_X_test)

In [None]:
#predicted value VS actual value of test data
# Side-by-side table: actual labels (Y_Test) next to predictions (Y_Pred).

pd.DataFrame({'Y_Test':y_test, 'Y_Pred': y_pred})

In [None]:
#testing the model
# The metric helpers imported here are also used by the cells that follow.

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Accuracy of y_pred against the held-out labels.
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix of actual (y_test) versus predicted (y_pred) classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Text report of the per-class metrics for y_pred.
print(classification_report(y_test, y_pred))

In [None]:
#Elbow method
# Collects the test error (1 - accuracy) for every K from 1 to 29.
# NOTE(review): the error is measured on the held-out split, so a K picked
# from this curve is tuned on the same data it is later evaluated on.

testing_error_amount = []

for k in range(1, 30):
    # Use a dedicated name so the K=1 `knn_model` fitted in the training cell
    # is not overwritten; re-running the prediction cell after this loop would
    # otherwise silently use the last model fitted here.
    elbow_model = KNeighborsClassifier(n_neighbors=k)
    elbow_model.fit(scaled_X_train, y_train)

    y_pred_test = elbow_model.predict(scaled_X_test)

    test_error = 1 - accuracy_score(y_test, y_pred_test)
    testing_error_amount.append(test_error)

In [None]:
# Test errors collected by the elbow loop, one entry per K in loop order.
testing_error_amount

In [None]:
# Test error as a function of K, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 30), testing_error_amount, label='Test Error')
ax.legend()
ax.set_ylabel('Error Amount')
ax.set_xlabel('K Value')

In [None]:
#pipeline
# Fresh, unfitted estimators to be used as pipeline steps.

knn = KNeighborsClassifier()
scaler = StandardScaler()

# Names of the parameters the KNN estimator exposes.
knn.get_params().keys()

In [None]:
# (name, estimator) steps for the pipeline. The step name 'knn' is the prefix
# used in the grid-search parameter key 'knn__n_neighbors'.
operations= [('scaler', scaler), ('knn', knn)]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Scaling followed by KNN, built from the steps defined above.
pipe = Pipeline(steps=operations)

# 5-fold cross-validated search over K = 1..19, scored by accuracy and fitted
# on the training split only.
k_values = list(range(1, 20))
param_grid = {'knn__n_neighbors': k_values}
full_cv_classifier = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
full_cv_classifier.fit(X_train, y_train)

In [None]:
# Parameters of the best pipeline found by the grid search.
full_cv_classifier.best_estimator_.get_params()

In [None]:
# Names of the entries recorded in the grid search results.
full_cv_classifier.cv_results_.keys()

In [None]:
# finalize the model
# K is read from the grid search instead of being hard-coded to 19, so the
# final pipeline always uses the value the search actually selected.
best_k = full_cv_classifier.best_params_['knn__n_neighbors']

scaler = StandardScaler()
# The variable and step name 'knn19' are kept unchanged for compatibility with
# any later code; the neighbour count now comes from best_k.
knn19 = KNeighborsClassifier(n_neighbors=best_k)
operations = [('scaler', scaler), ('knn19', knn19)]

pipe = Pipeline(operations)

pipe.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the final pipeline; unscaled X_test is
# passed because the pipeline contains its own scaler step.
pipe_pred= pipe.predict(X_test)

In [None]:
# Text report of the per-class metrics for the final pipeline's predictions.
print(classification_report(y_test, pipe_pred))