# Import the necessary libraries and load the data

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
# Read the Kaggle Titanic train and test splits into two DataFrames.
Titanic_data_train, Titanic_data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)

# Let's do a mini exploratory data analysis

Consider each variable separately and draw conclusions on it. Let's start with the ticket class variable.

In [None]:
# Number of training passengers in each ticket class.
sns.countplot(data=Titanic_data_train, x='Pclass')

In [None]:
# Ticket-class counts, with one bar per sex inside each class.
sns.countplot(data=Titanic_data_train, x='Pclass', hue='Sex')

In most cases, the passengers in each class are men. The distribution of women by class is approximately equal. Let's look at the dependence of surviving passengers on their ticket class.

In [None]:
# Survival outcome counts, broken down by ticket class.
sns.countplot(data=Titanic_data_train, x='Survived', hue='Pclass')

It can be concluded that most of the third class passengers did not survive. We also see that the distribution of surviving passengers is roughly equal by class, as is the distribution of women by class. It is necessary to check the distribution of sexes in survival.

In [None]:
# Survival outcome counts, broken down by sex.
sns.countplot(data=Titanic_data_train, x='Survived', hue='Sex')

CONCLUSION: Women are more likely to survive than men. Let's look at the number of siblings and spouses that passengers had on board.

In [None]:
# Number of passengers for each value of SibSp.
sns.countplot(data=Titanic_data_train, x='SibSp')

The graph shows that most of the passengers travelled without siblings or a spouse. Let's look at the distribution of surviving passengers depending on the number of siblings or spouses on board.

In [None]:
# SibSp counts, split by survival outcome.
sns.countplot(data=Titanic_data_train, x='SibSp', hue='Survived')

We also have a similar Parch variable that tells us the number of parents or children on board, let's analyze it.

In [None]:
# Parch counts, split by survival outcome.
sns.countplot(data=Titanic_data_train, x='Parch', hue='Survived')

We also have an Embarked variable - the port of embarkation; let's analyze it.

In [None]:
# Embarked counts, split by survival outcome.
sns.countplot(data=Titanic_data_train, x='Embarked', hue='Survived')

Consider the distribution of quantitative variables.

In [None]:
# Side-by-side histograms (with KDE overlay) of Age and Fare, coloured by survival.
plt.figure(figsize=(16, 6))

for position, column, title in ((121, 'Age', 'Age Histogram'),
                                (122, 'Fare', 'Fare Histogram')):
    plt.subplot(position)
    sns.histplot(data=Titanic_data_train, x=column, hue='Survived', kde=True)
    plt.title(title)

plt.show()

From the Embarked plot above, it can be assumed that the probability of survival varies with the port of embarkation.

Let's build a heat map

In [None]:
# Annotated heat map of the pairwise correlations between the numeric columns.
numeric_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
correlations = Titanic_data_train[numeric_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlations, annot=True)
plt.show()

Remove unnecessary features

In [None]:
# Discard the Cabin, Ticket and Name columns from both splits.
dropped_columns = ['Cabin', 'Ticket', 'Name']
Titanic_data_train = Titanic_data_train.drop(columns=dropped_columns)
Titanic_data_test = Titanic_data_test.drop(columns=dropped_columns)

Let's replace port values with categories 1,2,3

In [None]:
# Map the port letters in Embarked to the integer codes 1, 2, 3 in both splits.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
Titanic_data_train = Titanic_data_train.replace({'Embarked': embarked_codes})
Titanic_data_test = Titanic_data_test.replace({'Embarked': embarked_codes})

Let's fill the missing values of the port of embarkation with the mode, since there are very few of them

In [None]:
# Fill the missing Embarked codes in the training data with the most frequent code.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on a column selection: under pandas Copy-on-Write that call only changes a
# temporary object and leaves the frame untouched.
# mode() returns a Series (several values on ties), so the first entry is taken
# explicitly rather than passing the whole Series to int().
embarked_mode = Titanic_data_train['Embarked'].mode().iloc[0]
Titanic_data_train['Embarked'] = Titanic_data_train['Embarked'].fillna(embarked_mode)

PassengerId is the index of the passenger, convert it to a table index

In [None]:
# Promote PassengerId from a regular column to the row index of both splits.
Titanic_data_train = Titanic_data_train.set_index(keys='PassengerId')
Titanic_data_test = Titanic_data_test.set_index(keys='PassengerId')

Recode the categorical features Sex,Pclass,Embarked

In [None]:
# Encode Sex as a binary number in both splits: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
Titanic_data_train = Titanic_data_train.replace({'Sex': sex_codes})
Titanic_data_test = Titanic_data_test.replace({'Sex': sex_codes})

In [None]:
# One-hot encode Pclass: build the indicator columns (prefixed 'Pclass'),
# attach them by PassengerId, then drop the original column.
Pclass_train = pd.get_dummies(Titanic_data_train['Pclass']).add_prefix('Pclass')
Pclass_test = pd.get_dummies(Titanic_data_test['Pclass']).add_prefix('Pclass')

Titanic_data_train = (
    Titanic_data_train
    .merge(Pclass_train, on='PassengerId')
    .drop(columns=['Pclass'])
)
Titanic_data_test = (
    Titanic_data_test
    .merge(Pclass_test, on='PassengerId')
    .drop(columns=['Pclass'])
)

In [None]:
# One-hot encode Embarked: build the indicator columns (prefixed 'Embarked'),
# attach them by PassengerId, then drop the original column.
Embarked_train = pd.get_dummies(Titanic_data_train['Embarked']).add_prefix('Embarked')
Embarked_test = pd.get_dummies(Titanic_data_test['Embarked']).add_prefix('Embarked')

Titanic_data_train = (
    Titanic_data_train
    .merge(Embarked_train, on='PassengerId')
    .drop(columns=['Embarked'])
)
Titanic_data_test = (
    Titanic_data_test
    .merge(Embarked_test, on='PassengerId')
    .drop(columns=['Embarked'])
)

In [None]:
# Strip the '.0' suffix from the training indicator columns.
# NOTE(review): presumably needed so the train names match the test names — confirm.
embarked_label_fix = {
    'Embarked1.0': 'Embarked1',
    'Embarked2.0': 'Embarked2',
    'Embarked3.0': 'Embarked3',
}
Titanic_data_train = Titanic_data_train.rename(columns=embarked_label_fix)

Let's fill the missing ticket price values in the test data with the average, since there are very few of them

In [None]:
# Fill the missing Fare values in the test data with the mean test fare.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on a column selection: under pandas Copy-on-Write that call only changes a
# temporary object and leaves the frame untouched.
# The mean is used as-is; the previous int() cast truncated it to a whole number.
fare_mean = Titanic_data_test['Fare'].mean()
Titanic_data_test['Fare'] = Titanic_data_test['Fare'].fillna(fare_mean)

### Let's replace the missing age values using a simple regression model. To do this, we combine our data together

In [None]:
# Keep the target aside, remove it from the training features, and stack the
# train and test features (train rows first) into one frame for age imputation.
y = Titanic_data_train['Survived']
Titanic_data_train = Titanic_data_train.drop(columns=['Survived'])
Age = pd.concat([Titanic_data_train, Titanic_data_test], axis=0)

In [None]:
# Mark missing ages with the sentinel -1000 so later cells can select them by sign.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on a column selection: under pandas Copy-on-Write that call only changes a
# temporary object, so the sentinel would never reach the frame.
Age['Age'] = Age['Age'].fillna(-1000)

In [None]:
# Split the combined frame by the sign of Age:
#   Valid_Age - rows with a negative Age (the -1000 sentinel for missing ages);
#   Data_Age  - rows with a known, non-negative Age.
# '>= 0' rather than '> 0' so that a row whose Age is exactly 0 is kept in
# Data_Age instead of being dropped from both frames.
Valid_Age = Age.loc[Age['Age'] < 0]
Data_Age = Age.loc[Age['Age'] >= 0]

In [None]:
# Remove the (sentinel-filled) Age column from the rows whose age is to be predicted.
Valid_Age = Valid_Age.drop(columns=['Age'])

In [None]:
# Separate the rows with a known age into the regression target (Age)
# and the remaining feature columns.
Data_Age_y = Data_Age['Age']
Data_Age_X = Data_Age.drop(columns=['Age'])

In [None]:
# Hold out 10% of the known-age rows; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    Data_Age_X, Data_Age_y, test_size=0.1, random_state=42
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error
from math import exp

In [None]:
# Fit a linear regression of Age on the other features, then write the predicted
# ages into a new Age column for the rows that had no age.
# NOTE(review): linear predictions are unbounded, so an imputed age could in
# principle come out negative — worth checking the predicted range.
clf = LinearRegression()
clf.fit(X=X_train, y=y_train)
y_val = clf.predict(Valid_Age)
Valid_Age = Valid_Age.assign(Age=list(y_val))

In [None]:
# Recombine the known-age and imputed-age rows and order them by the index.
Data = pd.concat([Data_Age, Valid_Age]).sort_index()

Normalizing the general dataset

In [None]:
# Rescale every column with a MinMaxScaler fitted on the combined frame.
# The scaler returns a plain array, so the column names are re-attached; the
# resulting frame has a fresh 0-based index instead of the original one.
scaler = preprocessing.MinMaxScaler()
names = Data.columns
d = scaler.fit(Data).transform(Data)

scaled_df = pd.DataFrame(data=d, columns=names)
scaled_df.head()

Divide our sample again into training and test data

In [None]:
# Split the scaled frame back into its train and test parts by position.
# The training rows come first, and there is exactly one target value in `y`
# per training row, so len(y) replaces the hard-coded row count 891.
n_train_rows = len(y)
Train_dataset = scaled_df.iloc[:n_train_rows]
Test_dataset = scaled_df.iloc[n_train_rows:]

Let's start predicting the category of survival for passengers, for this we will train the basic forecasting models, then we will select several models that are radically different in nature and make a simple ensemble for averaging the models

In [None]:
# Hold out 33% of the labelled rows for evaluation; seeded for reproducibility.
# This reuses (overwrites) the X_train/X_test/y_train/y_test names from the age model.
X_train, X_test, y_train, y_test = train_test_split(
    Train_dataset,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score

In [None]:
# Fit four baseline classifiers on the training split and print accuracy,
# precision, recall and F1 on the held-out split for each of them.
# The random forest is seeded (42, as in the train/test splits) so the reported
# scores are reproducible between runs; the other three are left at defaults.
models = [
    RandomForestClassifier(random_state=42),
    KNeighborsClassifier(),
    SVC(),
    LogisticRegression(),
]

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)

    print(f'model: {str(m)}')
    print(f'Accuracy_score: {accuracy_score(y_test,y_pred)}')
    print(f'Precission_score: {precision_score(y_test,y_pred)}')
    print(f'Recall_score: {recall_score(y_test,y_pred)}')
    print(f'F1-score: {f1_score(y_test,y_pred)}')
    print('-'*30, '\n')

In [None]:
# 5-fold grid search over max_iter for logistic regression; the best estimator
# is kept as best_clf1 and scored on the held-out split.
clf = LogisticRegression()
parametres = dict(max_iter=[100, 200, 400, 800], n_jobs=[-1])

grid_search_cv_clf = GridSearchCV(estimator=clf, param_grid=parametres, cv=5)
grid_search_cv_clf.fit(X_train, y_train)

best_clf1 = grid_search_cv_clf.best_estimator_
y_pred1 = best_clf1.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred1)}')
In [None]:
# 5-fold grid search over leaf_size and p for k-nearest neighbours; the best
# estimator is kept as best_clf2 and scored on the held-out split.
clf = KNeighborsClassifier()
parametres = dict(leaf_size=[10, 20, 30, 50, 100], p=[1, 2], n_jobs=[-1])

grid_search_cv_clf = GridSearchCV(estimator=clf, param_grid=parametres, cv=5)
grid_search_cv_clf.fit(X_train, y_train)

best_clf2 = grid_search_cv_clf.best_estimator_
y_pred2 = best_clf2.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred2)}')

Let's predict the values for the test sample and also get the class probabilities. This is necessary to build a simple ensemble of algorithms that averages the predicted probabilities

In [None]:
# Class-probability estimates of both tuned models on the test features.
y_predicted_prob1, y_predicted_prob2 = (
    fitted_model.predict_proba(Test_dataset)
    for fitted_model in (best_clf1, best_clf2)
)

In [None]:
# Element-wise average of the two models' class-probability arrays.
summ = np.add(y_predicted_prob1, y_predicted_prob2) / 2

In [None]:
# First column of the averaged probabilities, one value per test row.
# NOTE(review): presumably this is P(Survived == 0), since sklearn orders
# predict_proba columns by classes_ — confirm.
class_Survived = list(summ[:, 0])

In [None]:
# Turn each first-column probability into a label: 1 when it is below 0.5, else 0.
Pred_survived = [1 if j < 0.5 else 0 for j in class_Survived]

Let's briefly evaluate the logistic regression with a ROC curve

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the tuned logistic regression on the held-out split.
# roc_curve needs continuous scores to sweep the decision threshold. Hard 0/1
# predictions give a single operating point (a three-point "curve") and a
# misleading AUC, so the positive-class probability is used instead.
y_score1 = best_clf1.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score1)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange',
         label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

Write the resulting data to a file

In [None]:
# Re-read the raw test file; the next cell takes the PassengerId column from it.
Tit = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')

In [None]:
# Put the passenger ids and the predicted labels side by side (aligned on row position).
submissions = pd.concat(
    objs=[Tit['PassengerId'], pd.Series(Pred_survived)],
    axis='columns',
)

In [None]:
# The unnamed prediction Series arrives as column 0; give it the name 'Survived'.
submissions = submissions.rename({0: 'Survived'}, axis='columns')

In [None]:
# Write the submission table to CSV without the row index.
submissions.to_csv(path_or_buf='submissionnew4.csv', index=False)