# Include the necessary libraries

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the train and test splits from the Kaggle input directory.
Titanic_data_train=pd.read_csv('../input/spaceship-titanic/train.csv')
Titanic_data_test=pd.read_csv('../input/spaceship-titanic/test.csv')

# Execute mini EDA

In [None]:
# Headline size figures for the training frame; the \033[94m prefix is an ANSI colour code.
n_train_rows, n_train_columns = Titanic_data_train.shape
n_train_values = Titanic_data_train.count().sum()
n_train_missing = sum(Titanic_data_train.isna().sum())

print(f'\033[94mNumber of rows in train data: {n_train_rows}')
print(f'\033[94mNumber of columns in train data: {n_train_columns}')
print(f'\033[94mNumber of values in train data: {n_train_values}')
print(f'\033[94mNumber missing values in train data: {n_train_missing}')

In [None]:
# Per-column missing-value counts and percentages for both splits.
train_null_counts = Titanic_data_train.isnull().sum()
test_null_counts = Titanic_data_test.isnull().sum()

train_miss = pd.DataFrame({
    'missing': train_null_counts,
    'ratio': np.round(train_null_counts / Titanic_data_train.shape[0], 4) * 100,
})
test_miss = pd.DataFrame({
    'missing': test_null_counts,
    'ratio': np.round(test_null_counts / Titanic_data_test.shape[0], 4) * 100,
})
# Transposed so the columns of the data set run across the page.
train_miss.T

In [None]:
# Grouped bar chart of the missing-value counts per column, train next to test.
fig, ax = plt.subplots(figsize=(10,5))

width = 0.35
# One bar group per train column except the last one
# (presumably the target 'Transported', which the test file lacks — TODO confirm).
x = np.arange(len(Titanic_data_train.columns[:-1].to_list()))
# Tick labels are the test column names in alphabetical order, matching the sort_index() calls below.
x_ = sorted(Titanic_data_test.columns.to_list())
# Train series drop their last row before sorting so they line up with the test series.
y1 = train_miss['missing'][:-1].sort_index()
y1_ = train_miss['ratio'][:-1].sort_index()
y2 = test_miss['missing'].sort_index()
y2_ = test_miss['ratio'].sort_index()

# Shift the two bar sets half a bar-width left/right so they sit side by side.
bar1 = ax.bar(x-width/2, y1, width, label="train data", color="cornflowerblue")
bar2 = ax.bar(x+width/2, y2, width, label="test data", color="lightsalmon")
    
ax.set_xticks(x, x_, rotation=30)
ax.set_ylim(0,300)
# Two labels per bar: the raw count (larger padding, drawn higher) and the percentage just above the bar.
ax.bar_label(bar1, padding=12, size=12)
ax.bar_label(bar2, padding=12, size=12)
ax.bar_label(bar1, labels=['(%.2f%%)' %y for y in y1_], padding=3, size=10)
ax.bar_label(bar2, labels=['(%.2f%%)' %y for y in y2_], padding=3, size=10)
ax.set_title("Missing Ratio", pad=10, size=20)
ax.legend(loc='best', fontsize=12)

fig.tight_layout()
plt.show()

Missing values are restored as follows: for categorical variables we predict the missing entries with a random forest; for quantitative variables we replace the missing entries with the column median.

In [None]:
# Summary statistics of the training frame.
Titanic_data_train.describe()

In [None]:
# Square canvas for the pie chart of the target classes.
plt.figure(figsize=(6, 6))

target_counts = Titanic_data_train['Transported'].value_counts()
pie_ax = target_counts.plot.pie(
    explode=[0.1, 0.1],
    autopct='%1.1f%%',
    shadow=True,
    textprops={'fontsize': 16},
)
pie_ax.set_title("Target distribution")

Let's analyze categorical variables and draw conclusions

In [None]:
# Passenger counts per home planet, split by the target.
sns.countplot(data=Titanic_data_train, x='HomePlanet', hue='Transported')

The target classes are imbalanced across home planets, so a dependence between HomePlanet and Transported is possible.

In [None]:
# Passenger counts per CryoSleep value, split by the target.
sns.countplot(data=Titanic_data_train, x='CryoSleep', hue='Transported')

These variables are clearly correlated.

In [None]:
# Passenger counts per destination, split by the target.
sns.countplot(data=Titanic_data_train, x='Destination', hue='Transported')

There is little correlation for the planet 55 Cancri e

Let us estimate the distribution of quantitative variables

In [None]:
# Summary statistics again, this time to read off the quantitative variables.
Titanic_data_train.describe()

In [None]:
# Age distribution split by the target, drawn in the left half of a wide figure.
plt.figure(figsize=(16, 6))

age_ax = plt.subplot(1, 2, 1)
sns.histplot(data=Titanic_data_train, x='Age', hue='Transported', kde=True, ax=age_ax)
age_ax.set_title('Age Histogram')

plt.show()

The distribution of Age is close to normal; the remaining quantitative variables have a median of 0, so filling their missing values with the median amounts to filling them with zeros.

Let's build a heat map to check correlations between variables.

In [None]:
# Correlation heat map of the training frame.
# Only numeric and boolean columns are passed to corr(): from pandas 2.0 on,
# DataFrame.corr() no longer drops string columns silently and raises instead.
plt.figure(figsize=(8,6))
numeric_train = Titanic_data_train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(),annot=True)
plt.show()

There is a slight negative correlation between a passenger's ability to pay and their chance of being transported.

# Let's start filling in the gaps

Split the PassengerId variable into the group number and the passenger's identification number within the group (only the group number is kept).

In [None]:
# Group number = the integer part of PassengerId in front of the underscore.
Id_train = Titanic_data_train.PassengerId.tolist()
Id1_train = [passenger_id.split('_') for passenger_id in Id_train]
Gn_train = [int(id_parts[0]) for id_parts in Id1_train]
Titanic_data_train = Titanic_data_train.assign(GroupNumber=Gn_train)

In [None]:
# Same group-number extraction for the test split.
Id_test = Titanic_data_test.PassengerId.tolist()
Id1_test = [passenger_id.split('_') for passenger_id in Id_test]
Gn_test = [int(id_parts[0]) for id_parts in Id1_test]
Titanic_data_test = Titanic_data_test.assign(GroupNumber=Gn_test)

Set PassengerId as the index of both tables

In [None]:
# Use PassengerId as the row index of both splits.
Titanic_data_train=Titanic_data_train.set_index('PassengerId')
Titanic_data_test=Titanic_data_test.set_index('PassengerId')

In [None]:
# Keep the target aside and remove it from the feature frame.
y = Titanic_data_train['Transported']
Titanic_data_train = Titanic_data_train.drop(columns=['Transported'])

In [None]:
# (rows, columns) of the training features after the target was removed.
Titanic_data_train.shape

In [None]:
# (rows, columns) of the test features.
Titanic_data_test.shape

Let's connect the tables for more correct filling in the gaps

In [None]:
# Stack train on top of test; the train rows come first.
Titanic_data=pd.concat([Titanic_data_train,Titanic_data_test])

In [None]:
# (rows, columns) of the combined table.
Titanic_data.shape

In [None]:
# The Name column is not used from here on.
Titanic_data = Titanic_data.drop(columns=['Name'])

In [None]:
# Missing-value count per column of the combined table.
Titanic_data.isnull().sum()

### Let's create a table without missing values. This table will help us in predicting missing table values.

In [None]:
# Rows of the combined table that have no missing value in any column.
With1=Titanic_data.dropna()

Let's split the Cabin value into the two features of interest to us: Deck and Side

In [None]:
# Cabin strings are split on '/'; the first part becomes Deck and the third part Side.
Cabin = With1.Cabin.tolist()
Cabin_list = [cabin.split('/') for cabin in Cabin]
Deck = [cabin_parts[0] for cabin_parts in Cabin_list]
Side = [cabin_parts[2] for cabin_parts in Cabin_list]
With1 = With1.assign(Deck=Deck, Side=Side)

In [None]:
# Deck and Side replace the raw Cabin string.
With1 = With1.drop(columns=['Cabin'])

In [None]:
# Separate views of the categorical and the spending columns of the complete rows.
With1_categorical = With1.loc[:, ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side', 'VIP']]
With1_numeric = With1.loc[:, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]

In [None]:
# Fit the ordinal encoder on the categorical columns of the complete rows
# and keep the encoded values in a frame with the same column names.
encoder = OrdinalEncoder()
encoder_train = pd.DataFrame(
    data=encoder.fit_transform(With1_categorical),
    columns=['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side', 'VIP'],
)

In [None]:
# Overwrite the categorical columns of With1 with their ordinal codes.
# The values are copied positionally because encoder_train has a fresh 0..n-1 index.
names_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'Deck', 'Side', 'VIP']
for m in names_columns:
    With1[m] = encoder_train[m].tolist()

#### Great, let's predict HomePlanet for the rows of the general table where it is missing. To do this, we split the With1 table into training and test samples, with HomePlanet as the target

In [None]:
# GroupNumber is not used as a feature for the imputation models.
With1 = With1.drop(columns=['GroupNumber'])

In [None]:
# Train a random forest on the complete rows to predict HomePlanet from the other columns,
# holding out 5% of the rows to report accuracy.
X_Home=With1.drop(['HomePlanet'],axis=1)
y_Home=With1.HomePlanet
X_train, X_test, y_train, y_test = train_test_split(X_Home,
                                                    y_Home,
                                                    test_size=0.05,
                                                    random_state=42)
# The forest is seeded like the split, so the imputed values are the same on every run.
clf=RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred)}')

Take a sample with missing 'HomePlanet' values from the population

In [None]:
# Rows whose HomePlanet is missing, without the target column and without GroupNumber.
Home_data=Titanic_data[Titanic_data['HomePlanet'].isnull()].drop(['HomePlanet','GroupNumber'],axis=1)

In [None]:
# Fill the remaining gaps in the feature columns so the classifier can score every row.
# One DataFrame.fillna call replaces the chained `Home_data.col.fillna(..., inplace=True)` calls,
# which act on a temporary Series and leave Home_data untouched under pandas Copy-on-Write.
# NOTE(review): 'B/0/P', 25 and 100 look like ad-hoc placeholder values — confirm.
Home_data = Home_data.fillna({
    'Cabin': 'B/0/P',
    'CryoSleep': True,
    'Destination': 'TRAPPIST-1e',
    'Age': 25,
    'VIP': False,
    'RoomService': 100,
    'FoodCourt': 100,
    'ShoppingMall': 100,
    'Spa': 100,
    'VRDeck': 100,
})

In [None]:
# Replace Cabin by its first ('/'-separated) part as Deck and its third part as Side.
Cabin = Home_data.Cabin.tolist()
Cabin_list = [cabin.split('/') for cabin in Cabin]
Deck = [cabin_parts[0] for cabin_parts in Cabin_list]
Side = [cabin_parts[2] for cabin_parts in Cabin_list]
Home_data = Home_data.assign(Deck=Deck, Side=Side).drop(columns=['Cabin'])

In [None]:
# Encode the categorical features with the categories learned by `encoder` on the complete
# rows, so each label gets the integer code the classifier was trained on. Fitting a fresh
# encoder on this subset would renumber the codes whenever a category does not occur here.
fitted_columns = ['HomePlanet','CryoSleep','Destination','Deck','Side','VIP']
fitted_categories = dict(zip(fitted_columns, encoder.categories_))
names_columns=['CryoSleep','Destination','Deck','Side','VIP']
Home_data_categorical=Home_data[names_columns]
encoder1 = OrdinalEncoder(categories=[fitted_categories[name] for name in names_columns])
encoder_train = pd.DataFrame(encoder1.fit_transform(Home_data_categorical),columns=names_columns)
# encoder_train has a fresh 0..n-1 index, so the codes are copied back positionally.
for m in names_columns:
    Home_data[m]=list(encoder_train[m])

In [None]:
# Predicted HomePlanet (as encoded class labels) for the rows where it was missing.
y_pred_Home = clf.predict(Home_data)

In [None]:
# Attach the predictions to the working frame.
Home_data['HomePlanet']=list(y_pred_Home)

In [None]:
# One-column frame holding just the predicted HomePlanet codes.
HomePlanet=pd.DataFrame(Home_data.HomePlanet)

In [None]:
# Turn the predicted codes back into planet names. The code -> name table is read from the
# fitted `encoder` (HomePlanet is the first column it was fitted on) instead of being typed
# in by hand, so it cannot drift from the codes the encoder assigned.
home_planet_labels = dict(enumerate(encoder.categories_[0]))
HomePlanet = HomePlanet.replace({'HomePlanet' : home_planet_labels})

In [None]:
# Write the decoded predictions into the rows of the combined table where HomePlanet is missing (positional match).
Titanic_data.loc[Titanic_data['HomePlanet'].isnull(),'HomePlanet']=list(HomePlanet.HomePlanet)

#### Predicting the value of CryoSleep

In [None]:
# Train a random forest on the complete rows to predict CryoSleep from the other columns,
# holding out 5% of the rows to report accuracy.
X_Home=With1.drop(['CryoSleep'],axis=1)
y_Home=With1.CryoSleep
X_train, X_test, y_train, y_test = train_test_split(X_Home,
                                                    y_Home,
                                                    test_size=0.05,
                                                    random_state=42)
# The forest is seeded like the split, so the imputed values are the same on every run.
clf=RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred)}')

In [None]:
# Rows whose CryoSleep is missing, without the target column and without GroupNumber.
Cryo_data=Titanic_data[Titanic_data['CryoSleep'].isnull()].drop(['CryoSleep','GroupNumber'],axis=1)

In [None]:
# Fill the remaining gaps in the feature columns so the classifier can score every row.
# One DataFrame.fillna call replaces the chained `Cryo_data.col.fillna(..., inplace=True)` calls,
# which act on a temporary Series and leave Cryo_data untouched under pandas Copy-on-Write.
# NOTE(review): 'B/0/P', 25 and 100 look like ad-hoc placeholder values — confirm.
Cryo_data = Cryo_data.fillna({
    'Cabin': 'B/0/P',
    'Destination': 'TRAPPIST-1e',
    'Age': 25,
    'VIP': False,
    'RoomService': 100,
    'FoodCourt': 100,
    'ShoppingMall': 100,
    'Spa': 100,
    'VRDeck': 100,
})

In [None]:
# Replace Cabin by its first ('/'-separated) part as Deck and its third part as Side.
Cabin = Cryo_data.Cabin.tolist()
Cabin_list = [cabin.split('/') for cabin in Cabin]
Deck = [cabin_parts[0] for cabin_parts in Cabin_list]
Side = [cabin_parts[2] for cabin_parts in Cabin_list]
Cryo_data = Cryo_data.assign(Deck=Deck, Side=Side).drop(columns=['Cabin'])

In [None]:
# Encode the categorical features with the categories learned by `encoder` on the complete
# rows, so each label gets the integer code the classifier was trained on. Fitting a fresh
# encoder on this subset would renumber the codes whenever a category does not occur here.
fitted_columns = ['HomePlanet','CryoSleep','Destination','Deck','Side','VIP']
fitted_categories = dict(zip(fitted_columns, encoder.categories_))
names_columns=['HomePlanet','Destination','Deck','Side','VIP']
Cryo_data_categorical=Cryo_data[names_columns]
encoder2 = OrdinalEncoder(categories=[fitted_categories[name] for name in names_columns])
encoder_train = pd.DataFrame(encoder2.fit_transform(Cryo_data_categorical),columns=names_columns)
# encoder_train has a fresh 0..n-1 index, so the codes are copied back positionally.
for m in names_columns:
    Cryo_data[m]=list(encoder_train[m])

In [None]:
# Predicted CryoSleep (as encoded class labels) for the rows where it was missing.
y_pred_Crio = clf.predict(Cryo_data)

In [None]:
# The classifier was trained on ordinal codes, so it returns 0.0 / 1.0 (False / True in the
# encoder's sorted category order). Convert them back to booleans before writing, so the
# CryoSleep column does not end up holding a mix of bools and floats.
Titanic_data.loc[Titanic_data['CryoSleep'].isnull(),'CryoSleep']=y_pred_Crio.astype(bool).tolist()

#### Predicting the value of VIP

In [None]:
# Train a random forest on the complete rows to predict VIP from the other columns,
# holding out 5% of the rows to report accuracy.
X_Home=With1.drop(['VIP'],axis=1)
y_Home=With1.VIP
X_train, X_test, y_train, y_test = train_test_split(X_Home,
                                                    y_Home,
                                                    test_size=0.05,
                                                    random_state=42)
# The forest is seeded like the split, so the imputed values are the same on every run.
clf=RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred)}')

In [None]:
# Rows whose VIP is missing, without the target column and without GroupNumber.
VIP_data=Titanic_data[Titanic_data['VIP'].isnull()].drop(['VIP','GroupNumber'],axis=1)

In [None]:
# Fill the remaining gaps in the feature columns so the classifier can score every row.
# One DataFrame.fillna call replaces the chained `VIP_data.col.fillna(..., inplace=True)` calls,
# which act on a temporary Series and leave VIP_data untouched under pandas Copy-on-Write.
# NOTE(review): 'B/0/P', 25 and 100 look like ad-hoc placeholder values — confirm.
VIP_data = VIP_data.fillna({
    'Cabin': 'B/0/P',
    'Destination': 'TRAPPIST-1e',
    'Age': 25,
    'RoomService': 100,
    'FoodCourt': 100,
    'ShoppingMall': 100,
    'Spa': 100,
    'VRDeck': 100,
})

In [None]:
# Replace Cabin by its first ('/'-separated) part as Deck and its third part as Side.
Cabin = VIP_data.Cabin.tolist()
Cabin_list = [cabin.split('/') for cabin in Cabin]
Deck = [cabin_parts[0] for cabin_parts in Cabin_list]
Side = [cabin_parts[2] for cabin_parts in Cabin_list]
VIP_data = VIP_data.assign(Deck=Deck, Side=Side).drop(columns=['Cabin'])

In [None]:
# Encode the categorical features with the categories learned by `encoder` on the complete
# rows, so each label gets the integer code the classifier was trained on. Fitting a fresh
# encoder on this subset would renumber the codes whenever a category does not occur here.
fitted_columns = ['HomePlanet','CryoSleep','Destination','Deck','Side','VIP']
fitted_categories = dict(zip(fitted_columns, encoder.categories_))
names_columns=['HomePlanet','Destination','Deck','Side','CryoSleep']
VIP_data_categorical=VIP_data[names_columns]
encoder3 = OrdinalEncoder(categories=[fitted_categories[name] for name in names_columns])
encoder_train = pd.DataFrame(encoder3.fit_transform(VIP_data_categorical),columns=names_columns)
# encoder_train has a fresh 0..n-1 index, so the codes are copied back positionally.
for m in names_columns:
    VIP_data[m]=list(encoder_train[m])

In [None]:
# Predict VIP for the rows where it is missing.
y_pred_VIP = clf.predict(VIP_data)
# The classifier returns the ordinal codes 0.0 / 1.0 (False / True in the encoder's sorted
# category order). Convert them back to booleans before writing, so the VIP column does not
# end up holding a mix of bools and floats.
Titanic_data.loc[Titanic_data['VIP'].isnull(),'VIP']=y_pred_VIP.astype(bool).tolist()

#### Predicting the value of Destination

In [None]:
# Train a random forest on the complete rows to predict Destination from the other columns,
# holding out 5% of the rows to report accuracy.
X_Home=With1.drop(['Destination'],axis=1)
y_Home=With1.Destination
X_train, X_test, y_train, y_test = train_test_split(X_Home,
                                                    y_Home,
                                                    test_size=0.05,
                                                    random_state=42)
# The forest is seeded like the split, so the imputed values are the same on every run.
clf=RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred)}')

In [None]:
# Rows whose Destination is missing, without the target column and without GroupNumber.
Destination_data=Titanic_data[Titanic_data['Destination'].isnull()].drop(['Destination','GroupNumber'],axis=1)

In [None]:
# Fill the remaining gaps in the feature columns so the classifier can score every row.
# One DataFrame.fillna call replaces the chained `Destination_data.col.fillna(..., inplace=True)`
# calls, which act on a temporary Series and leave the frame untouched under pandas Copy-on-Write.
# NOTE(review): 'B/0/P', 25 and 100 look like ad-hoc placeholder values — confirm.
Destination_data = Destination_data.fillna({
    'Cabin': 'B/0/P',
    'Age': 25,
    'RoomService': 100,
    'FoodCourt': 100,
    'ShoppingMall': 100,
    'Spa': 100,
    'VRDeck': 100,
})

In [None]:
# Replace Cabin by its first ('/'-separated) part as Deck and its third part as Side.
Cabin = Destination_data.Cabin.tolist()
Cabin_list = [cabin.split('/') for cabin in Cabin]
Deck = [cabin_parts[0] for cabin_parts in Cabin_list]
Side = [cabin_parts[2] for cabin_parts in Cabin_list]
Destination_data = Destination_data.assign(Deck=Deck, Side=Side).drop(columns=['Cabin'])

In [None]:
# Encode the categorical features with the categories learned by `encoder` on the complete
# rows, so each label gets the integer code the classifier was trained on. Fitting a fresh
# encoder on this subset would renumber the codes whenever a category does not occur here.
fitted_columns = ['HomePlanet','CryoSleep','Destination','Deck','Side','VIP']
fitted_categories = dict(zip(fitted_columns, encoder.categories_))
names_columns=['HomePlanet','VIP','Deck','Side','CryoSleep']
Destination_data_categorical=Destination_data[names_columns]
encoder4 = OrdinalEncoder(categories=[fitted_categories[name] for name in names_columns])
encoder_train = pd.DataFrame(encoder4.fit_transform(Destination_data_categorical),columns=names_columns)
# encoder_train has a fresh 0..n-1 index, so the codes are copied back positionally.
for m in names_columns:
    Destination_data[m]=list(encoder_train[m])

In [None]:
# Predict Destination for the rows where it is missing and write the predictions back.
# What is written is the classifier's raw output, i.e. the encoded class labels it was trained on.
y_pred_Destination = clf.predict(Destination_data)
Titanic_data.loc[Titanic_data['Destination'].isnull(),'Destination']=list(y_pred_Destination)

In [None]:
# Inspect which distinct values the Destination column now holds.
Titanic_data.Destination.unique()

In [None]:
# Turn the numeric codes in Destination back into destination names. The code -> name table
# is read from the fitted `encoder` (Destination is the third column it was fitted on) instead
# of being typed in by hand, so it cannot drift from the codes the encoder assigned.
destination_labels = {float(code): label for code, label in enumerate(encoder.categories_[2])}
Titanic_data = Titanic_data.replace({'Destination' : destination_labels})

Let's fill the missing Age values with the median, because the sample is roughly symmetric in this variable, and also use medians for the other quantitative variables, since the sample is not symmetric in them

In [None]:
# Fill the gaps in each quantitative column with that column's median.
# One DataFrame.fillna call replaces the chained `Titanic_data.col.fillna(..., inplace=True)`
# calls, which act on a temporary Series and leave Titanic_data untouched under pandas Copy-on-Write.
median_filled_columns = ['Age','VRDeck','Spa','ShoppingMall','FoodCourt','RoomService']
# median() returns a Series indexed by column name; fillna uses it column by column.
Titanic_data = Titanic_data.fillna(Titanic_data[median_filled_columns].median())

#### Delete variables Cabin, Group Number

In [None]:
# Cabin is removed from the combined table.
Titanic_data = Titanic_data.drop(columns=['Cabin'])

In [None]:
# GroupNumber is removed from the combined table.
Titanic_data = Titanic_data.drop(columns=['GroupNumber'])

In [None]:
# Missing-value count per column after imputation.
Titanic_data.isnull().sum()

In the general table, we encode the variables HomePlanet, CryoSleep, Destination,VIP

In [None]:
# Ordinal-encode the four categorical columns of the combined table.
names_columns = ['HomePlanet', 'VIP', 'CryoSleep', 'Destination']
Titanic_data_categorical = Titanic_data[names_columns]
encoder4 = OrdinalEncoder()
encoder_train = pd.DataFrame(
    data=encoder4.fit_transform(Titanic_data_categorical),
    columns=['HomePlanet', 'VIP', 'CryoSleep', 'Destination'],
)
# encoder_train has a fresh 0..n-1 index, so the codes are copied back positionally.
for m in names_columns:
    Titanic_data[m] = encoder_train[m].tolist()

Transforming Categorical Variables

In [None]:
# One indicator column per HomePlanet code, joined on PassengerId; the coded column is then dropped.
HomePlanet = pd.get_dummies(Titanic_data['HomePlanet']).add_prefix('HomePlanet')
Titanic_data = Titanic_data.merge(HomePlanet, on='PassengerId').drop(columns=['HomePlanet'])

In [None]:
# One indicator column per CryoSleep code, joined on PassengerId; the coded column is then dropped.
CryoSleep = pd.get_dummies(Titanic_data['CryoSleep']).add_prefix('CryoSleep')
Titanic_data = Titanic_data.merge(CryoSleep, on='PassengerId').drop(columns=['CryoSleep'])

In [None]:
# One indicator column per Destination code, joined on PassengerId; the coded column is then dropped.
Destination = pd.get_dummies(Titanic_data['Destination']).add_prefix('Destination')
Titanic_data = Titanic_data.merge(Destination, on='PassengerId').drop(columns=['Destination'])

Normalizing the general dataset

In [None]:
# Min-max scale every column of the combined table.
# The scaled frame gets a fresh 0..n-1 index; PassengerId is not carried over.
scaler = preprocessing.MinMaxScaler()
names = Titanic_data.columns
d = scaler.fit(Titanic_data).transform(Titanic_data)

scaled_df = pd.DataFrame(data=d, columns=names)
scaled_df.head(5)

Divide the population into training and test samples

In [None]:
# Undo the earlier concatenation: the train rows were stacked first, so the first
# len(Titanic_data_train) rows are the training set and the rest is the test set.
# Taking the length from the frame avoids hard-coding the row count (8693).
n_train_rows = len(Titanic_data_train)
# Copies, so that adding the target column to `train` later does not write into a slice of scaled_df.
train=scaled_df.iloc[:n_train_rows].copy()
Validation=scaled_df.iloc[n_train_rows:].copy()

In [None]:
# (rows, columns) of the scaled training part.
train.shape

In [None]:
# (rows, columns) of the scaled part that will be scored for the submission.
Validation.shape

In [None]:
# Re-read the raw training file; the next cell takes the Transported column from it.
Titanic_data_train=pd.read_csv('../input/spaceship-titanic/train.csv')

In [None]:
# Attach the target as 0/1 integers; copied positionally because `train` has a 0..n-1 index.
transported_as_int = Titanic_data_train['Transported'].map(int)
train['Transported'] = list(transported_as_int)

In [None]:
# Target vector and feature matrix for the final models.
y = train['Transported']
X = train.drop(columns=['Transported'])

In [None]:
# Hold out 33% of the labelled rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33,random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score

In [None]:
# Baseline comparison of four classifiers on the hold-out set.
# The random forest is seeded so the reported scores are the same on every run.
models = [RandomForestClassifier(random_state=42), KNeighborsClassifier(), SVC(), LogisticRegression()]
scores = dict()

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)

    print(f'model: {str(m)}')
    print(f'Accuracy_score: {accuracy_score(y_test,y_pred)}')
    # Label spelling corrected (was "Precission_score").
    print(f'Precision_score: {precision_score(y_test,y_pred)}')
    print(f'Recall_score: {recall_score(y_test,y_pred)}')
    print(f'F1-score: {f1_score(y_test,y_pred)}')
    print('-'*30, '\n')

In [None]:
# 5-fold grid search over the iteration limit of a logistic regression;
# the best estimator is then scored on the hold-out set.
clf = LogisticRegression()
parametres = {
    'max_iter': [100, 200, 400, 800],
    'n_jobs': [-1],
}
grid_search_cv_clf = GridSearchCV(estimator=clf, param_grid=parametres, cv=5)
grid_search_cv_clf.fit(X_train, y_train)

best_clf1 = grid_search_cv_clf.best_estimator_
y_pred1 = best_clf1.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred1)
print(f'Accuracy_score: {logreg_accuracy}')

In [None]:
# 5-fold grid search over the size and depth settings of a random forest;
# the best estimator is then scored on the hold-out set.
# The forest is seeded so the selected hyper-parameters and the score are the same on every run.
clf=RandomForestClassifier(random_state=42)
parametres={'n_estimators':[10,20,30],'max_depth':[2,5,7,10],'min_samples_split':[2,4,8],'min_samples_leaf':[1,2,4,8]}
grid_search_cv_clf=GridSearchCV(clf,parametres,cv=5)
grid_search_cv_clf.fit(X_train,y_train)
best_clf2=grid_search_cv_clf.best_estimator_
y_pred2=best_clf2.predict(X_test)
print(f'Accuracy_score: {accuracy_score(y_test,y_pred2)}')

In [None]:
from sklearn.metrics import roc_curve, auc
# A ROC curve needs a continuous score per sample. Hard 0/1 predictions give a single
# operating point and an understated AUC, so the predicted probability of class 1 is used
# (second column of predict_proba; the labels are the integers 0 and 1).
y_score2 = best_clf2.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score2)
roc_auc= auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange',
          label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Class probabilities for the unlabelled rows from both tuned models.
y_predicted_prob1=best_clf1.predict_proba(Validation)
y_predicted_prob2=best_clf2.predict_proba(Validation)

In [None]:
# Simple ensemble: element-wise average of the two models' probabilities.
summ=(y_predicted_prob1+y_predicted_prob2)/2

In [None]:
# First column of the averaged probabilities, one value per row.
class_Survived = list(summ[:, 0])

In [None]:
# Label 1 when the value is below 0.5, otherwise 0 (a value of exactly 0.5 gives 0).
Pred_survived = [1 if first_class_prob < 0.5 else 0 for first_class_prob in class_Survived]

In [None]:

# Re-read the raw test file; its PassengerId column is used for the submission.
Tit=pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# Put each PassengerId next to its predicted label converted to bool.
# The prediction Series has no name, so its column is labelled 0 until it is renamed.
submissions = pd.concat([Tit.PassengerId,pd.Series(Pred_survived).map(bool)],axis=1)

In [None]:
# Give the prediction column (labelled 0) the name 'Transported'.
submissions=submissions.rename(columns={0:'Transported'})

In [None]:
# Write the submission file without the row index.
submissions.to_csv('submissionnew8.csv',index=False)