# **Imputer Data**

In [None]:
import numpy as np 
import pandas as pd 

# Load the Spaceship Titanic competition files from the input directory.
# `train` is the training table; its shape is printed as a quick sanity check.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
print(train.shape)
# `test` is the table to predict on; its shape is printed for comparison with train.
test = pd.read_csv('../input/spaceship-titanic/test.csv')
print(test.shape)
# `sub` is the sample submission; its 'Transported' column is overwritten with
# the model predictions in the last cell of the notebook.
sub = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

# **Preprocessing**

# 1. **Check feature types and missing values**

In [None]:
# Preview the first rows of the training table to inspect its columns and values.
train.head()

In [None]:
# Preview the first rows of the test table to inspect its columns and values.
test.head()

In [None]:
def _missing_counts(frame):
    """Return the number of missing values per column, largest count first."""
    return frame.isnull().sum().sort_values(ascending=False)


# Per-column missing-value counts for the training and test tables.
miss_train = _missing_counts(train)
miss_test = _missing_counts(test)
print(miss_train, '\n')
print(miss_test)

# Column names ordered by missing count; the imputation cell below uses them
# as its column lists.
m1, m2 = miss_train.index, miss_test.index
print(m1, '\n')
print(m2)


# 2. **Imputation of missing values**

In [None]:
# Fill missing values with each column's most frequent value.
# Both imputers are fitted on the TRAINING table only, so the test table is
# filled with statistics learned from train instead of from the test set itself
# (keeps train and test preprocessing consistent).
from sklearn.impute import SimpleImputer

features_cols_train = list(m1)
features_cols_test = list(m2)

imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(train[features_cols_train])

# A second imputer is needed because the test column list differs from the
# train one (different order / different set of columns); it is still fitted
# on train. Assumes every test column also exists in train.
imputer2 = SimpleImputer(strategy='most_frequent')
imputer2.fit(train[features_cols_test])

train[features_cols_train] = imputer.transform(train[features_cols_train])
test[features_cols_test] = imputer2.transform(test[features_cols_test])

# Verify that no missing values remain in either table (all counts should be 0).
suc_val_train = train.isnull().sum()
suc_val_test = test.isnull().sum()
print(suc_val_train,'\n')
print(suc_val_test)

# 3. LabelEncoder

In [None]:
# Prepare the string features for encoding.
# PassengerId and Name are dropped because they are not easy to convert into
# numeric features.
from sklearn.preprocessing import LabelEncoder
train1 = train.drop(["PassengerId","Name"] , axis = 1)
test1 = test.drop(["PassengerId","Name"] , axis = 1)

# According to the Kaggle data description, Cabin packs three pieces of
# information separated by '/', so split it into deck / num / side and drop
# the original column.
train1[['deck', 'num', 'side']] = train1['Cabin'].str.split('/', expand=True)
test1[['deck', 'num', 'side']] = test1['Cabin'].str.split('/', expand=True)
train1 = train1.drop('Cabin' , axis = 1)
test1 = test1.drop('Cabin' , axis = 1)

# The cabin number is kept as a real number instead of being label-encoded:
# label-encoding its string form would order it lexicographically
# ('10' < '2') and give train and test unrelated codes.
# Assumes `num` always holds digits; pd.to_numeric raises if it does not.
train1['num'] = pd.to_numeric(train1['num'])
test1['num'] = pd.to_numeric(test1['num'])

# Remaining categorical columns that are label-encoded below.
label_cols = ["HomePlanet","CryoSleep","Destination","VIP","deck","side"]
def label(train,test,columns):
    """Label-encode `columns` in both frames using one shared mapping per column.

    Each listed column is cast to ``str`` and replaced by integer codes. The
    encoder is fitted on the combined train and test values so that the same
    category always receives the same code in both frames (fitting two
    separate encoders can map one category to different integers).

    Args:
        train: training DataFrame; modified in place.
        test: test DataFrame; modified in place.
        columns: iterable of column names present in both frames.

    Returns:
        The same ``(train, test)`` objects, with the listed columns encoded.
    """
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        # One vocabulary per column, built from every value seen in either frame.
        encoder = LabelEncoder().fit(pd.concat([train[col], test[col]], ignore_index=True))
        train[col] = encoder.transform(train[col])
        test[col] = encoder.transform(test[col])
    return train, test

# train2 / test2 are the same objects as train1 / test1 (encoded in place).
train2 ,test2 = label(train1,test1 ,label_cols)

train2.head()

In [None]:
# Preview the encoded test table to check it matches the layout of train2.
test2.head()

# 4. Outlier removal for expenditure features

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Scatter each expenditure column against the row position to eyeball extreme values.
xcols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

for i, col in enumerate(xcols):
    plt.figure(i)
    plt.scatter(np.arange(len(train2)), train2[col])
    plt.xlabel('count')
    plt.ylabel('expend')
    plt.title(col)
    # Must be a call: a bare `plt.show` only references the function and does nothing.
    plt.show()

In [None]:
# Cast every column of the encoded training table to float so the frame is fully numeric.
df = train2.astype(float)
# NOTE(review): this result is discarded; only the last expression of a cell is displayed.
df.head()
# Summary statistics for all columns (this is the cell's displayed output).
df.describe(include='all')

In [None]:
# Outlier handling for the expenditure columns, in two independent steps:
#   1. report how many rows lie inside per-column IQR fences (informational
#      only; `df` itself is NOT filtered);
#   2. replace the single largest value of each column with that column's median.
xcols = ["RoomService","ShoppingMall","Spa","VRDeck"]

print ("Shape Of The Before Ouliers: ",df[xcols].shape)
n=0.8
# Quartiles are computed per column. (np.percentile on the whole frame would
# pool all four columns into one number.)
q1 = df[xcols].quantile(0.25)
q3 = df[xcols].quantile(0.75)
# IQR = Q3-Q1
IQR = q3 - q1
# Fences: [Q1 - n*IQR, Q3 + n*IQR], one pair per column.
lower = q1 - n*IQR
upper = q3 + n*IQR
# Keep only rows that are inside the fences in every listed column, so the
# printed shape really reflects the number of rows removed.
inside = (df[xcols].ge(lower, axis=1) & df[xcols].le(upper, axis=1)).all(axis=1)
transform_data = df.loc[inside, xcols]
print ("Shape Of The After Ouliers: ",transform_data.shape)

# Cap the most extreme point of each column: every row equal to the column
# maximum is set to the column median.
# NOTE: this mutates `df` in place, so re-running the cell caps the
# next-largest value as well.
for i in xcols:
    median = df[i].median()
    df.loc[df[i] == df[i].max(), i] = median
df.describe(include='all')

In [None]:
# `train3` is another name for `df` (no copy is made), so later changes to
# train3 are also visible through df.
train3 = df
train3.info()
# Re-draw the scatter plots for the columns left in `xcols` by the previous
# cell to check the effect of capping the maxima.
for i, col in enumerate(xcols):
    plt.figure(i)
    plt.scatter(np.arange(len(train3)), train3[col])
    plt.xlabel('count')
    plt.ylabel('expend')
    plt.title(col)
    # Must be a call: a bare `plt.show` only references the function and does nothing.
    plt.show()

# **5. Normally distributed observations**

In [None]:
import seaborn as sns
# Plot the distribution of every column of train3, one facet per column.
xcols = list(train3.columns)
print(xcols)

# Long format: one row per (column name, value) pair, which is what FacetGrid
# needs to draw one panel per original column.
xcols_features = pd.melt(train3, value_vars=xcols)
print(xcols_features)
g = sns.FacetGrid(xcols_features, col="variable",  col_wrap=5, sharex=False, sharey=False)
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot
# (normalised histogram plus KDE curve).
g = g.map(sns.histplot, "value", kde=True, stat="density")
# Figure-level title; plt.title would overwrite the title of the last facet only.
plt.suptitle("feature_all_scatter", y=1.02)
plt.show()

In [None]:
# Log-transform the expenditure columns (log1p keeps zeros at zero) in both
# tables and plot the resulting training distributions.
xcols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
print(xcols)
test3 = test2.astype(float)
# NOTE: train3 is transformed in place while test3 is rebuilt from test2 on
# every run, so this cell must be executed exactly once per kernel session;
# running it again would apply log1p to train3 a second time.
for i in xcols:
    train3[i] = np.log1p(train3[i])
    test3[i] = np.log1p(test3[i])
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
    sns.histplot(train3[i], kde=True, stat="density")
    plt.tight_layout()
    plt.show()
train3.head()

# **Training (CatBoost)**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score

# Fixed seed so the train/validation split (and every score derived from it)
# is the same on each run of the notebook.
SPLIT_SEED = 42

# Features are every column except the target; the target is cast to int (0/1).
X = train3.drop('Transported' , axis =1 )
y = train3['Transported'].astype(int)
X = X.astype('float32')

# Standardise features with statistics from the training table and apply the
# same transform to the test table. test3 must have the same columns, in the
# same order, as X.
# NOTE(review): the scaler is fitted before the split below, so it also sees
# the rows later held out for validation.
scaler = StandardScaler().fit(X)
X_train_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(test3)

# Hold out 10% of the rows for validation; stratify keeps the class ratio of
# `y` the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_scaled, y, test_size=0.1, random_state=SPLIT_SEED, stratify=y)

In [None]:
from catboost import CatBoostClassifier

# Binary classifier trained with log-loss; F1 on the validation set is the
# reported metric, logged every 1000 iterations.
model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='F1',
    iterations=10000,
    verbose=1000,
)

# The validation split is passed as eval_set; use_best_model=True asks
# CatBoost to keep the best iteration found on it.
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

# Training diagnostics: best iteration, seed and learning rate actually used.
for diagnostic in (model.get_best_iteration(), model.random_seed_, model.learning_rate_):
    print(diagnostic)

# Kept for inspection: per-feature importances and the feature names seen by the model.
f1 = model.feature_importances_
f_names = model.feature_names_

# Predict the test set and convert the predicted labels to booleans for the submission.
ans = model.predict(X_test_scaled).astype(bool)
print(ans)

In [None]:
import shap
import matplotlib
import matplotlib.pyplot as plt

# Explain the fitted model: SHAP summary on the test features, then a bar
# chart of CatBoost's own feature importances.
print('Plot feature importances...')
# Feature names are passed explicitly because the scaled inputs carry no column labels.
explainer = shap.TreeExplainer(model,feature_names=X.columns)
shap_values = explainer(X_test_scaled)
shap.summary_plot(shap_values, X_test_scaled)


f1 = model.feature_importances_

plt.figure(figsize=(15, 10))
plt.bar(X.columns,f1)
plt.xlabel("feature", fontsize=20)
plt.title("feature coeff", fontsize=25)
# Must be a call: a bare `plt.show` only references the function and does nothing.
plt.show()

In [None]:
# Put the predictions into the sample-submission frame and write it to
# 'submission.csv' without the index column.
# NOTE(review): assumes the rows of `sub` are in the same order as the test
# rows that produced `ans`; confirm against the competition files.
sub['Transported'] = ans
sub.to_csv('submission.csv', index = False)