In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


import warnings
warnings.filterwarnings('ignore')

import seaborn as sns

# ① Read the training data

In [None]:
# Load the training CSV, print its (rows, columns) shape, and preview the first rows.
train = pd.read_csv("../input/spaceship-titanic/train.csv")
print(train.shape)
train.head()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

# ② Preprocess the training data (if necessary)

In [None]:
# Missing values: number of NaN entries per column
train.isna().sum()

In [None]:
# Duplicates: number of fully duplicated rows
train.duplicated().sum()

In [None]:
# Target distribution: relative frequency of each Transported value
true = train['Transported'].value_counts(normalize=True)
print(f'Transported : \n {true}')

In [None]:
# object_columns = oc
# numeric_columns = nc

# Preprocessing helper: build one list of column names per column type.
def group(data, target, exclude=None):
    """Split the columns of ``data`` into object-typed and numeric lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are classified by dtype.
    target : str
        Name of the label column; it is left out of ``feature_cols``.
    exclude : list of str, optional
        Further column names to leave out of ``feature_cols``.
        ``None`` (the default) means no extra exclusions.

    Returns
    -------
    ocs : list
        Columns whose dtype is ``"object"``.
    ncs : list
        Columns whose dtype is ``"float64"`` or ``"int64"``.
    feature_cols : list
        ``ocs + ncs`` without ``target`` and without anything in ``exclude``.
    """
    if exclude is None:
        exclude = []

    # object columns
    ocs = [a for a in data.columns if data[a].dtype == "object"]

    # numerical columns -- each dtype must be compared explicitly:
    # `dtype == "float64" or "int64"` is always truthy because the bare
    # string "int64" is truthy, which put every column into this list.
    ncs = [
        a for a in data.columns
        if data[a].dtype == "float64" or data[a].dtype == "int64"
    ]

    # feature columns: every classified column except the target/excluded ones
    exc_cols = list(exclude) + [target]
    feature_cols = [col for col in ocs + ncs if col not in exc_cols]

    return ocs, ncs, feature_cols

In [None]:
# Name of the label column, then the per-type column lists for the raw training frame.
target="Transported"
ocs, ncs, feature_cols = group(train,target)

**1. ocs 처리**

In [None]:
# Five random rows restricted to the columns listed in ocs.
train[ocs].sample(5)

In [None]:
# For every column name (oc) in ocs, show its distinct values and the count of each value
for oc in ocs:
    column = train[oc]
    train_unique, val_counts = column.unique(), column.value_counts()

    print(f'{oc} : {train_unique}')
    print(f'length: {len(val_counts)}, {val_counts} \n')

In [None]:
# Preprocessing (missing values are replaced with the most frequent value)
mode_fill = {
    'HomePlanet': 'Earth',
    'Destination': 'TRAPPIST-1e',
    'CryoSleep': False,
    'VIP': False,
}
for col, value in mode_fill.items():
    train[col] = train[col].fillna(value)

In [None]:
# Re-check dtypes and non-null counts of the ocs columns after imputation.
train[ocs].info()

In [None]:
# Convert the CryoSleep and VIP columns to binary (0/1) integers
for flag in ('CryoSleep', 'VIP'):
    train[flag] = train[flag].astype(int)

**2. ncs 처리**

In [None]:
# Dtypes and non-null counts of the columns listed in ncs.
train[ncs].info()

In [None]:
# Fill missing values of each numeric column with that column's mean.
mean_filled = ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
for col in mean_filled:
    col_mean = train[col].mean()
    train[col] = train[col].fillna(col_mean)

In [None]:
# Remaining NaN count per column in ncs.
train[ncs].isna().sum()

In [None]:
# Overall dtypes and non-null counts after the imputation steps so far.
train.info()

**3. 나머지 결측치 처리**

In [None]:
# Split Cabin on '/' into three new columns: Grade, Num and PS
cabin_parts = train['Cabin'].str.split('/', expand=True)
train[['Grade','Num','PS']] = cabin_parts

train[['Grade','Num','PS']].info()

In [None]:
# Rebuild the per-type column lists, leaving Cabin out of the feature columns
exclude = ["Cabin"]
ocs, ncs, feature_cols = group(train, target, exclude)

# For every column name (oc) in ocs, show its distinct values and the count of each value
for oc in ocs:
    column = train[oc]
    train_unique, val_counts = column.unique(), column.value_counts()

    print(f'{oc} : {train_unique}')
    print(f'length: {len(val_counts)} , {val_counts} \n')

In [None]:
# Preprocessing: Grade and PS are filled with their most frequent value;
# Num is cast to float first and then filled with its mean.
for col, value in (('Grade', 'F'), ('PS', 'S')):
    train[col] = train[col].fillna(value)

num_as_float = train['Num'].astype(float)
train['Num'] = num_as_float.fillna(num_as_float.mean())

In [None]:
# Split PassengerId on '_' into two integer columns.
# NOTE(review): the part before '_' is stored as 'ID' and the part after it as
# 'Group' -- confirm these names match the dataset description.
id_parts = train['PassengerId'].str.split('_', expand=True)

train['ID'] = id_parts[0].astype(int)
train['Group'] = id_parts[1].astype(int)

In [None]:
# Remove columns that are no longer needed
train = train.drop(columns=['Cabin', 'Name'])

In [None]:
# One-hot encode these categorical attributes into binary indicator columns
dum = ['HomePlanet', 'Destination', 'PS']
train = pd.get_dummies(data=train, columns=dum)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn an integer code for every Grade value, then store the codes in Grade_enc.
le = LabelEncoder()
le.fit(train['Grade'])

train['Grade_enc'] = le.transform(train['Grade'])

In [None]:
# Display the frame with the new Grade_enc column.
train

In [None]:
# The raw Grade letters are replaced by Grade_enc, so drop the original column.
train = train.drop(columns='Grade')

In [None]:
# Display the fully preprocessed training frame.
train

# ③ Build any classifier you want

**-logistic regression**

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# feature selection
# `train` still contains the string column PassengerId. Older pandas silently
# dropped non-numeric columns in DataFrame.corr(); since pandas 2.0 the default
# changed (numeric_only=False), so the numeric/boolean columns are selected
# explicitly to get the same correlation matrix on every pandas version.
numeric_train = train.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(20,20))
sns.heatmap(numeric_train.corr(),annot=True)
plt.xticks(rotation=60)
plt.show()

=> 몇몇개의 음의 상관관계를 가지는 변수가 보이지만 전부 feature로 삼음.

In [None]:
# Label vector and feature matrix (every column except the label).
# NOTE(review): X still includes the string PassengerId column -- confirm it is
# meant to be used as a model feature.
y = train['Transported']
X = train.drop(columns="Transported")

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

X_train.shape, X_test.shape

In [None]:
# Fit a logistic regression on the training split and score it on the held-out split.
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)

model_pred = model.predict(X_test)

accuracy_score(model_pred, y_test)

In [None]:
# Compare accuracy on the training split with accuracy on the held-out split.
train_acc = model.score(X_train, y_train)
holdout_acc = accuracy_score(model_pred, y_test)
print("performance of train data on model:", train_acc)
print("performance of train_test data on model:", holdout_acc)

# ④ Read the test data

In [None]:
# Load the test CSV, print its (rows, columns) shape, and preview the first rows.
test=pd.read_csv("../input/spaceship-titanic/test.csv")
print(test.shape)
test.head()

In [None]:
# Apply the same imputation steps to the test frame as were applied to train.
for col, value in (('HomePlanet', 'Earth'), ('Destination', 'TRAPPIST-1e')):
    test[col] = test[col].fillna(value)

# Boolean flags: missing -> False, then cast to 0/1 integers.
for flag in ('CryoSleep', 'VIP'):
    test[flag] = test[flag].fillna(False).astype(int)

# NOTE(review): the means below are computed from the test set itself rather
# than reused from the training set -- confirm this is intended.
for col in ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    test[col] = test[col].fillna(test[col].mean())

In [None]:
# Split Cabin on '/' into Grade, Num and PS, then impute the new columns.
cabin_parts = test['Cabin'].str.split('/', expand=True)
test[['Grade','Num','PS']] = cabin_parts

for col, value in (('Grade', 'F'), ('PS', 'S')):
    test[col] = test[col].fillna(value)

# Num is cast to float first so its mean can be used as the fill value.
num_as_float = test['Num'].astype(float)
test['Num'] = num_as_float.fillna(num_as_float.mean())

In [None]:
# Split PassengerId on '_' into two integer columns (ID first, Group second).
id_parts = test['PassengerId'].str.split('_', expand=True)

test['ID'] = id_parts[0].astype(int)
test['Group'] = id_parts[1].astype(int)

In [None]:
# Remove columns that are no longer needed.
test = test.drop(columns=['Cabin', 'Name'])

In [None]:
# One-hot encode the same categorical attributes that were encoded for train.
dum = ['HomePlanet', 'Destination', 'PS']
test = pd.get_dummies(data=test, columns=dum)

In [None]:
# Encode Grade with the encoder `le` that was already fitted on the training
# data. Fitting a fresh LabelEncoder on the test set would derive the codes from
# whichever Grade values happen to occur in test, so the same letter could map
# to a different integer than the one the model was trained on.
# transform() raises ValueError if test contains a Grade value never seen in
# train, which is preferable to silently mis-encoding it.
test['Grade_enc'] = le.transform(test['Grade'])

test.drop('Grade',axis=1,inplace=True)

In [None]:
# Display the preprocessed test frame.
test

# ⑤ Output a prediction file for the test data

In [None]:
# Refit the classifier on the training split (same settings as the earlier fit).
model = LogisticRegression(random_state=0).fit(X_train,y_train)

# The test frame was one-hot encoded separately from train, so its dummy
# columns are not guaranteed to match the training columns in name or order.
# Align explicitly to the columns the model was fitted on; an indicator column
# absent from test is filled with 0.
test_features = test.reindex(columns=X_train.columns, fill_value=0)

test_pred = model.predict(test_features)

In [None]:
# Two-column submission frame: passenger identifier and predicted label.
pred_df = pd.DataFrame({'PassengerId':test['PassengerId'],'Transported':test_pred})

In [None]:
# Display the submission frame.
pred_df

In [None]:
# Write the predictions to a CSV file in the working directory, without the index column.
pred_df.to_csv('./1913397.csv', index=False)