In [1]:
# 회귀모형 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

# 쓸데없는 알림 방지
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import itertools

#통계적 모형
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor 

#머신러닝
from sklearn import datasets
from sklearn import metrics

from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error #연속형일때 사용하는 경우 
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix #범주형(분류모델)

In [2]:
titanic_train= pd.read_csv('C:/Users/scien/Videos/titanic/titanic_train.csv')
titanic_test = pd.read_csv('C:/Users/scien/Videos/titanic/titanic_test.csv')
titanic_original = pd.read_csv('C:/Users/scien/Videos/titanic/titanic_original.csv')

In [3]:
titanic_test.Name.unique().size

418

In [4]:
titanic_original.name.unique().size

1307

In [5]:
titanic_original.info()

# 이름의 유니크는 1307개 이나 네임컬럼은 1309개 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [6]:
# 따라서 오리지널 이름에 중복되는 내용을 확인하는 duplicated 사용 keep=False로 해줘야
# 중복된 모든 값을 보여줌 
titanic_original.loc[titanic_original.duplicated(['name'], keep=False)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
725,3,1,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,?,Q,13,?,Ireland
726,3,0,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,?,Q,?,?,Ireland
924,3,0,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,?,Q,?,70,?
925,3,0,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,?,S,?,?,?


테스트 셋은 생존 여부가 없고  
오리지널 셋은 페신저 아이디가 없다. (=> 적합해주기 위해서 테스셋의 이름 필드를 기준으로 합치고자 하는데 다행히도 테스트셋의 이름은 겹치는 것이 없다.)

테스트와 트레인셋 각각 중복되는 이름이 따로 따로 저장되어 있다. 

In [7]:
titanic_train.loc[titanic_train['Name'] == 'Connolly, Miss. Kate'][['Name','Age']]

Unnamed: 0,Name,Age
289,"Connolly, Miss. Kate",22.0


In [8]:
titanic_train.loc[titanic_train['Name'] == 'Kelly, Mr. James']['Name']

696    Kelly, Mr. James
Name: Name, dtype: object

In [9]:
titanic_test.loc[titanic_test['Name'] == 'Connolly, Miss. Kate']['Name']

6    Connolly, Miss. Kate
Name: Name, dtype: object

In [10]:
titanic_test.loc[titanic_test['Name'] == 'Kelly, Mr. James']['Name']

0    Kelly, Mr. James
Name: Name, dtype: object

In [11]:
titanic_original.loc[titanic_original.duplicated(['name'], keep='last')]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
725,3,1,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,?,Q,13,?,Ireland
924,3,0,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,?,Q,?,70,?


중복값 쉽게 삭제 drop_duplicates()

그러나 지우고 싶은 데이터를 쉽게 지울 수 없으니 직접 지운다.

In [12]:
drop_idx  = titanic_original.loc[(titanic_original['name'] == 'Connolly, Miss. Kate') & (titanic_original['age'] == '22')].index
titanic_original.drop(drop_idx, inplace=True)

In [13]:
drop_idx  = titanic_original.loc[(titanic_original['name'] == 'Kelly, Mr. James') & (titanic_original['age'] == '44')].index
titanic_original.drop(drop_idx, inplace=True)

# 이름이 ")"가 들어 있어서 수정이 필요함. 
- 아무튼 현재 해결이 불가능하기 때문에 (수작업으로 진행해야함) 

In [14]:
# rename 되었기 때문에 오류가 발생하는 것 무시해
titanic_survived = titanic_original[['survived', 'name']]

In [15]:
titanic_survived.rename(columns = {'survived' : 'Survived', 'name' : 'Name'}, inplace = True)

In [24]:
titanic_sample = pd.merge(left=titanic_test, right=titanic_survived, on='Name')

In [26]:
titanic_sample

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
389,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
390,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
391,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
392,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [25]:
titanic_sample.loc[titanic_sample['Survived'].isna()][['Name']]

Unnamed: 0,Name


In [22]:
titanic_original.loc[titanic_original['name'].str.startswith('Assaf Khalil')]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
647,3,1,"Assaf Khalil, Mrs. Mariana ('Miriam')",female,45,0,0,2696,7.225,?,C,C,?,"Ottawa, ON"


In [21]:
titanic_test.loc[titanic_test['Name'].str.startswith('Assaf Khalil')]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
19,911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45.0,0,0,2696,7.225,,C


In [27]:
titanic_sample = pd.merge(left=titanic_test, right=titanic_original, on='Name')

KeyError: 'Name'

In [28]:
titanic_sample
# 결측치가 존재 

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,...,34.5,0,0,330911,7.8292,?,Q,?,70,?
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,...,47,1,0,363272,7,?,S,?,?,?
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,...,62,0,0,240276,9.6875,?,Q,?,?,"Cambridge, MA"
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,...,27,0,0,315154,8.6625,?,S,?,131,?
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,...,22,1,1,3101298,12.2875,?,S,15,?,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,...,?,0,0,A.5. 3236,8.05,?,S,?,?,?
390,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,...,39,0,0,PC 17758,108.9,C105,C,8,?,?
391,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,...,38.5,0,0,SOTON/O.Q. 3101262,7.25,?,S,?,32,?
392,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,...,?,0,0,359309,8.05,?,S,?,?,?


In [28]:
titanic_raw_data = pd.concat([titanic_train, titanic_sample], axis=0)

In [30]:
titanic_raw_data.to_csv('C:/Users/scien/Videos/titanic/titanic_full_data2.csv', index=False)

In [37]:
titanic_raw_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
389,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
390,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
391,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
392,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
titanic_raw_data = pd.read_csv('C:/Users/scien/Videos/titanic/titanic_raw_data.csv')