In [1]:
# 회귀모형 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

# 쓸데없는 알림 방지
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import itertools

#통계적 모형
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor 

#머신러닝
from sklearn import datasets
from sklearn import metrics

from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error #연속형일때 사용하는 경우 
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix #범주형(분류모델)

In [2]:
# Directory holding the Titanic CSV files.
# NOTE: this is a machine-specific absolute path; change it here (one place)
# when running the notebook on another machine.
TITANIC_DIR = 'C:/Users/scien/Videos/titanic'

# Load the train split, the test split and the original data set.
titanic_train = pd.read_csv(f'{TITANIC_DIR}/titanic_train.csv')
titanic_test = pd.read_csv(f'{TITANIC_DIR}/titanic_test.csv')
titanic_original = pd.read_csv(f'{TITANIC_DIR}/titanic_original.csv')

In [3]:
# Number of distinct passenger names in the test split.
len(titanic_test['Name'].unique())

418

In [4]:
# Number of distinct passenger names in the original data set.
len(titanic_original['name'].unique())

1307

In [5]:
# Summarise the column dtypes and non-null counts of the original data set.
titanic_original.info()

# `name` has 1307 unique values, but the column holds 1309 rows.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [6]:
# Look for repeated names in the original data with duplicated().
# keep=False flags every occurrence of a repeated name, so all of the
# duplicated rows are shown rather than all but one.
titanic_original[titanic_original.duplicated(subset='name', keep=False)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
725,3,1,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,?,Q,13,?,Ireland
726,3,0,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,?,Q,?,?,Ireland
924,3,0,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,?,Q,?,70,?
925,3,0,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,?,S,?,?,?


테스트 셋은 생존 여부가 없고  
오리지널 셋은 패신저 아이디(PassengerId)가 없다. (=> 두 데이터를 맞춰 주기 위해서 테스트셋의 이름 필드를 기준으로 합치고자 하는데, 다행히도 테스트셋의 이름은 겹치는 것이 없다.)

중복되는 두 이름은 트레인셋과 테스트셋에 각각 한 건씩 따로 저장되어 있다.

In [7]:
# Find "Connolly, Miss. Kate" in the train split and show her name and age.
titanic_train.loc[titanic_train['Name'] == 'Connolly, Miss. Kate', ['Name', 'Age']]

Unnamed: 0,Name,Age
289,"Connolly, Miss. Kate",22.0


In [8]:
# Find "Kelly, Mr. James" in the train split.
titanic_train.loc[titanic_train['Name'] == 'Kelly, Mr. James', 'Name']

696    Kelly, Mr. James
Name: Name, dtype: object

In [9]:
# Find "Connolly, Miss. Kate" in the test split.
titanic_test.loc[titanic_test['Name'] == 'Connolly, Miss. Kate', 'Name']

6    Connolly, Miss. Kate
Name: Name, dtype: object

In [10]:
# Find "Kelly, Mr. James" in the test split.
titanic_test.loc[titanic_test['Name'] == 'Kelly, Mr. James', 'Name']

0    Kelly, Mr. James
Name: Name, dtype: object

In [11]:
# keep='last' flags every occurrence of a repeated name except the final one.
titanic_original[titanic_original.duplicated(subset='name', keep='last')]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
725,3,1,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,?,Q,13,?,Ireland
924,3,0,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,?,Q,?,70,?


중복값은 `drop_duplicates()`로 쉽게 삭제할 수 있다.

그러나 `drop_duplicates()`로는 지우고 싶은 행을 직접 고를 수 없으니, 조건으로 해당 행을 찾아 직접 지운다.

In [12]:
# Drop the 22-year-old "Connolly, Miss. Kate" row, leaving only her
# 30-year-old namesake in the original data.
# `age` is an object column (text, with '?' in some rows), so it is compared
# numerically: a literal == '22' silently matches nothing when the value is
# stored as '22.0' or as a float. Unparseable entries become NaN and never match.
drop_idx = titanic_original.loc[
    (titanic_original['name'] == 'Connolly, Miss. Kate')
    & (pd.to_numeric(titanic_original['age'], errors='coerce') == 22)
].index
titanic_original.drop(drop_idx, inplace=True)

In [13]:
# Drop the 44-year-old "Kelly, Mr. James" row, leaving only his
# 34.5-year-old namesake in the original data.
# `age` is an object column (text, with '?' in some rows), so it is compared
# numerically: a literal == '44' silently matches nothing when the value is
# stored as '44.0' or as a float. Unparseable entries become NaN and never match.
drop_idx = titanic_original.loc[
    (titanic_original['name'] == 'Kelly, Mr. James')
    & (pd.to_numeric(titanic_original['age'], errors='coerce') == 44)
].index
titanic_original.drop(drop_idx, inplace=True)

In [14]:
# Distinct names remaining after the two manual drops.
len(titanic_original['name'].unique())

1307

In [15]:
# No repeated names should remain now, so this is expected to be an empty frame.
titanic_original[titanic_original.duplicated(subset='name', keep=False)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


#### 이름을 기준으로 합칠 예정이다 

- `rename()`: 컬럼명을 `{기존 이름: 새 이름}` 형태의 dict로 넘겨 변경한다.

In [20]:
# Keep a (survived, name) table taken from the original data.
# The rename cell that follows changes these two columns to 'Survived'/'Name'
# in place, so a plain ['survived', 'name'] selection raises KeyError when this
# cell is re-run afterwards. Mapping the names back first makes the cell
# re-runnable; rename() skips keys that are absent, so a fresh run is unaffected.
titanic_survived = titanic_original.rename(
    columns={'Survived': 'survived', 'Name': 'name'}
)[['survived', 'name']]

In [21]:
# Capitalise the two column names in place, then display the frame.
titanic_original.rename(
    columns={
        'survived': 'Survived',
        'name': 'Name',
    },
    inplace=True,
)
titanic_original

Unnamed: 0,pclass,Survived,Name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,?,C,?,328,?
1305,3,0,"Zabour, Miss. Thamine",female,?,1,0,2665,14.4542,?,C,?,?,?
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,?,C,?,304,?
1307,3,0,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,?,C,?,?,?


#### 이름을 기준으로 테스트 셋 레프트  조인하기 

- 현재 문제가 있어 레프트 조인이 아닌 이너 조인으로 바꾼다. 레프트 조인 문제는 `13_07_02 수정본`에서 확인할 것.

### 또한 문제가 발생하였다.
- 저 수정본과 이 노트북을 오가며 작업하다 보니 샘플본이 수정본 쪽에 있어서 데이터가 꼬였다.
- 위 내용은 `수정본`에 가서 확인하고, 이 노트북은 아래 내용부터 이어서 확인하길 바람.

In [47]:
# Load titanic_full_data.csv; the cells below continue from this frame.
titanic_raw_data = pd.read_csv(
    filepath_or_buffer='C:/Users/scien/Videos/titanic/titanic_full_data.csv',
)

In [48]:
# Display the loaded frame (pandas truncates the middle rows of a long frame).
titanic_raw_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1280,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1281,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1282,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1283,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S
