데이터 마이닝

In [1]:
# **********
# 데이터 마이닝
# ***********
context = "C:/ezen_tensorflow/seoul_cctv/seoul_cctv/"

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
titanic_df = pd.read_csv(context+"train.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# **********
# 데이터 프로세싱
# ***********
"""파이썬의 함수로 정의"""

'파이썬의 함수로 정의'

In [5]:
def get_titles(df):
    df = df.copy()
    df['title'] = df.Name.str.extract(' ([A-z]+?)\.', expand=True)
    df['title'].replace(
        ['Lady', 'Countess','Capt', 'Col','Don',
         'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'rare',
        inplace=True)
    df['title'].replace('Mme', 'Mrs', inplace=True)
    df['title'].replace('Ms', 'Miss', inplace=True)
    df['title'].replace('Mlle', 'Miss', inplace=True)
    return df

In [6]:
def preprocess_df(df):
    df = df.copy()
    df['name_len'] = df['Name'].apply(len)
    df['has_cabin'] = df['Cabin'].apply(
    lambda x: 0 if isinstance(x, float) else 1)
    df['not_alone'] = df['SibSp'] | df['Parch']
    df.drop(['Ticket', 'Cabin', 'Name'], axis=1, inplace=True)
    df['Age'].fillna(np.median(df['Age'].dropna()), inplace=True)
    df['Fare'].fillna(np.median(df['Fare'].dropna()), inplace=True)
    df['Embarked'].fillna('S', inplace=True)
    df = pd.get_dummies(df)
    return df

In [7]:
# 데이터 전처리
training_data = preprocess_df(
    pd.read_csv('train.csv', index_col='PassengerId'))
y = training_data.pop('Survived')
X = training_data.values
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=10200)

params = dict(max_features=range(5, len(X_train.T), 2),
              max_depth=(None, 20, 10, 5))
rf = RandomForestClassifier(random_state=101, n_estimators=100)
gs_rf = GridSearchCV(estimator=rf, param_grid=params, cv=5).fit(X_train, y_train)
print(gs_rf.best_params_)
print(gs_rf.score(X_test, y_test))


rf_all = RandomForestClassifier(random_state=2101)
gs_rf_all = GridSearchCV(estimator=rf, param_grid=params, cv=5).fit(X, y)
print(gs_rf.best_params_)

test_data = preprocess_df(
    pd.read_csv(context+'test.csv', index_col='PassengerId'))
X_test_proper = test_data.values
test_data.head()

gs_rf_all.predict(X_test_proper)

submission = pd.DataFrame({
        "PassengerId": test_data.index,
        "Survived": gs_rf_all.predict(X_test_proper)
    })
submission.to_csv('submission.csv', index=False)
submission.head()

{'max_depth': 10, 'max_features': 5}
0.776536312849162
{'max_depth': 10, 'max_features': 5}


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
