In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train['Type'] = 'Train'
test['Type'] = 'Test'
fulldata = pd.concat([train, test], axis=0, sort=False)

In [3]:
fulldata.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Type'],
      dtype='object')

In [4]:
fulldata.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [5]:
ID_col = ['PassengerId']
target_col = ['Survived']
cat_cols = ['Name','Sex','Ticket','Cabin','Embarked']
other_col = ['Type']
num_cols = list(set(list(fulldata.columns))-set(cat_cols)-set(target_col)-set(ID_col)-set(other_col))

In [6]:
num_cols

['Parch', 'Age', 'Fare', 'Pclass', 'SibSp']

In [7]:
fulldata.isnull().any()

PassengerId    False
Survived        True
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked        True
Type           False
dtype: bool

In [8]:
num_cat_cols = num_cols + cat_cols
for var in num_cat_cols:
    if fulldata[var].isnull().any() == True :
        fulldata[var+'_NA'] = fulldata[var].isnull()*1

In [9]:
fulldata[cat_cols] = fulldata[cat_cols].fillna(value = -9999)
fulldata[num_cols] = fulldata[num_cols].fillna(fulldata[num_cols].mean())

In [10]:
le = LabelEncoder()

for var in cat_cols:
    fulldata[var] = le.fit_transform(fulldata[var].astype('str'))

fulldata['Survived'] = le.fit_transform(fulldata['Survived'].astype('str'))

train = fulldata[fulldata['Type'] == 'Train']
test = fulldata[fulldata['Type'] == 'Test']

train['is_train'] = np.random.uniform(0, 1, len(train)) <= 0.75
Train, Validate = train[train['is_train'] == True], train[train['is_train'] == False]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [11]:
features = list(set(list(fulldata.columns)) -set(ID_col) -set(target_col) -set(other_col))

In [12]:
x_train = Train[list(features)].values
y_train = Train['Survived'].values

x_validate = Validate[list(features)].values
y_validate = Validate['Survived'].values

x_test=test[list(features)].values

In [13]:
random.seed(100)
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
survived = rf.predict_proba(x_validate)
fpr, tpr, _ = metrics.roc_curve(y_validate, survived[:,1])
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)

0.8764769065520945


In [15]:
final_survived = np.round(rf.predict_proba(x_test)).astype(int)
test['Survived']=final_survived[:,1]
test.to_csv('output.csv',columns=['PassengerId','Survived'], index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
