In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,log_loss,accuracy_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

np.random.seed(0)
sns.set_palette('pastel')

In [95]:
import os

def load_titanic_data(titanic_path =r'D:\machine learning\datasets\titanic',filename = 'train.csv'):
    csv_train_path = os.path.join(titanic_path, filename)
    print(csv_train_path)
    return pd.read_csv(csv_train_path)
train = load_titanic_data()
test = load_titanic_data(filename='test.csv')

D:\machine learning\datasets\titanic\train.csv
D:\machine learning\datasets\titanic\test.csv


In [96]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [97]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [98]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [99]:
train.describe(include = ['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Sutton, Mr. Frederick",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


In [100]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [101]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [102]:
train.shape

(891, 12)

In [103]:
test.shape

(418, 11)

In [104]:
train['Age'].groupby(train['Sex']).mean()


Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

In [105]:
test.groupby("Sex")['Age'].mean()

Sex
female    30.272362
male      30.272732
Name: Age, dtype: float64

In [106]:
for dataset in [train, test]:
    dataset['Sex'] = dataset['Sex'].map({'female':1, 'male':0}).astype(int)

In [107]:
guess_ages= np.zeros((2,3))
for dataset in [train,test]:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

In [108]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [109]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [110]:
Y = train['Survived']
train = train.drop('Survived',axis=1)
data = pd.concat([train,test],axis=0)

In [111]:
avg_fare = data.groupby('Sex')['Fare'].mean()

In [126]:
test['Fare'].fillna(test['Fare'].dropna().median(), inplace=True)

In [127]:
data = pd.concat([train,test],axis=0)

In [128]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null int32
Age            418 non-null int32
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(1), int32(2), int64(4), object(4)
memory usage: 32.7+ KB


In [129]:
Name_title_data = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Name_title'] = Name_title_data
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_title
0,1,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S,Mr
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S,Mrs
4,5,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S,Mr


In [130]:
age_group_data = [None] * len(data['Age'])
for i in range(len(data['Age'])):
    if data['Age'][i] <= 3:
        age_group_data[i] = 'Baby'
    elif data['Age'][i] >3 and data['Age'][i] <= 13:
        age_group_data[i] = 'Child'
    elif data['Age'][i] >13 and data['Age'][i] <= 19:
        age_group_data[i] = 'Teenager'
    elif data['Age'][i] >19 and data['Age'][i] <= 30:
        age_group_data[i] = 'Young Adult'
    elif data['Age'][i] >30 and data['Age'][i] <= 45:
        age_group_data[i] = 'Middle Aged Adult'
    elif data['Age'][i] >45 and data['Age'][i] <65:
        age_group_data[i] = 'Adult'
    else:
        age_group_data[i] = 'Old'

data['age_group'] = age_group_data

In [131]:
data['Is_Married'] = 0
data['Is_Married'].loc[data['Name_title'] == 'Mrs'] = 1
data['FamSize'] = data['SibSp'] + data['Parch'] + 1
data['Single'] = data['FamSize'].map(lambda s: 1 if s == 1 else 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [119]:
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_title,age_group,Is_Married,FamSize,Single
0,1,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S,Mr,Young Adult,0,2,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C,Mrs,Middle Aged Adult,1,2,0
2,3,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S,Miss,Young Adult,0,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S,Mrs,Middle Aged Adult,1,2,0
4,5,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S,Mr,Middle Aged Adult,0,1,1


In [132]:
np.unique(data['Ticket'])
tic = data.groupby('Ticket',sort=True,group_keys=True)
groups = list(tic.groups)
togther = [None] * len(data['Ticket'])
k=0
for i in range(len(groups)):
    for j in range(len(data['Ticket'])):
        if data['Ticket'][j] == groups[i]:
            togther[j] = i
data['Togther'] = togther

In [121]:
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_title,age_group,Is_Married,FamSize,Single,Togther
0,1,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S,Mr,Young Adult,0,2,0,720
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C,Mrs,Middle Aged Adult,1,2,0,816
2,3,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S,Miss,Young Adult,0,1,1,914
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S,Mrs,Middle Aged Adult,1,2,0,65
4,5,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S,Mr,Middle Aged Adult,0,1,1,649


In [133]:
data['FareBand'] = pd.qcut(data['Fare'],4)
data['FareBand'].unique()

[(-0.001, 7.896], (31.275, 512.329], (7.896, 14.454], (14.454, 31.275]]
Categories (4, interval[float64]): [(-0.001, 7.896] < (7.896, 14.454] < (14.454, 31.275] < (31.275, 512.329]]

In [134]:
    data.loc[ data['Fare'] <= 7.896, 'Fare'] = 0
    data.loc[(data['Fare'] > 7.896) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    data.loc[(data['Fare'] > 14.454) & (dataset['Fare'] <= 31.275), 'Fare']   = 2
    data.loc[ data['Fare'] > 31.275, 'Fare'] = 3
    data['Fare'] = data['Fare'].astype(int)

data = data.drop(['FareBand'], axis=1)

In [137]:
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_title,age_group,Is_Married,FamSize,Single,Togther,Cabin_present
0,1,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,0,,S,Mr,Young Adult,0,2,0,720,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,1,C85,C,Mrs,Middle Aged Adult,1,2,0,816,1
2,3,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,1,,S,Miss,Young Adult,0,1,1,914,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,1,C123,S,Mrs,Middle Aged Adult,1,2,0,65,1
4,5,3,"Allen, Mr. William Henry",0,35,0,0,373450,1,,S,Mr,Middle Aged Adult,0,1,1,649,0


In [135]:
data['Cabin_present'] = 1
data['Cabin_present'].loc[data['Cabin'].isnull()] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [138]:
data = data.drop('Cabin',axis=1)
data = data.drop('Ticket',axis=1)
data = data.drop('Name',axis=1)
data = data.drop('PassengerId',axis=1)

In [139]:
data_ohe = pd.get_dummies(data,drop_first=True)
data_ohe.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Is_Married,FamSize,Single,Togther,...,Name_title_Mrs,Name_title_Ms,Name_title_Rev,Name_title_Sir,age_group_Baby,age_group_Child,age_group_Middle Aged Adult,age_group_Old,age_group_Teenager,age_group_Young Adult
0,3,0,22,1,0,0,0,2,0,720,...,0,0,0,0,0,0,0,0,0,1
1,1,1,38,1,0,1,1,2,0,816,...,1,0,0,0,0,0,1,0,0,0
2,3,1,26,0,0,1,0,1,1,914,...,0,0,0,0,0,0,0,0,0,1
3,1,1,35,1,0,1,1,2,0,65,...,1,0,0,0,0,0,1,0,0,0
4,3,0,35,0,0,1,0,1,1,649,...,0,0,0,0,0,0,1,0,0,0


In [140]:
train_ohe = data_ohe[:train.shape[0]]
test_ohe = data_ohe[train.shape[0]:]

In [142]:
X_train,X_test,Y_train,Y_test = train_test_split(train_ohe,Y,test_size=0.2)

In [143]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(712, 36)
(179, 36)
(712,)
(179,)


In [144]:
params = dict(
    learning_rate = [0.001,0.01,0.1,1,10,100,1000],
    n_estimators = [2,5,10,15,20,25,30,40,50,70,100,125,150,200,300,400,500,700,1000],
    criterion = ['friedman_mse','mse','mae'],
    max_depth = [2,5,10,15,20,25,30,40,50,70,100,125,150,200,300,400,500,700,1000],
    min_samples_leaf = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
)
gbdt = GradientBoostingClassifier()
clf = RandomizedSearchCV(gbdt,params,random_state=0,verbose=0,n_jobs=-1,n_iter=20,cv=10)
gb = clf.fit(X_train,Y_train)
gb.best_params_

{'n_estimators': 700,
 'min_samples_leaf': 8,
 'max_depth': 1000,
 'learning_rate': 0.01,
 'criterion': 'mse'}

In [146]:
gbdt = GradientBoostingClassifier(n_estimators=70,min_samples_leaf=10,max_depth=200,criterion='mse',learning_rate= 0.1)
gbdt.fit(X_train,Y_train)
pred = gbdt.predict(X_test)
acc = accuracy_score(Y_test,pred)*100
print(acc)

83.79888268156425


In [147]:
vc = VotingClassifier(estimators=[('rf', rf), ('gbdt', gbdt)],voting='soft')
vc = vc.fit(X_train,Y_train)

pred = vc.predict(X_test)
acc = accuracy_score(Y_test,pred)*100
print(acc)

NameError: name 'rf' is not defined

In [148]:
params = dict(
    n_estimators = [2,5,10,15,20,25,30,40,50,70,100,125,150,200,300,400,500,700,1000],
    criterion = ['gini','entropy'],
    max_depth = [2,5,10,15,20,25,30,40,50,70,100,125,150,200,300,400,500,700,1000],
    min_samples_leaf = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
)
rf = RandomForestClassifier()
clf = RandomizedSearchCV(rf,params,random_state=0,verbose=0,n_jobs=-1,n_iter=20,cv=10)
rsc = clf.fit(X_train,Y_train)
rsc.best_params_

{'n_estimators': 500,
 'min_samples_leaf': 4,
 'max_depth': 50,
 'criterion': 'entropy'}

In [153]:
rf = RandomForestClassifier(n_estimators=100,min_samples_leaf=3,max_depth=150,criterion='gini')
rf.fit(X_train,Y_train)
pred = rf.predict(X_test)
acc = accuracy_score(Y_test,pred)*100
print(acc)

84.35754189944134


(179,)

In [155]:
vc = VotingClassifier(estimators=[('rf', rf), ('gbdt', gbdt)],voting='soft')
vc = vc.fit(X_train,Y_train)

pred = vc.predict(X_test)
acc = accuracy_score(Y_test,pred)*100
print(acc)

86.03351955307262


In [156]:
predictions = gbdt.predict(test_ohe)
predictions.shape

(418,)

In [157]:
predictions = vc.predict(test_ohe)
predictions.shape

(418,)

In [161]:
predictions = gbdt.predict(test_ohe)

In [162]:
test_Survived = pd.Series(predictions, name="Survived")
IDtest = test["PassengerId"]
results = pd.concat([IDtest,test_Survived],axis=1)
results.to_csv("Submissions4.csv",index=False)